1/*
2*******************************************************************************
3* Copyright (C) 1996-2014, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* rulebasedcollator.cpp
7*
8* (replaced the former tblcoll.cpp)
9*
10* created on: 2012feb14 with new and old collation code
11* created by: Markus W. Scherer
12*/
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_COLLATION
17
18#include "unicode/coll.h"
19#include "unicode/coleitr.h"
20#include "unicode/localpointer.h"
21#include "unicode/locid.h"
22#include "unicode/sortkey.h"
23#include "unicode/tblcoll.h"
24#include "unicode/ucol.h"
25#include "unicode/uiter.h"
26#include "unicode/uloc.h"
27#include "unicode/uniset.h"
28#include "unicode/unistr.h"
29#include "unicode/usetiter.h"
30#include "unicode/utf8.h"
31#include "unicode/uversion.h"
32#include "bocsu.h"
33#include "charstr.h"
34#include "cmemory.h"
35#include "collation.h"
36#include "collationcompare.h"
37#include "collationdata.h"
38#include "collationdatareader.h"
39#include "collationfastlatin.h"
40#include "collationiterator.h"
41#include "collationkeys.h"
42#include "collationroot.h"
43#include "collationsets.h"
44#include "collationsettings.h"
45#include "collationtailoring.h"
46#include "cstring.h"
47#include "uassert.h"
48#include "ucol_imp.h"
49#include "uhash.h"
50#include "uitercollationiterator.h"
51#include "ustr_imp.h"
52#include "utf16collationiterator.h"
53#include "utf8collationiterator.h"
54#include "uvectr64.h"
55
56#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
57
58U_NAMESPACE_BEGIN
59
60namespace {
61
62class FixedSortKeyByteSink : public SortKeyByteSink {
63public:
64    FixedSortKeyByteSink(char *dest, int32_t destCapacity)
65            : SortKeyByteSink(dest, destCapacity) {}
66    virtual ~FixedSortKeyByteSink();
67
68private:
69    virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
70    virtual UBool Resize(int32_t appendCapacity, int32_t length);
71};
72
73FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
74
75void
76FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
77    // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
78    // Fill the buffer completely.
79    int32_t available = capacity_ - length;
80    if (available > 0) {
81        uprv_memcpy(buffer_ + length, bytes, available);
82    }
83}
84
85UBool
86FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
87    return FALSE;
88}
89
90}  // namespace
91
92// Not in an anonymous namespace, so that it can be a friend of CollationKey.
93class CollationKeyByteSink : public SortKeyByteSink {
94public:
95    CollationKeyByteSink(CollationKey &key)
96            : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
97              key_(key) {}
98    virtual ~CollationKeyByteSink();
99
100private:
101    virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
102    virtual UBool Resize(int32_t appendCapacity, int32_t length);
103
104    CollationKey &key_;
105};
106
107CollationKeyByteSink::~CollationKeyByteSink() {}
108
109void
110CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
111    // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
112    if (Resize(n, length)) {
113        uprv_memcpy(buffer_ + length, bytes, n);
114    }
115}
116
117UBool
118CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
119    if (buffer_ == NULL) {
120        return FALSE;  // allocation failed before already
121    }
122    int32_t newCapacity = 2 * capacity_;
123    int32_t altCapacity = length + 2 * appendCapacity;
124    if (newCapacity < altCapacity) {
125        newCapacity = altCapacity;
126    }
127    if (newCapacity < 200) {
128        newCapacity = 200;
129    }
130    uint8_t *newBuffer = key_.reallocate(newCapacity, length);
131    if (newBuffer == NULL) {
132        SetNotOk();
133        return FALSE;
134    }
135    buffer_ = reinterpret_cast<char *>(newBuffer);
136    capacity_ = newCapacity;
137    return TRUE;
138}
139
140RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator &other)
141        : Collator(other),
142          data(other.data),
143          settings(other.settings),
144          tailoring(other.tailoring),
145          validLocale(other.validLocale),
146          explicitlySetAttributes(other.explicitlySetAttributes),
147          actualLocaleIsSameAsValid(other.actualLocaleIsSameAsValid) {
148    settings->addRef();
149    tailoring->addRef();
150}
151
152RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
153                                     const RuleBasedCollator *base, UErrorCode &errorCode)
154        : data(NULL),
155          settings(NULL),
156          tailoring(NULL),
157          validLocale(""),
158          explicitlySetAttributes(0),
159          actualLocaleIsSameAsValid(FALSE) {
160    if(U_FAILURE(errorCode)) { return; }
161    if(bin == NULL || length <= 0 || base == NULL) {
162        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
163        return;
164    }
165    const CollationTailoring *root = CollationRoot::getRoot(errorCode);
166    if(U_FAILURE(errorCode)) { return; }
167    if(base->tailoring != root) {
168        errorCode = U_UNSUPPORTED_ERROR;
169        return;
170    }
171    LocalPointer<CollationTailoring> t(new CollationTailoring(base->tailoring->settings));
172    if(t.isNull() || t->isBogus()) {
173        errorCode = U_MEMORY_ALLOCATION_ERROR;
174        return;
175    }
176    CollationDataReader::read(base->tailoring, bin, length, *t, errorCode);
177    if(U_FAILURE(errorCode)) { return; }
178    t->actualLocale.setToBogus();
179    adoptTailoring(t.orphan());
180}
181
182RuleBasedCollator::RuleBasedCollator(const CollationTailoring *t, const Locale &vl)
183        : data(t->data),
184          settings(t->settings),
185          tailoring(t),
186          validLocale(vl),
187          explicitlySetAttributes(0),
188          actualLocaleIsSameAsValid(FALSE) {
189    settings->addRef();
190    tailoring->addRef();
191}
192
193RuleBasedCollator::~RuleBasedCollator() {
194    SharedObject::clearPtr(settings);
195    SharedObject::clearPtr(tailoring);
196}
197
198void
199RuleBasedCollator::adoptTailoring(CollationTailoring *t) {
200    U_ASSERT(settings == NULL && data == NULL && tailoring == NULL);
201    data = t->data;
202    settings = t->settings;
203    settings->addRef();
204    t->addRef();
205    tailoring = t;
206    validLocale = t->actualLocale;
207    actualLocaleIsSameAsValid = FALSE;
208}
209
210Collator *
211RuleBasedCollator::clone() const {
212    return new RuleBasedCollator(*this);
213}
214
215RuleBasedCollator &RuleBasedCollator::operator=(const RuleBasedCollator &other) {
216    if(this == &other) { return *this; }
217    SharedObject::copyPtr(other.settings, settings);
218    SharedObject::copyPtr(other.tailoring, tailoring);
219    data = tailoring->data;
220    validLocale = other.validLocale;
221    explicitlySetAttributes = other.explicitlySetAttributes;
222    actualLocaleIsSameAsValid = other.actualLocaleIsSameAsValid;
223    return *this;
224}
225
// Expands the RTTI boilerplate for RuleBasedCollator.
// NOTE(review): the macro is defined outside this file; presumably it supplies the
// class-ID functions declared in the class header -- confirm against its definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)
227
228UBool
229RuleBasedCollator::operator==(const Collator& other) const {
230    if(this == &other) { return TRUE; }
231    if(!Collator::operator==(other)) { return FALSE; }
232    const RuleBasedCollator &o = static_cast<const RuleBasedCollator &>(other);
233    if(*settings != *o.settings) { return FALSE; }
234    if(data == o.data) { return TRUE; }
235    UBool thisIsRoot = data->base == NULL;
236    UBool otherIsRoot = o.data->base == NULL;
237    U_ASSERT(!thisIsRoot || !otherIsRoot);  // otherwise their data pointers should be ==
238    if(thisIsRoot != otherIsRoot) { return FALSE; }
239    if((thisIsRoot || !tailoring->rules.isEmpty()) &&
240            (otherIsRoot || !o.tailoring->rules.isEmpty())) {
241        // Shortcut: If both collators have valid rule strings, then compare those.
242        if(tailoring->rules == o.tailoring->rules) { return TRUE; }
243    }
244    // Different rule strings can result in the same or equivalent tailoring.
245    // The rule strings are optional in ICU resource bundles, although included by default.
246    // cloneBinary() drops the rule string.
247    UErrorCode errorCode = U_ZERO_ERROR;
248    LocalPointer<UnicodeSet> thisTailored(getTailoredSet(errorCode));
249    LocalPointer<UnicodeSet> otherTailored(o.getTailoredSet(errorCode));
250    if(U_FAILURE(errorCode)) { return FALSE; }
251    if(*thisTailored != *otherTailored) { return FALSE; }
252    // For completeness, we should compare all of the mappings;
253    // or we should create a list of strings, sort it with one collator,
254    // and check if both collators compare adjacent strings the same
255    // (order & strength, down to quaternary); or similar.
256    // Testing equality of collators seems unusual.
257    return TRUE;
258}
259
260int32_t
261RuleBasedCollator::hashCode() const {
262    int32_t h = settings->hashCode();
263    if(data->base == NULL) { return h; }  // root collator
264    // Do not rely on the rule string, see comments in operator==().
265    UErrorCode errorCode = U_ZERO_ERROR;
266    LocalPointer<UnicodeSet> set(getTailoredSet(errorCode));
267    if(U_FAILURE(errorCode)) { return 0; }
268    UnicodeSetIterator iter(*set);
269    while(iter.next() && !iter.isString()) {
270        h ^= data->getCE32(iter.getCodepoint());
271    }
272    return h;
273}
274
275void
276RuleBasedCollator::setLocales(const Locale &requested, const Locale &valid,
277                              const Locale &actual) {
278    if(actual == tailoring->actualLocale) {
279        actualLocaleIsSameAsValid = FALSE;
280    } else {
281        U_ASSERT(actual == valid);
282        actualLocaleIsSameAsValid = TRUE;
283    }
284    // Do not modify tailoring.actualLocale:
285    // We cannot be sure that that would be thread-safe.
286    validLocale = valid;
287    (void)requested;  // Ignore, see also ticket #10477.
288}
289
290Locale
291RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode& errorCode) const {
292    if(U_FAILURE(errorCode)) {
293        return Locale::getRoot();
294    }
295    switch(type) {
296    case ULOC_ACTUAL_LOCALE:
297        return actualLocaleIsSameAsValid ? validLocale : tailoring->actualLocale;
298    case ULOC_VALID_LOCALE:
299    case ULOC_REQUESTED_LOCALE:  // TODO: Drop this, see ticket #10477.
300        return validLocale;
301    default:
302        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
303        return Locale::getRoot();
304    }
305}
306
307const char *
308RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const {
309    if(U_FAILURE(errorCode)) {
310        return NULL;
311    }
312    const Locale *result;
313    switch(type) {
314    case ULOC_ACTUAL_LOCALE:
315        result = actualLocaleIsSameAsValid ? &validLocale : &tailoring->actualLocale;
316        break;
317    case ULOC_VALID_LOCALE:
318    case ULOC_REQUESTED_LOCALE:  // TODO: Drop this, see ticket #10477.
319        result = &validLocale;
320        break;
321    default:
322        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
323        return NULL;
324    }
325    if(result->isBogus()) { return NULL; }
326    const char *id = result->getName();
327    return id[0] == 0 ? "root" : id;
328}
329
330const UnicodeString&
331RuleBasedCollator::getRules() const {
332    return tailoring->rules;
333}
334
335void
336RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) const {
337    if(delta == UCOL_TAILORING_ONLY) {
338        buffer = tailoring->rules;
339        return;
340    }
341    // UCOL_FULL_RULES
342    buffer.remove();
343    CollationLoader::appendRootRules(buffer);
344    buffer.append(tailoring->rules).getTerminatedBuffer();
345}
346
347void
348RuleBasedCollator::getVersion(UVersionInfo version) const {
349    uprv_memcpy(version, tailoring->version, U_MAX_VERSION_LENGTH);
350    version[0] += (UCOL_RUNTIME_VERSION << 4) + (UCOL_RUNTIME_VERSION >> 4);
351}
352
353UnicodeSet *
354RuleBasedCollator::getTailoredSet(UErrorCode &errorCode) const {
355    if(U_FAILURE(errorCode)) { return NULL; }
356    UnicodeSet *tailored = new UnicodeSet();
357    if(tailored == NULL) {
358        errorCode = U_MEMORY_ALLOCATION_ERROR;
359        return NULL;
360    }
361    if(data->base != NULL) {
362        TailoredSet(tailored).forData(data, errorCode);
363        if(U_FAILURE(errorCode)) {
364            delete tailored;
365            return NULL;
366        }
367    }
368    return tailored;
369}
370
371void
372RuleBasedCollator::internalGetContractionsAndExpansions(
373        UnicodeSet *contractions, UnicodeSet *expansions,
374        UBool addPrefixes, UErrorCode &errorCode) const {
375    if(U_FAILURE(errorCode)) { return; }
376    if(contractions != NULL) {
377        contractions->clear();
378    }
379    if(expansions != NULL) {
380        expansions->clear();
381    }
382    ContractionsAndExpansions(contractions, expansions, NULL, addPrefixes).forData(data, errorCode);
383}
384
385void
386RuleBasedCollator::internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const {
387    if(U_FAILURE(errorCode)) { return; }
388    ContractionsAndExpansions(&set, NULL, NULL, FALSE).forCodePoint(data, c, errorCode);
389}
390
391const CollationSettings &
392RuleBasedCollator::getDefaultSettings() const {
393    return *tailoring->settings;
394}
395
396UColAttributeValue
397RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &errorCode) const {
398    if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
399    int32_t option;
400    switch(attr) {
401    case UCOL_FRENCH_COLLATION:
402        option = CollationSettings::BACKWARD_SECONDARY;
403        break;
404    case UCOL_ALTERNATE_HANDLING:
405        return settings->getAlternateHandling();
406    case UCOL_CASE_FIRST:
407        return settings->getCaseFirst();
408    case UCOL_CASE_LEVEL:
409        option = CollationSettings::CASE_LEVEL;
410        break;
411    case UCOL_NORMALIZATION_MODE:
412        option = CollationSettings::CHECK_FCD;
413        break;
414    case UCOL_STRENGTH:
415        return (UColAttributeValue)settings->getStrength();
416    case UCOL_HIRAGANA_QUATERNARY_MODE:
417        // Deprecated attribute, unsettable.
418        return UCOL_OFF;
419    case UCOL_NUMERIC_COLLATION:
420        option = CollationSettings::NUMERIC;
421        break;
422    default:
423        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
424        return UCOL_DEFAULT;
425    }
426    return ((settings->options & option) == 0) ? UCOL_OFF : UCOL_ON;
427}
428
429void
430RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value,
431                                UErrorCode &errorCode) {
432    UColAttributeValue oldValue = getAttribute(attr, errorCode);
433    if(U_FAILURE(errorCode)) { return; }
434    if(value == oldValue) {
435        setAttributeExplicitly(attr);
436        return;
437    }
438    const CollationSettings &defaultSettings = getDefaultSettings();
439    if(settings == &defaultSettings) {
440        if(value == UCOL_DEFAULT) {
441            setAttributeDefault(attr);
442            return;
443        }
444    }
445    CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
446    if(ownedSettings == NULL) {
447        errorCode = U_MEMORY_ALLOCATION_ERROR;
448        return;
449    }
450
451    switch(attr) {
452    case UCOL_FRENCH_COLLATION:
453        ownedSettings->setFlag(CollationSettings::BACKWARD_SECONDARY, value,
454                               defaultSettings.options, errorCode);
455        break;
456    case UCOL_ALTERNATE_HANDLING:
457        ownedSettings->setAlternateHandling(value, defaultSettings.options, errorCode);
458        break;
459    case UCOL_CASE_FIRST:
460        ownedSettings->setCaseFirst(value, defaultSettings.options, errorCode);
461        break;
462    case UCOL_CASE_LEVEL:
463        ownedSettings->setFlag(CollationSettings::CASE_LEVEL, value,
464                               defaultSettings.options, errorCode);
465        break;
466    case UCOL_NORMALIZATION_MODE:
467        ownedSettings->setFlag(CollationSettings::CHECK_FCD, value,
468                               defaultSettings.options, errorCode);
469        break;
470    case UCOL_STRENGTH:
471        ownedSettings->setStrength(value, defaultSettings.options, errorCode);
472        break;
473    case UCOL_HIRAGANA_QUATERNARY_MODE:
474        // Deprecated attribute. Check for valid values but do not change anything.
475        if(value != UCOL_OFF && value != UCOL_ON && value != UCOL_DEFAULT) {
476            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
477        }
478        break;
479    case UCOL_NUMERIC_COLLATION:
480        ownedSettings->setFlag(CollationSettings::NUMERIC, value, defaultSettings.options, errorCode);
481        break;
482    default:
483        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
484        break;
485    }
486    if(U_FAILURE(errorCode)) { return; }
487    setFastLatinOptions(*ownedSettings);
488    if(value == UCOL_DEFAULT) {
489        setAttributeDefault(attr);
490    } else {
491        setAttributeExplicitly(attr);
492    }
493}
494
495Collator &
496RuleBasedCollator::setMaxVariable(UColReorderCode group, UErrorCode &errorCode) {
497    if(U_FAILURE(errorCode)) { return *this; }
498    // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
499    int32_t value;
500    if(group == UCOL_REORDER_CODE_DEFAULT) {
501        value = UCOL_DEFAULT;
502    } else if(UCOL_REORDER_CODE_FIRST <= group && group <= UCOL_REORDER_CODE_CURRENCY) {
503        value = group - UCOL_REORDER_CODE_FIRST;
504    } else {
505        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
506        return *this;
507    }
508    CollationSettings::MaxVariable oldValue = settings->getMaxVariable();
509    if(value == oldValue) {
510        setAttributeExplicitly(ATTR_VARIABLE_TOP);
511        return *this;
512    }
513    const CollationSettings &defaultSettings = getDefaultSettings();
514    if(settings == &defaultSettings) {
515        if(value == UCOL_DEFAULT) {
516            setAttributeDefault(ATTR_VARIABLE_TOP);
517            return *this;
518        }
519    }
520    CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
521    if(ownedSettings == NULL) {
522        errorCode = U_MEMORY_ALLOCATION_ERROR;
523        return *this;
524    }
525
526    if(group == UCOL_REORDER_CODE_DEFAULT) {
527        group = (UColReorderCode)(UCOL_REORDER_CODE_FIRST + defaultSettings.getMaxVariable());
528    }
529    uint32_t varTop = data->getLastPrimaryForGroup(group);
530    U_ASSERT(varTop != 0);
531    ownedSettings->setMaxVariable(value, defaultSettings.options, errorCode);
532    if(U_FAILURE(errorCode)) { return *this; }
533    ownedSettings->variableTop = varTop;
534    setFastLatinOptions(*ownedSettings);
535    if(value == UCOL_DEFAULT) {
536        setAttributeDefault(ATTR_VARIABLE_TOP);
537    } else {
538        setAttributeExplicitly(ATTR_VARIABLE_TOP);
539    }
540    return *this;
541}
542
543UColReorderCode
544RuleBasedCollator::getMaxVariable() const {
545    return (UColReorderCode)(UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
546}
547
548uint32_t
549RuleBasedCollator::getVariableTop(UErrorCode & /*errorCode*/) const {
550    return settings->variableTop;
551}
552
553uint32_t
554RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &errorCode) {
555    if(U_FAILURE(errorCode)) { return 0; }
556    if(varTop == NULL && len !=0) {
557        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
558        return 0;
559    }
560    if(len < 0) { len = u_strlen(varTop); }
561    if(len == 0) {
562        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
563        return 0;
564    }
565    UBool numeric = settings->isNumeric();
566    int64_t ce1, ce2;
567    if(settings->dontCheckFCD()) {
568        UTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
569        ce1 = ci.nextCE(errorCode);
570        ce2 = ci.nextCE(errorCode);
571    } else {
572        FCDUTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
573        ce1 = ci.nextCE(errorCode);
574        ce2 = ci.nextCE(errorCode);
575    }
576    if(ce1 == Collation::NO_CE || ce2 != Collation::NO_CE) {
577        errorCode = U_CE_NOT_FOUND_ERROR;
578        return 0;
579    }
580    setVariableTop((uint32_t)(ce1 >> 32), errorCode);
581    return settings->variableTop;
582}
583
584uint32_t
585RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &errorCode) {
586    return setVariableTop(varTop.getBuffer(), varTop.length(), errorCode);
587}
588
589void
590RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &errorCode) {
591    if(U_FAILURE(errorCode)) { return; }
592    if(varTop != settings->variableTop) {
593        // Pin the variable top to the end of the reordering group which contains it.
594        // Only a few special groups are supported.
595        int32_t group = data->getGroupForPrimary(varTop);
596        if(group < UCOL_REORDER_CODE_FIRST || UCOL_REORDER_CODE_CURRENCY < group) {
597            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
598            return;
599        }
600        uint32_t v = data->getLastPrimaryForGroup(group);
601        U_ASSERT(v != 0 && v >= varTop);
602        varTop = v;
603        if(varTop != settings->variableTop) {
604            CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
605            if(ownedSettings == NULL) {
606                errorCode = U_MEMORY_ALLOCATION_ERROR;
607                return;
608            }
609            ownedSettings->setMaxVariable(group - UCOL_REORDER_CODE_FIRST,
610                                          getDefaultSettings().options, errorCode);
611            if(U_FAILURE(errorCode)) { return; }
612            ownedSettings->variableTop = varTop;
613            setFastLatinOptions(*ownedSettings);
614        }
615    }
616    if(varTop == getDefaultSettings().variableTop) {
617        setAttributeDefault(ATTR_VARIABLE_TOP);
618    } else {
619        setAttributeExplicitly(ATTR_VARIABLE_TOP);
620    }
621}
622
623int32_t
624RuleBasedCollator::getReorderCodes(int32_t *dest, int32_t capacity,
625                                   UErrorCode &errorCode) const {
626    if(U_FAILURE(errorCode)) { return 0; }
627    if(capacity < 0 || (dest == NULL && capacity > 0)) {
628        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
629        return 0;
630    }
631    int32_t length = settings->reorderCodesLength;
632    if(length == 0) { return 0; }
633    if(length > capacity) {
634        errorCode = U_BUFFER_OVERFLOW_ERROR;
635        return length;
636    }
637    uprv_memcpy(dest, settings->reorderCodes, length * 4);
638    return length;
639}
640
641void
642RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
643                                   UErrorCode &errorCode) {
644    if(U_FAILURE(errorCode)) { return; }
645    if(length < 0 || (reorderCodes == NULL && length > 0)) {
646        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
647        return;
648    }
649    if(length == settings->reorderCodesLength &&
650            uprv_memcmp(reorderCodes, settings->reorderCodes, length * 4) == 0) {
651        return;
652    }
653    const CollationSettings &defaultSettings = getDefaultSettings();
654    if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_DEFAULT) {
655        if(settings != &defaultSettings) {
656            CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
657            if(ownedSettings == NULL) {
658                errorCode = U_MEMORY_ALLOCATION_ERROR;
659                return;
660            }
661            ownedSettings->aliasReordering(defaultSettings.reorderCodes,
662                                           defaultSettings.reorderCodesLength,
663                                           defaultSettings.reorderTable);
664            setFastLatinOptions(*ownedSettings);
665        }
666        return;
667    }
668    CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
669    if(ownedSettings == NULL) {
670        errorCode = U_MEMORY_ALLOCATION_ERROR;
671        return;
672    }
673    if(length == 0) {
674        ownedSettings->resetReordering();
675    } else {
676        uint8_t reorderTable[256];
677        data->makeReorderTable(reorderCodes, length, reorderTable, errorCode);
678        if(U_FAILURE(errorCode)) { return; }
679        if(!ownedSettings->setReordering(reorderCodes, length, reorderTable)) {
680            errorCode = U_MEMORY_ALLOCATION_ERROR;
681            return;
682        }
683    }
684    setFastLatinOptions(*ownedSettings);
685}
686
687void
688RuleBasedCollator::setFastLatinOptions(CollationSettings &ownedSettings) const {
689    ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
690            data, ownedSettings,
691            ownedSettings.fastLatinPrimaries, LENGTHOF(ownedSettings.fastLatinPrimaries));
692}
693
694UCollationResult
695RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
696                           UErrorCode &errorCode) const {
697    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
698    return doCompare(left.getBuffer(), left.length(),
699                     right.getBuffer(), right.length(), errorCode);
700}
701
702UCollationResult
703RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
704                           int32_t length, UErrorCode &errorCode) const {
705    if(U_FAILURE(errorCode) || length == 0) { return UCOL_EQUAL; }
706    if(length < 0) {
707        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
708        return UCOL_EQUAL;
709    }
710    int32_t leftLength = left.length();
711    int32_t rightLength = right.length();
712    if(leftLength > length) { leftLength = length; }
713    if(rightLength > length) { rightLength = length; }
714    return doCompare(left.getBuffer(), leftLength,
715                     right.getBuffer(), rightLength, errorCode);
716}
717
718UCollationResult
719RuleBasedCollator::compare(const UChar *left, int32_t leftLength,
720                           const UChar *right, int32_t rightLength,
721                           UErrorCode &errorCode) const {
722    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
723    if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) {
724        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
725        return UCOL_EQUAL;
726    }
727    // Make sure both or neither strings have a known length.
728    // We do not optimize for mixed length/termination.
729    if(leftLength >= 0) {
730        if(rightLength < 0) { rightLength = u_strlen(right); }
731    } else {
732        if(rightLength >= 0) { leftLength = u_strlen(left); }
733    }
734    return doCompare(left, leftLength, right, rightLength, errorCode);
735}
736
737UCollationResult
738RuleBasedCollator::compareUTF8(const StringPiece &left, const StringPiece &right,
739                               UErrorCode &errorCode) const {
740    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
741    const uint8_t *leftBytes = reinterpret_cast<const uint8_t *>(left.data());
742    const uint8_t *rightBytes = reinterpret_cast<const uint8_t *>(right.data());
743    if((leftBytes == NULL && !left.empty()) || (rightBytes == NULL && !right.empty())) {
744        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
745        return UCOL_EQUAL;
746    }
747    return doCompare(leftBytes, left.length(), rightBytes, right.length(), errorCode);
748}
749
750UCollationResult
751RuleBasedCollator::internalCompareUTF8(const char *left, int32_t leftLength,
752                                       const char *right, int32_t rightLength,
753                                       UErrorCode &errorCode) const {
754    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
755    if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) {
756        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
757        return UCOL_EQUAL;
758    }
759    // Make sure both or neither strings have a known length.
760    // We do not optimize for mixed length/termination.
761    if(leftLength >= 0) {
762        if(rightLength < 0) { rightLength = uprv_strlen(right); }
763    } else {
764        if(rightLength >= 0) { leftLength = uprv_strlen(left); }
765    }
766    return doCompare(reinterpret_cast<const uint8_t *>(left), leftLength,
767                     reinterpret_cast<const uint8_t *>(right), rightLength, errorCode);
768}
769
770namespace {
771
772/**
773 * Abstract iterator for identical-level string comparisons.
774 * Returns FCD code points and handles temporary switching to NFD.
775 */
776class NFDIterator {
777public:
778    NFDIterator() : index(-1), length(0) {}
779    virtual ~NFDIterator() {}
780    /**
781     * Returns the next code point from the internal normalization buffer,
782     * or else the next text code point.
783     * Returns -1 at the end of the text.
784     */
785    UChar32 nextCodePoint() {
786        if(index >= 0) {
787            if(index == length) {
788                index = -1;
789            } else {
790                UChar32 c;
791                U16_NEXT_UNSAFE(decomp, index, c);
792                return c;
793            }
794        }
795        return nextRawCodePoint();
796    }
797    /**
798     * @param nfcImpl
799     * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint()
800     * @return the first code point in c's decomposition,
801     *         or c itself if it was decomposed already or if it does not decompose
802     */
803    UChar32 nextDecomposedCodePoint(const Normalizer2Impl &nfcImpl, UChar32 c) {
804        if(index >= 0) { return c; }
805        decomp = nfcImpl.getDecomposition(c, buffer, length);
806        if(decomp == NULL) { return c; }
807        index = 0;
808        U16_NEXT_UNSAFE(decomp, index, c);
809        return c;
810    }
811protected:
812    /**
813     * Returns the next text code point in FCD order.
814     * Returns -1 at the end of the text.
815     */
816    virtual UChar32 nextRawCodePoint() = 0;
817private:
818    const UChar *decomp;
819    UChar buffer[4];
820    int32_t index;
821    int32_t length;
822};
823
824class UTF16NFDIterator : public NFDIterator {
825public:
826    UTF16NFDIterator(const UChar *text, const UChar *textLimit) : s(text), limit(textLimit) {}
827protected:
828    virtual UChar32 nextRawCodePoint() {
829        if(s == limit) { return U_SENTINEL; }
830        UChar32 c = *s++;
831        if(limit == NULL && c == 0) {
832            s = NULL;
833            return U_SENTINEL;
834        }
835        UChar trail;
836        if(U16_IS_LEAD(c) && s != limit && U16_IS_TRAIL(trail = *s)) {
837            ++s;
838            c = U16_GET_SUPPLEMENTARY(c, trail);
839        }
840        return c;
841    }
842
843    const UChar *s;
844    const UChar *limit;
845};
846
847class FCDUTF16NFDIterator : public UTF16NFDIterator {
848public:
849    FCDUTF16NFDIterator(const Normalizer2Impl &nfcImpl, const UChar *text, const UChar *textLimit)
850            : UTF16NFDIterator(NULL, NULL) {
851        UErrorCode errorCode = U_ZERO_ERROR;
852        const UChar *spanLimit = nfcImpl.makeFCD(text, textLimit, NULL, errorCode);
853        if(U_FAILURE(errorCode)) { return; }
854        if(spanLimit == textLimit || (textLimit == NULL && *spanLimit == 0)) {
855            s = text;
856            limit = spanLimit;
857        } else {
858            str.setTo(text, (int32_t)(spanLimit - text));
859            {
860                ReorderingBuffer buffer(nfcImpl, str);
861                if(buffer.init(str.length(), errorCode)) {
862                    nfcImpl.makeFCD(spanLimit, textLimit, &buffer, errorCode);
863                }
864            }
865            if(U_SUCCESS(errorCode)) {
866                s = str.getBuffer();
867                limit = s + str.length();
868            }
869        }
870    }
871private:
872    UnicodeString str;
873};
874
875class UTF8NFDIterator : public NFDIterator {
876public:
877    UTF8NFDIterator(const uint8_t *text, int32_t textLength)
878        : s(text), pos(0), length(textLength) {}
879protected:
880    virtual UChar32 nextRawCodePoint() {
881        if(pos == length || (s[pos] == 0 && length < 0)) { return U_SENTINEL; }
882        UChar32 c;
883        U8_NEXT_OR_FFFD(s, pos, length, c);
884        return c;
885    }
886
887    const uint8_t *s;
888    int32_t pos;
889    int32_t length;
890};
891
892class FCDUTF8NFDIterator : public NFDIterator {
893public:
894    FCDUTF8NFDIterator(const CollationData *data, const uint8_t *text, int32_t textLength)
895            : u8ci(data, FALSE, text, 0, textLength) {}
896protected:
897    virtual UChar32 nextRawCodePoint() {
898        UErrorCode errorCode = U_ZERO_ERROR;
899        return u8ci.nextCodePoint(errorCode);
900    }
901private:
902    FCDUTF8CollationIterator u8ci;
903};
904
905class UIterNFDIterator : public NFDIterator {
906public:
907    UIterNFDIterator(UCharIterator &it) : iter(it) {}
908protected:
909    virtual UChar32 nextRawCodePoint() {
910        return uiter_next32(&iter);
911    }
912private:
913    UCharIterator &iter;
914};
915
916class FCDUIterNFDIterator : public NFDIterator {
917public:
918    FCDUIterNFDIterator(const CollationData *data, UCharIterator &it, int32_t startIndex)
919            : uici(data, FALSE, it, startIndex) {}
920protected:
921    virtual UChar32 nextRawCodePoint() {
922        UErrorCode errorCode = U_ZERO_ERROR;
923        return uici.nextCodePoint(errorCode);
924    }
925private:
926    FCDUIterCollationIterator uici;
927};
928
929UCollationResult compareNFDIter(const Normalizer2Impl &nfcImpl,
930                                NFDIterator &left, NFDIterator &right) {
931    for(;;) {
932        // Fetch the next FCD code point from each string.
933        UChar32 leftCp = left.nextCodePoint();
934        UChar32 rightCp = right.nextCodePoint();
935        if(leftCp == rightCp) {
936            if(leftCp < 0) { break; }
937            continue;
938        }
939        // If they are different, then decompose each and compare again.
940        if(leftCp < 0) {
941            leftCp = -2;  // end of string
942        } else if(leftCp == 0xfffe) {
943            leftCp = -1;  // U+FFFE: merge separator
944        } else {
945            leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp);
946        }
947        if(rightCp < 0) {
948            rightCp = -2;  // end of string
949        } else if(rightCp == 0xfffe) {
950            rightCp = -1;  // U+FFFE: merge separator
951        } else {
952            rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp);
953        }
954        if(leftCp < rightCp) { return UCOL_LESS; }
955        if(leftCp > rightCp) { return UCOL_GREATER; }
956    }
957    return UCOL_EQUAL;
958}
959
960}  // namespace
961
/**
 * Compares two UTF-16 strings using this collator's data and settings.
 *
 * Steps, in order:
 * 1. Skip the code units that are identical at the start of both strings.
 * 2. If a string continues after that prefix with a code unit for which
 *    data->isUnsafeBackward() is true, shorten the prefix until it ends before
 *    a unit that is not unsafe (or becomes empty).
 * 3. If settings->fastLatinOptions is not negative and each string either ends
 *    at the prefix or continues with a unit <= CollationFastLatin::LATIN_MAX,
 *    try CollationFastLatin::compareUTF16().
 * 4. If that was not tried or returned BAIL_OUT_RESULT, compare via collation iterators
 *    (the FCD-checking variants unless settings->dontCheckFCD()).
 * 5. If the result is UCOL_EQUAL and the strength is at least UCOL_IDENTICAL,
 *    compare the remainders after the prefix with compareNFDIter().
 *
 * @param left        left string
 * @param leftLength  number of code units in left; if negative, both strings are
 *                    scanned as NUL-terminated (rightLength is not used for the scan;
 *                    callers presumably pass a negative rightLength as well -- TODO confirm)
 * @param right       right string
 * @param rightLength number of code units in right
 * @param errorCode   error code; must not indicate a failure on entry (checked by the caller)
 * @return the comparison result; if errorCode indicates a failure after step 4,
 *         the result of that step is returned as is
 */
UCollationResult
RuleBasedCollator::doCompare(const UChar *left, int32_t leftLength,
                             const UChar *right, int32_t rightLength,
                             UErrorCode &errorCode) const {
    // U_FAILURE(errorCode) checked by caller.
    // Same pointer and same length: nothing to compare.
    if(left == right && leftLength == rightLength) {
        return UCOL_EQUAL;
    }

    // Identical-prefix test.
    const UChar *leftLimit;
    const UChar *rightLimit;
    int32_t equalPrefixLength = 0;
    if(leftLength < 0) {
        // NUL-terminated: NULL limits tell the iterators below to stop at NUL.
        leftLimit = NULL;
        rightLimit = NULL;
        UChar c;
        while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
            // Both strings reached their terminators together.
            if(c == 0) { return UCOL_EQUAL; }
            ++equalPrefixLength;
        }
    } else {
        leftLimit = left + leftLength;
        rightLimit = right + rightLength;
        for(;;) {
            if(equalPrefixLength == leftLength) {
                // Left is exhausted; the strings are identical if right is exhausted too.
                if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
                break;
            } else if(equalPrefixLength == rightLength ||
                      left[equalPrefixLength] != right[equalPrefixLength]) {
                break;
            }
            ++equalPrefixLength;
        }
    }

    UBool numeric = settings->isNumeric();
    if(equalPrefixLength > 0) {
        // Look at the first unit after the prefix in each string (if there is one).
        if((equalPrefixLength != leftLength &&
                    data->isUnsafeBackward(left[equalPrefixLength], numeric)) ||
                (equalPrefixLength != rightLength &&
                    data->isUnsafeBackward(right[equalPrefixLength], numeric))) {
            // Identical prefix: Back up to the start of a contraction or reordering sequence.
            // Only left is inspected because the prefix is the same in both strings.
            while(--equalPrefixLength > 0 &&
                    data->isUnsafeBackward(left[equalPrefixLength], numeric)) {}
        }
        // Notes:
        // - A longer string can compare equal to a prefix of it if only ignorables follow.
        // - With a backward level, a longer string can compare less-than a prefix of it.

        // Pass the actual start of each string into the CollationIterators,
        // plus the equalPrefixLength position,
        // so that prefix matches back into the equal prefix work.
    }

    int32_t result;
    int32_t fastLatinOptions = settings->fastLatinOptions;
    // Fast Latin path: only when enabled (options not negative) and when each string
    // either ends here or continues with a unit up to LATIN_MAX.
    if(fastLatinOptions >= 0 &&
            (equalPrefixLength == leftLength ||
                left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX) &&
            (equalPrefixLength == rightLength ||
                right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX)) {
        if(leftLength >= 0) {
            // Length-delimited: pass the remaining lengths after the prefix.
            result = CollationFastLatin::compareUTF16(data->fastLatinTable,
                                                      settings->fastLatinPrimaries,
                                                      fastLatinOptions,
                                                      left + equalPrefixLength,
                                                      leftLength - equalPrefixLength,
                                                      right + equalPrefixLength,
                                                      rightLength - equalPrefixLength);
        } else {
            // NUL-terminated: pass -1 for both lengths.
            result = CollationFastLatin::compareUTF16(data->fastLatinTable,
                                                      settings->fastLatinPrimaries,
                                                      fastLatinOptions,
                                                      left + equalPrefixLength, -1,
                                                      right + equalPrefixLength, -1);
        }
    } else {
        result = CollationFastLatin::BAIL_OUT_RESULT;
    }

    // Regular path via collation iterators; each gets the string start,
    // the position after the equal prefix, and the limit.
    if(result == CollationFastLatin::BAIL_OUT_RESULT) {
        if(settings->dontCheckFCD()) {
            UTF16CollationIterator leftIter(data, numeric,
                                            left, left + equalPrefixLength, leftLimit);
            UTF16CollationIterator rightIter(data, numeric,
                                            right, right + equalPrefixLength, rightLimit);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        } else {
            FCDUTF16CollationIterator leftIter(data, numeric,
                                              left, left + equalPrefixLength, leftLimit);
            FCDUTF16CollationIterator rightIter(data, numeric,
                                                right, right + equalPrefixLength, rightLimit);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        }
    }
    // Done unless the strings are equal so far and the identical level is requested.
    if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
        return (UCollationResult)result;
    }

    // Note: If NUL-terminated, we could get the actual limits from the iterators now.
    // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
    // and the benefit seems unlikely to be measurable.

    // Compare identical level.
    // Only the text after the equal prefix needs to be looked at.
    const Normalizer2Impl &nfcImpl = data->nfcImpl;
    left += equalPrefixLength;
    right += equalPrefixLength;
    if(settings->dontCheckFCD()) {
        UTF16NFDIterator leftIter(left, leftLimit);
        UTF16NFDIterator rightIter(right, rightLimit);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    } else {
        FCDUTF16NFDIterator leftIter(nfcImpl, left, leftLimit);
        FCDUTF16NFDIterator rightIter(nfcImpl, right, rightLimit);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    }
}
1080
/**
 * Compares two UTF-8 strings using this collator's data and settings.
 *
 * Steps, in order:
 * 1. Skip the bytes that are identical at the start of both strings.
 * 2. If a string continues after that prefix with a UTF-8 trail byte,
 *    shorten the prefix so that it does not end inside a byte sequence.
 * 3. If a string continues after the prefix with a code point for which
 *    data->isUnsafeBackward() is true, move the prefix end back code point by code point
 *    until a code point that is not unsafe was passed (or the prefix is empty).
 * 4. If settings->fastLatinOptions is not negative and each string either ends
 *    at the prefix or continues with a byte <= CollationFastLatin::LATIN_MAX_UTF8_LEAD,
 *    try CollationFastLatin::compareUTF8().
 * 5. If that was not tried or returned BAIL_OUT_RESULT, compare via collation iterators
 *    (the FCD-checking variants unless settings->dontCheckFCD()).
 * 6. If the result is UCOL_EQUAL and the strength is at least UCOL_IDENTICAL,
 *    compare the remainders after the prefix with compareNFDIter().
 *
 * @param left        left string
 * @param leftLength  number of bytes in left; if negative, both strings are
 *                    scanned as NUL-terminated (rightLength is not used for the scan;
 *                    callers presumably pass a negative rightLength as well -- TODO confirm)
 * @param right       right string
 * @param rightLength number of bytes in right
 * @param errorCode   error code; must not indicate a failure on entry (checked by the caller)
 * @return the comparison result; if errorCode indicates a failure after step 5,
 *         the result of that step is returned as is
 */
UCollationResult
RuleBasedCollator::doCompare(const uint8_t *left, int32_t leftLength,
                             const uint8_t *right, int32_t rightLength,
                             UErrorCode &errorCode) const {
    // U_FAILURE(errorCode) checked by caller.
    // Same pointer and same length: nothing to compare.
    if(left == right && leftLength == rightLength) {
        return UCOL_EQUAL;
    }

    // Identical-prefix test.
    int32_t equalPrefixLength = 0;
    if(leftLength < 0) {
        // NUL-terminated strings.
        uint8_t c;
        while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
            // Both strings reached their terminators together.
            if(c == 0) { return UCOL_EQUAL; }
            ++equalPrefixLength;
        }
    } else {
        for(;;) {
            if(equalPrefixLength == leftLength) {
                // Left is exhausted; the strings are identical if right is exhausted too.
                if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
                break;
            } else if(equalPrefixLength == rightLength ||
                      left[equalPrefixLength] != right[equalPrefixLength]) {
                break;
            }
            ++equalPrefixLength;
        }
    }
    // Back up to the start of a partially-equal code point.
    // (The byte after the prefix is a trail byte in at least one of the strings.)
    if(equalPrefixLength > 0 &&
            ((equalPrefixLength != leftLength && U8_IS_TRAIL(left[equalPrefixLength])) ||
            (equalPrefixLength != rightLength && U8_IS_TRAIL(right[equalPrefixLength])))) {
        while(--equalPrefixLength > 0 && U8_IS_TRAIL(left[equalPrefixLength])) {}
    }

    UBool numeric = settings->isNumeric();
    if(equalPrefixLength > 0) {
        // Decode the first code point after the prefix in each string (if there is one)
        // and check whether it is unsafe. i is a scratch offset so that
        // equalPrefixLength itself is not advanced.
        UBool unsafe = FALSE;
        if(equalPrefixLength != leftLength) {
            int32_t i = equalPrefixLength;
            UChar32 c;
            U8_NEXT_OR_FFFD(left, i, leftLength, c);
            unsafe = data->isUnsafeBackward(c, numeric);
        }
        if(!unsafe && equalPrefixLength != rightLength) {
            int32_t i = equalPrefixLength;
            UChar32 c;
            U8_NEXT_OR_FFFD(right, i, rightLength, c);
            unsafe = data->isUnsafeBackward(c, numeric);
        }
        if(unsafe) {
            // Identical prefix: Back up to the start of a contraction or reordering sequence.
            // U8_PREV_OR_FFFD moves equalPrefixLength back by one code point per iteration.
            // Only left is inspected because the prefix is the same in both strings.
            UChar32 c;
            do {
                U8_PREV_OR_FFFD(left, 0, equalPrefixLength, c);
            } while(equalPrefixLength > 0 && data->isUnsafeBackward(c, numeric));
        }
        // See the notes in the UTF-16 version.

        // Pass the actual start of each string into the CollationIterators,
        // plus the equalPrefixLength position,
        // so that prefix matches back into the equal prefix work.
    }

    int32_t result;
    int32_t fastLatinOptions = settings->fastLatinOptions;
    // Fast Latin path: only when enabled (options not negative) and when each string
    // either ends here or continues with a byte up to LATIN_MAX_UTF8_LEAD.
    if(fastLatinOptions >= 0 &&
            (equalPrefixLength == leftLength ||
                left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD) &&
            (equalPrefixLength == rightLength ||
                right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD)) {
        if(leftLength >= 0) {
            // Length-delimited: pass the remaining lengths after the prefix.
            result = CollationFastLatin::compareUTF8(data->fastLatinTable,
                                                     settings->fastLatinPrimaries,
                                                     fastLatinOptions,
                                                     left + equalPrefixLength,
                                                     leftLength - equalPrefixLength,
                                                     right + equalPrefixLength,
                                                     rightLength - equalPrefixLength);
        } else {
            // NUL-terminated: pass -1 for both lengths.
            result = CollationFastLatin::compareUTF8(data->fastLatinTable,
                                                     settings->fastLatinPrimaries,
                                                     fastLatinOptions,
                                                     left + equalPrefixLength, -1,
                                                     right + equalPrefixLength, -1);
        }
    } else {
        result = CollationFastLatin::BAIL_OUT_RESULT;
    }

    // Regular path via collation iterators; each gets the string start,
    // the offset after the equal prefix, and the original length.
    if(result == CollationFastLatin::BAIL_OUT_RESULT) {
        if(settings->dontCheckFCD()) {
            UTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
            UTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        } else {
            FCDUTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
            FCDUTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        }
    }
    // Done unless the strings are equal so far and the identical level is requested.
    if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
        return (UCollationResult)result;
    }

    // Note: If NUL-terminated, we could get the actual limits from the iterators now.
    // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
    // and the benefit seems unlikely to be measurable.

    // Compare identical level.
    // Only the text after the equal prefix needs to be looked at.
    const Normalizer2Impl &nfcImpl = data->nfcImpl;
    left += equalPrefixLength;
    right += equalPrefixLength;
    // Adjust the lengths to the advanced pointers. Not done for negative (NUL-terminated)
    // lengths; for leftLength==0 the prefix is empty and there is nothing to subtract.
    if(leftLength > 0) {
        leftLength -= equalPrefixLength;
        rightLength -= equalPrefixLength;
    }
    if(settings->dontCheckFCD()) {
        UTF8NFDIterator leftIter(left, leftLength);
        UTF8NFDIterator rightIter(right, rightLength);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    } else {
        FCDUTF8NFDIterator leftIter(data, left, leftLength);
        FCDUTF8NFDIterator rightIter(data, right, rightLength);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    }
}
1209
/**
 * Compares the text of two UCharIterators, reading each from its current position.
 *
 * Steps, in order:
 * 1. Read code units from both iterators while they are identical.
 * 2. Step back over the differing units, and further back while
 *    data->isUnsafeBackward() is true for the unit that was stepped over.
 * 3. Compare via collation iterators
 *    (the FCD-checking variants unless settings->dontCheckFCD()).
 * 4. If the result is UCOL_EQUAL and the strength is at least UCOL_IDENTICAL,
 *    reposition both iterators to index equalPrefixLength and
 *    compare with compareNFDIter().
 *
 * Both iterators are moved by this function.
 *
 * @param left      iterator over the left text
 * @param right     iterator over the right text
 * @param errorCode error code; if it indicates a failure on entry, UCOL_EQUAL is returned
 * @return the comparison result; UCOL_EQUAL also if left and right are the same object
 */
UCollationResult
RuleBasedCollator::compare(UCharIterator &left, UCharIterator &right,
                           UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode) || &left == &right) { return UCOL_EQUAL; }
    UBool numeric = settings->isNumeric();

    // Identical-prefix test.
    int32_t equalPrefixLength = 0;
    {
        UChar32 leftUnit;
        UChar32 rightUnit;
        while((leftUnit = left.next(&left)) == (rightUnit = right.next(&right))) {
            // A negative unit means the end of the text: both ended together.
            if(leftUnit < 0) { return UCOL_EQUAL; }
            ++equalPrefixLength;
        }

        // Back out the code units that differed, for the real collation comparison.
        // (An iterator that returned a negative value is not moved back.)
        if(leftUnit >= 0) { left.previous(&left); }
        if(rightUnit >= 0) { right.previous(&right); }

        if(equalPrefixLength > 0) {
            if((leftUnit >= 0 && data->isUnsafeBackward(leftUnit, numeric)) ||
                    (rightUnit >= 0 && data->isUnsafeBackward(rightUnit, numeric))) {
                // Identical prefix: Back up to the start of a contraction or reordering sequence.
                // Both iterators are moved in lockstep; only the left unit is inspected
                // because the prefix is the same in both texts.
                do {
                    --equalPrefixLength;
                    leftUnit = left.previous(&left);
                    right.previous(&right);
                } while(equalPrefixLength > 0 && data->isUnsafeBackward(leftUnit, numeric));
            }
            // See the notes in the UTF-16 version.
        }
    }

    UCollationResult result;
    if(settings->dontCheckFCD()) {
        UIterCollationIterator leftIter(data, numeric, left);
        UIterCollationIterator rightIter(data, numeric, right);
        result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
    } else {
        // The FCD-checking iterators additionally get the start index.
        FCDUIterCollationIterator leftIter(data, numeric, left, equalPrefixLength);
        FCDUIterCollationIterator rightIter(data, numeric, right, equalPrefixLength);
        result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
    }
    // Done unless the texts are equal so far and the identical level is requested.
    if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
        return result;
    }

    // Compare identical level.
    // The collation iterators moved left and right; set them back to the end of the equal prefix.
    left.move(&left, equalPrefixLength, UITER_ZERO);
    right.move(&right, equalPrefixLength, UITER_ZERO);
    const Normalizer2Impl &nfcImpl = data->nfcImpl;
    if(settings->dontCheckFCD()) {
        UIterNFDIterator leftIter(left);
        UIterNFDIterator rightIter(right);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    } else {
        FCDUIterNFDIterator leftIter(data, left, equalPrefixLength);
        FCDUIterNFDIterator rightIter(data, right, equalPrefixLength);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    }
}
1272
1273CollationKey &
1274RuleBasedCollator::getCollationKey(const UnicodeString &s, CollationKey &key,
1275                                   UErrorCode &errorCode) const {
1276    return getCollationKey(s.getBuffer(), s.length(), key, errorCode);
1277}
1278
1279CollationKey &
1280RuleBasedCollator::getCollationKey(const UChar *s, int32_t length, CollationKey& key,
1281                                   UErrorCode &errorCode) const {
1282    if(U_FAILURE(errorCode)) {
1283        return key.setToBogus();
1284    }
1285    if(s == NULL && length != 0) {
1286        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1287        return key.setToBogus();
1288    }
1289    key.reset();  // resets the "bogus" state
1290    CollationKeyByteSink sink(key);
1291    writeSortKey(s, length, sink, errorCode);
1292    if(U_FAILURE(errorCode)) {
1293        key.setToBogus();
1294    } else if(key.isBogus()) {
1295        errorCode = U_MEMORY_ALLOCATION_ERROR;
1296    } else {
1297        key.setLength(sink.NumberOfBytesAppended());
1298    }
1299    return key;
1300}
1301
1302int32_t
1303RuleBasedCollator::getSortKey(const UnicodeString &s,
1304                              uint8_t *dest, int32_t capacity) const {
1305    return getSortKey(s.getBuffer(), s.length(), dest, capacity);
1306}
1307
1308int32_t
1309RuleBasedCollator::getSortKey(const UChar *s, int32_t length,
1310                              uint8_t *dest, int32_t capacity) const {
1311    if((s == NULL && length != 0) || capacity < 0 || (dest == NULL && capacity > 0)) {
1312        return 0;
1313    }
1314    uint8_t noDest[1] = { 0 };
1315    if(dest == NULL) {
1316        // Distinguish pure preflighting from an allocation error.
1317        dest = noDest;
1318        capacity = 0;
1319    }
1320    FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), capacity);
1321    UErrorCode errorCode = U_ZERO_ERROR;
1322    writeSortKey(s, length, sink, errorCode);
1323    return U_SUCCESS(errorCode) ? sink.NumberOfBytesAppended() : 0;
1324}
1325
1326void
1327RuleBasedCollator::writeSortKey(const UChar *s, int32_t length,
1328                                SortKeyByteSink &sink, UErrorCode &errorCode) const {
1329    if(U_FAILURE(errorCode)) { return; }
1330    const UChar *limit = (length >= 0) ? s + length : NULL;
1331    UBool numeric = settings->isNumeric();
1332    CollationKeys::LevelCallback callback;
1333    if(settings->dontCheckFCD()) {
1334        UTF16CollationIterator iter(data, numeric, s, s, limit);
1335        CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
1336                                                  sink, Collation::PRIMARY_LEVEL,
1337                                                  callback, TRUE, errorCode);
1338    } else {
1339        FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
1340        CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
1341                                                  sink, Collation::PRIMARY_LEVEL,
1342                                                  callback, TRUE, errorCode);
1343    }
1344    if(settings->getStrength() == UCOL_IDENTICAL) {
1345        writeIdenticalLevel(s, limit, sink, errorCode);
1346    }
1347    static const char terminator = 0;  // TERMINATOR_BYTE
1348    sink.Append(&terminator, 1);
1349}
1350
1351void
1352RuleBasedCollator::writeIdenticalLevel(const UChar *s, const UChar *limit,
1353                                       SortKeyByteSink &sink, UErrorCode &errorCode) const {
1354    // NFD quick check
1355    const UChar *nfdQCYesLimit = data->nfcImpl.decompose(s, limit, NULL, errorCode);
1356    if(U_FAILURE(errorCode)) { return; }
1357    sink.Append(Collation::LEVEL_SEPARATOR_BYTE);
1358    UChar32 prev = 0;
1359    if(nfdQCYesLimit != s) {
1360        prev = u_writeIdenticalLevelRun(prev, s, (int32_t)(nfdQCYesLimit - s), sink);
1361    }
1362    // Is there non-NFD text?
1363    int32_t destLengthEstimate;
1364    if(limit != NULL) {
1365        if(nfdQCYesLimit == limit) { return; }
1366        destLengthEstimate = (int32_t)(limit - nfdQCYesLimit);
1367    } else {
1368        // s is NUL-terminated
1369        if(*nfdQCYesLimit == 0) { return; }
1370        destLengthEstimate = -1;
1371    }
1372    UnicodeString nfd;
1373    data->nfcImpl.decompose(nfdQCYesLimit, limit, nfd, destLengthEstimate, errorCode);
1374    u_writeIdenticalLevelRun(prev, nfd.getBuffer(), nfd.length(), sink);
1375}
1376
1377namespace {
1378
1379/**
1380 * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary()
1381 * with an instance of this callback class.
1382 * When another level is about to be written, the callback
1383 * records the level and the number of bytes that will be written until
1384 * the sink (which is actually a FixedSortKeyByteSink) fills up.
1385 *
1386 * When internalNextSortKeyPart() is called again, it restarts with the last level
1387 * and ignores as many bytes as were written previously for that level.
1388 */
1389class PartLevelCallback : public CollationKeys::LevelCallback {
1390public:
1391    PartLevelCallback(const SortKeyByteSink &s)
1392            : sink(s), level(Collation::PRIMARY_LEVEL) {
1393        levelCapacity = sink.GetRemainingCapacity();
1394    }
1395    virtual ~PartLevelCallback() {}
1396    virtual UBool needToWrite(Collation::Level l) {
1397        if(!sink.Overflowed()) {
1398            // Remember a level that will be at least partially written.
1399            level = l;
1400            levelCapacity = sink.GetRemainingCapacity();
1401            return TRUE;
1402        } else {
1403            return FALSE;
1404        }
1405    }
1406    Collation::Level getLevel() const { return level; }
1407    int32_t getLevelCapacity() const { return levelCapacity; }
1408
1409private:
1410    const SortKeyByteSink &sink;
1411    Collation::Level level;
1412    int32_t levelCapacity;
1413};
1414
1415}  // namespace
1416
/**
 * Writes the next up-to-count bytes of the sort key for the text of iter into dest.
 * The text is re-read from its start on every call; state tells where to resume:
 * state[0] is the level to restart with, and state[1] is the number of bytes
 * that the sink is told to ignore before it starts storing bytes.
 *
 * @param iter      iterator over the text; moved to its start and read by this function
 * @param state     in/out resume state, see above; set to { ZERO_LEVEL, 0 }
 *                  once the key has been written to its end
 * @param dest      output buffer; may be NULL only if count is 0
 * @param count     capacity of dest in bytes
 * @param errorCode error code; set to U_ILLEGAL_ARGUMENT_ERROR for a NULL iter or state,
 *                  a negative count, or a NULL dest with a positive count
 * @return count if dest was filled and more bytes were appended than fit;
 *         otherwise the number of bytes appended, with the rest of dest set to 00 bytes;
 *         0 on failure or if count is 0
 */
int32_t
RuleBasedCollator::internalNextSortKeyPart(UCharIterator *iter, uint32_t state[2],
                                           uint8_t *dest, int32_t count, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return 0; }
    if(iter == NULL || state == NULL || count < 0 || (count > 0 && dest == NULL)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    // Nothing can be written; state is left unchanged.
    if(count == 0) { return 0; }

    FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), count);
    // Skip the bytes of the resume level that earlier calls delivered already.
    sink.IgnoreBytes((int32_t)state[1]);
    iter->move(iter, 0, UITER_START);

    Collation::Level level = (Collation::Level)state[0];
    if(level <= Collation::QUATERNARY_LEVEL) {
        UBool numeric = settings->isNumeric();
        // The callback tracks the last level that was started before the sink overflowed.
        PartLevelCallback callback(sink);
        if(settings->dontCheckFCD()) {
            UIterCollationIterator ci(data, numeric, *iter);
            CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
                                                      sink, level, callback, FALSE, errorCode);
        } else {
            FCDUIterCollationIterator ci(data, numeric, *iter, 0);
            CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
                                                      sink, level, callback, FALSE, errorCode);
        }
        if(U_FAILURE(errorCode)) { return 0; }
        // More bytes were appended than fit: dest is full.
        // Save where to resume and report a full buffer.
        if(sink.NumberOfBytesAppended() > count) {
            state[0] = (uint32_t)callback.getLevel();
            state[1] = (uint32_t)callback.getLevelCapacity();
            return count;
        }
        // All of the normal levels are done.
        if(settings->getStrength() == UCOL_IDENTICAL) {
            level = Collation::IDENTICAL_LEVEL;
            // Rewind: the identical level reads the text again from its start.
            iter->move(iter, 0, UITER_START);
        }
        // else fall through to setting ZERO_LEVEL
    }

    if(level == Collation::IDENTICAL_LEVEL) {
        // Capacity at the start of this level, saved as state[1] if the sink fills up.
        int32_t levelCapacity = sink.GetRemainingCapacity();
        // Collect all code units from the iterator because
        // writeIdenticalLevel() works on a UTF-16 array.
        UnicodeString s;
        for(;;) {
            UChar32 c = iter->next(iter);
            if(c < 0) { break; }
            s.append((UChar)c);
        }
        const UChar *sArray = s.getBuffer();
        writeIdenticalLevel(sArray, sArray + s.length(), sink, errorCode);
        if(U_FAILURE(errorCode)) { return 0; }
        if(sink.NumberOfBytesAppended() > count) {
            state[0] = (uint32_t)level;
            state[1] = (uint32_t)levelCapacity;
            return count;
        }
    }

    // ZERO_LEVEL: Fill the remainder of dest with 00 bytes.
    state[0] = (uint32_t)Collation::ZERO_LEVEL;
    state[1] = 0;
    int32_t length = sink.NumberOfBytesAppended();
    int32_t i = length;
    while(i < count) { dest[i++] = 0; }
    return length;
}
1484
1485void
1486RuleBasedCollator::internalGetCEs(const UnicodeString &str, UVector64 &ces,
1487                                  UErrorCode &errorCode) const {
1488    if(U_FAILURE(errorCode)) { return; }
1489    const UChar *s = str.getBuffer();
1490    const UChar *limit = s + str.length();
1491    UBool numeric = settings->isNumeric();
1492    if(settings->dontCheckFCD()) {
1493        UTF16CollationIterator iter(data, numeric, s, s, limit);
1494        int64_t ce;
1495        while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
1496            ces.addElement(ce, errorCode);
1497        }
1498    } else {
1499        FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
1500        int64_t ce;
1501        while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
1502            ces.addElement(ce, errorCode);
1503        }
1504    }
1505}
1506
1507namespace {
1508
1509void appendSubtag(CharString &s, char letter, const char *subtag, int32_t length,
1510                  UErrorCode &errorCode) {
1511    if(U_FAILURE(errorCode) || length == 0) { return; }
1512    if(!s.isEmpty()) {
1513        s.append('_', errorCode);
1514    }
1515    s.append(letter, errorCode);
1516    for(int32_t i = 0; i < length; ++i) {
1517        s.append(uprv_toupper(subtag[i]), errorCode);
1518    }
1519}
1520
1521void appendAttribute(CharString &s, char letter, UColAttributeValue value,
1522                     UErrorCode &errorCode) {
1523    if(U_FAILURE(errorCode)) { return; }
1524    if(!s.isEmpty()) {
1525        s.append('_', errorCode);
1526    }
1527    static const char *valueChars = "1234...........IXO..SN..LU......";
1528    s.append(letter, errorCode);
1529    s.append(valueChars[value], errorCode);
1530}
1531
1532}  // namespace
1533
/**
 * Builds the short definition string for this collator and writes it to buffer.
 * The string consists of items separated by underscores, each starting with a letter:
 * attribute items (A, C, D, E, F, N, S) only for attributes that have been set explicitly,
 * and locale items (K, L, R, V, Z) from the functionally equivalent collation locale.
 *
 * @param locale    locale ID; if NULL, the collator's valid locale is used
 * @param buffer    output buffer; may be NULL only if capacity is 0
 * @param capacity  size of buffer; must not be negative
 * @param errorCode error code; set to U_ILLEGAL_ARGUMENT_ERROR for an invalid
 *                  buffer/capacity combination
 * @return the value returned by u_terminateChars() for the length of the full string,
 *         or 0 on failure
 */
int32_t
RuleBasedCollator::internalGetShortDefinitionString(const char *locale,
                                                    char *buffer, int32_t capacity,
                                                    UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return 0; }
    // A NULL buffer requires capacity 0; otherwise capacity must not be negative.
    if(buffer == NULL ? capacity != 0 : capacity < 0) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if(locale == NULL) {
        locale = internalGetLocaleID(ULOC_VALID_LOCALE, errorCode);
    }

    // One extra char so that there is always room for the terminating NUL set below.
    char resultLocale[ULOC_FULLNAME_CAPACITY + 1];
    int32_t length = ucol_getFunctionalEquivalent(resultLocale, ULOC_FULLNAME_CAPACITY,
                                                  "collation", locale,
                                                  NULL, &errorCode);
    if(U_FAILURE(errorCode)) { return 0; }
    if(length == 0) {
        // An empty result is replaced with "root".
        uprv_strcpy(resultLocale, "root");
    } else {
        resultLocale[length] = 0;
    }

    // Append items in alphabetic order of their short definition letters.
    CharString result;
    // Scratch buffer that is reused for each locale subtag.
    char subtag[ULOC_KEYWORD_AND_VALUES_CAPACITY];

    if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING)) {
        appendAttribute(result, 'A', getAttribute(UCOL_ALTERNATE_HANDLING, errorCode), errorCode);
    }
    // ATTR_VARIABLE_TOP not supported because 'B' was broken.
    // See ICU tickets #10372 and #10386.
    if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST)) {
        appendAttribute(result, 'C', getAttribute(UCOL_CASE_FIRST, errorCode), errorCode);
    }
    if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION)) {
        appendAttribute(result, 'D', getAttribute(UCOL_NUMERIC_COLLATION, errorCode), errorCode);
    }
    if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL)) {
        appendAttribute(result, 'E', getAttribute(UCOL_CASE_LEVEL, errorCode), errorCode);
    }
    if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION)) {
        appendAttribute(result, 'F', getAttribute(UCOL_FRENCH_COLLATION, errorCode), errorCode);
    }
    // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default.
    // K: value of the "collation" keyword
    length = uloc_getKeywordValue(resultLocale, "collation", subtag, LENGTHOF(subtag), &errorCode);
    appendSubtag(result, 'K', subtag, length, errorCode);
    // L: language
    length = uloc_getLanguage(resultLocale, subtag, LENGTHOF(subtag), &errorCode);
    appendSubtag(result, 'L', subtag, length, errorCode);
    if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE)) {
        appendAttribute(result, 'N', getAttribute(UCOL_NORMALIZATION_MODE, errorCode), errorCode);
    }
    // R: country
    length = uloc_getCountry(resultLocale, subtag, LENGTHOF(subtag), &errorCode);
    appendSubtag(result, 'R', subtag, length, errorCode);
    if(attributeHasBeenSetExplicitly(UCOL_STRENGTH)) {
        appendAttribute(result, 'S', getAttribute(UCOL_STRENGTH, errorCode), errorCode);
    }
    // V: variant
    length = uloc_getVariant(resultLocale, subtag, LENGTHOF(subtag), &errorCode);
    appendSubtag(result, 'V', subtag, length, errorCode);
    // Z: script
    length = uloc_getScript(resultLocale, subtag, LENGTHOF(subtag), &errorCode);
    appendSubtag(result, 'Z', subtag, length, errorCode);

    // The append helpers do nothing once errorCode indicates a failure,
    // so one check at the end covers all of the calls above.
    if(U_FAILURE(errorCode)) { return 0; }
    // Copy only if the whole string fits; termination and the return value
    // are left to u_terminateChars().
    if(result.length() <= capacity) {
        uprv_memcpy(buffer, result.data(), result.length());
    }
    return u_terminateChars(buffer, capacity, result.length(), &errorCode);
}
1603
1604UBool
1605RuleBasedCollator::isUnsafe(UChar32 c) const {
1606    return data->isUnsafeBackward(c, settings->isNumeric());
1607}
1608
1609void
1610RuleBasedCollator::computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode) {
1611    t->maxExpansions = CollationElementIterator::computeMaxExpansions(t->data, errorCode);
1612}
1613
1614UBool
1615RuleBasedCollator::initMaxExpansions(UErrorCode &errorCode) const {
1616    umtx_initOnce(tailoring->maxExpansionsInitOnce, computeMaxExpansions, tailoring, errorCode);
1617    return U_SUCCESS(errorCode);
1618}
1619
1620CollationElementIterator *
1621RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) const {
1622    UErrorCode errorCode = U_ZERO_ERROR;
1623    if(!initMaxExpansions(errorCode)) { return NULL; }
1624    CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
1625    if(U_FAILURE(errorCode)) {
1626        delete cei;
1627        return NULL;
1628    }
1629    return cei;
1630}
1631
1632CollationElementIterator *
1633RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const {
1634    UErrorCode errorCode = U_ZERO_ERROR;
1635    if(!initMaxExpansions(errorCode)) { return NULL; }
1636    CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
1637    if(U_FAILURE(errorCode)) {
1638        delete cei;
1639        return NULL;
1640    }
1641    return cei;
1642}
1643
1644int32_t
1645RuleBasedCollator::getMaxExpansion(int32_t order) const {
1646    UErrorCode errorCode = U_ZERO_ERROR;
1647    (void)initMaxExpansions(errorCode);
1648    return CollationElementIterator::getMaxExpansion(tailoring->maxExpansions, order);
1649}
1650
1651U_NAMESPACE_END
1652
1653#endif  // !UCONFIG_NO_COLLATION
1654