1/*
2*******************************************************************************
3* Copyright (C) 2009-2014, International Business Machines Corporation and
4* others. All Rights Reserved.
5*******************************************************************************
6*/
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_COLLATION
11
12#include "unicode/alphaindex.h"
13#include "unicode/coll.h"
14#include "unicode/localpointer.h"
15#include "unicode/normalizer2.h"
16#include "unicode/tblcoll.h"
17#include "unicode/uchar.h"
18#include "unicode/ulocdata.h"
19#include "unicode/uniset.h"
20#include "unicode/uobject.h"
21#include "unicode/usetiter.h"
22#include "unicode/utf16.h"
23
24#include "cmemory.h"
25#include "cstring.h"
26#include "uassert.h"
27#include "uvector.h"
28#include "uvectr64.h"
29
30//#include <string>
31//#include <iostream>
32
33#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
34
35U_NAMESPACE_BEGIN
36
37namespace {
38
39/**
40 * Prefix string for Chinese index buckets.
41 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
42 */
43const UChar BASE[1] = { 0xFDD0 };
44const int32_t BASE_LENGTH = 1;
45
46UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
47                                const UnicodeString &one, const UnicodeString &other);
48
49}  // namespace
50
51static int32_t U_CALLCONV
52collatorComparator(const void *context, const void *left, const void *right);
53
54static int32_t U_CALLCONV
55recordCompareFn(const void *context, const void *left, const void *right);
56
57//  UVector<Record *> support function, delete a Record.
58static void U_CALLCONV
59alphaIndex_deleteRecord(void *obj) {
60    delete static_cast<AlphabeticIndex::Record *>(obj);
61}
62
63namespace {
64
65UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
66                           UErrorCode &errorCode) {
67    if (U_FAILURE(errorCode)) { return NULL; }
68    if (owned.isValid()) {
69        return owned.orphan();
70    }
71    UnicodeString *p = new UnicodeString(s);
72    if (p == NULL) {
73        errorCode = U_MEMORY_ALLOCATION_ERROR;
74    }
75    return p;
76}
77
78inline UnicodeString *getString(const UVector &list, int32_t i) {
79    return static_cast<UnicodeString *>(list[i]);
80}
81
82inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
83    return static_cast<AlphabeticIndex::Bucket *>(list[i]);
84}
85
86inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
87    return static_cast<AlphabeticIndex::Record *>(list[i]);
88}
89
90/**
91 * Like Java Collections.binarySearch(List, String, Comparator).
92 *
93 * @return the index>=0 where the item was found,
94 *         or the index<0 for inserting the string at ~index in sorted order
95 */
96int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
97    if (list.size() == 0) { return ~0; }
98    int32_t start = 0;
99    int32_t limit = list.size();
100    for (;;) {
101        int32_t i = (start + limit) / 2;
102        const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
103        UErrorCode errorCode = U_ZERO_ERROR;
104        UCollationResult cmp = coll.compare(s, *si, errorCode);
105        if (cmp == UCOL_EQUAL) {
106            return i;
107        } else if (cmp < 0) {
108            if (i == start) {
109                return ~start;  // insert s before *si
110            }
111            limit = i;
112        } else {
113            if (i == start) {
114                return ~(start + 1);  // insert s after *si
115            }
116            start = i;
117        }
118    }
119}
120
121}  // namespace
122
123// The BucketList is not in the anonymous namespace because only Clang
124// seems to support its use in other classes from there.
125// However, we also don't need U_I18N_API because it is not used from outside the i18n library.
126class BucketList : public UObject {
127public:
128    BucketList(UVector *bucketList, UVector *publicBucketList)
129            : bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
130        int32_t displayIndex = 0;
131        for (int32_t i = 0; i < publicBucketList->size(); ++i) {
132            getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
133        }
134    }
135
136    // The virtual destructor must not be inline.
137    // See ticket #8454 for details.
138    virtual ~BucketList();
139
140    int32_t getBucketCount() const {
141        return immutableVisibleList_->size();
142    }
143
144    int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
145                           UErrorCode &errorCode) {
146        // binary search
147        int32_t start = 0;
148        int32_t limit = bucketList_->size();
149        while ((start + 1) < limit) {
150            int32_t i = (start + limit) / 2;
151            const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
152            UCollationResult nameVsBucket =
153                collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
154            if (nameVsBucket < 0) {
155                limit = i;
156            } else {
157                start = i;
158            }
159        }
160        const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
161        if (bucket->displayBucket_ != NULL) {
162            bucket = bucket->displayBucket_;
163        }
164        return bucket->displayIndex_;
165    }
166
167    /** All of the buckets, visible and invisible. */
168    UVector *bucketList_;
169    /** Just the visible buckets. */
170    UVector *immutableVisibleList_;
171};
172
173BucketList::~BucketList() {
174    delete bucketList_;
175    if (immutableVisibleList_ != bucketList_) {
176        delete immutableVisibleList_;
177    }
178}
179
180AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
181    delete buckets_;
182    delete collatorPrimaryOnly_;
183}
184
185int32_t
186AlphabeticIndex::ImmutableIndex::getBucketCount() const {
187    return buckets_->getBucketCount();
188}
189
190int32_t
191AlphabeticIndex::ImmutableIndex::getBucketIndex(
192        const UnicodeString &name, UErrorCode &errorCode) const {
193    return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
194}
195
196const AlphabeticIndex::Bucket *
197AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
198    if (0 <= index && index < buckets_->getBucketCount()) {
199        return icu::getBucket(*buckets_->immutableVisibleList_, index);
200    } else {
201        return NULL;
202    }
203}
204
205AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
206        : inputList_(NULL),
207          labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
208          maxLabelCount_(99),
209          initialLabels_(NULL), firstCharsInScripts_(NULL),
210          collator_(NULL), collatorPrimaryOnly_(NULL),
211          buckets_(NULL) {
212    init(&locale, status);
213}
214
215
216AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
217        : inputList_(NULL),
218          labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
219          maxLabelCount_(99),
220          initialLabels_(NULL), firstCharsInScripts_(NULL),
221          collator_(collator), collatorPrimaryOnly_(NULL),
222          buckets_(NULL) {
223    init(NULL, status);
224}
225
226
227
228AlphabeticIndex::~AlphabeticIndex() {
229    delete collator_;
230    delete collatorPrimaryOnly_;
231    delete firstCharsInScripts_;
232    delete buckets_;
233    delete inputList_;
234    delete initialLabels_;
235}
236
237
238AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
239    if (U_FAILURE(status)) {
240        return *this;
241    }
242    initialLabels_->addAll(additions);
243    clearBuckets();
244    return *this;
245}
246
247
248AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
249    addIndexExemplars(locale, status);
250    clearBuckets();
251    return *this;
252}
253
254
255AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
256    if (U_FAILURE(errorCode)) { return NULL; }
257    // In C++, the ImmutableIndex must own its copy of the BucketList,
258    // even if it contains no records, for proper memory management.
259    // We could clone the buckets_ if they are not NULL,
260    // but that would be worth it only if this method is called multiple times,
261    // or called after using the old-style bucket iterator API.
262    LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
263    LocalPointer<RuleBasedCollator> coll(
264        static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone()));
265    if (immutableBucketList.isNull() || coll.isNull()) {
266        errorCode = U_MEMORY_ALLOCATION_ERROR;
267        return NULL;
268    }
269    ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
270    if (immIndex == NULL) {
271        errorCode = U_MEMORY_ALLOCATION_ERROR;
272        return NULL;
273    }
274    // The ImmutableIndex adopted its parameter objects.
275    immutableBucketList.orphan();
276    coll.orphan();
277    return immIndex;
278}
279
280int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
281    initBuckets(status);
282    if (U_FAILURE(status)) {
283        return 0;
284    }
285    return buckets_->getBucketCount();
286}
287
288
289int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
290    if (U_FAILURE(status) || inputList_ == NULL) {
291        return 0;
292    }
293    return inputList_->size();
294}
295
296void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
297    const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
298    if (U_FAILURE(errorCode)) { return; }
299
300    const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
301    const UnicodeString &overflowBoundary =
302        *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
303
304    // We make a sorted array of elements.
305    // Some of the input may be redundant.
306    // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
307    // We filter out those cases.
308    UnicodeSetIterator iter(*initialLabels_);
309    while (iter.next()) {
310        const UnicodeString *item = &iter.getString();
311        LocalPointer<UnicodeString> ownedItem;
312        UBool checkDistinct;
313        int32_t itemLength = item->length();
314        if (!item->hasMoreChar32Than(0, itemLength, 1)) {
315            checkDistinct = FALSE;
316        } else if(item->charAt(itemLength - 1) == 0x2a &&  // '*'
317                item->charAt(itemLength - 2) != 0x2a) {
318            // Use a label if it is marked with one trailing star,
319            // even if the label string sorts the same when all contractions are suppressed.
320            ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
321            item = ownedItem.getAlias();
322            if (item == NULL) {
323                errorCode = U_MEMORY_ALLOCATION_ERROR;
324                return;
325            }
326            checkDistinct = FALSE;
327        } else {
328            checkDistinct = TRUE;
329        }
330        if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
331            // Ignore a primary-ignorable or non-alphabetic index character.
332        } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
333            // Ignore an index character that will land in the overflow bucket.
334        } else if (checkDistinct &&
335                collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
336            // Ignore a multi-code point index character that does not sort distinctly
337            // from the sequence of its separate characters.
338        } else {
339            int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
340            if (insertionPoint < 0) {
341                indexCharacters.insertElementAt(
342                    ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
343            } else {
344                const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
345                if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
346                    indexCharacters.setElementAt(
347                        ownedString(*item, ownedItem, errorCode), insertionPoint);
348                }
349            }
350        }
351    }
352    if (U_FAILURE(errorCode)) { return; }
353
354    // if the result is still too large, cut down to maxCount elements, by removing every nth element
355
356    int32_t size = indexCharacters.size() - 1;
357    if (size > maxLabelCount_) {
358        int32_t count = 0;
359        int32_t old = -1;
360        for (int32_t i = 0; i < indexCharacters.size();) {
361            ++count;
362            int32_t bump = count * maxLabelCount_ / size;
363            if (bump == old) {
364                indexCharacters.removeElementAt(i);
365            } else {
366                old = bump;
367                ++i;
368            }
369        }
370    }
371}
372
373namespace {
374
375const UnicodeString &fixLabel(const UnicodeString &current, UnicodeString &temp) {
376    if (!current.startsWith(BASE, BASE_LENGTH)) {
377        return current;
378    }
379    UChar rest = current.charAt(BASE_LENGTH);
380    if (0x2800 < rest && rest <= 0x28FF) { // stroke count
381        int32_t count = rest-0x2800;
382        temp.setTo((UChar)(0x30 + count % 10));
383        if (count >= 10) {
384            count /= 10;
385            temp.insert(0, (UChar)(0x30 + count % 10));
386            if (count >= 10) {
387                count /= 10;
388                temp.insert(0, (UChar)(0x30 + count));
389            }
390        }
391        return temp.append((UChar)0x5283);
392    }
393    return temp.setTo(current, BASE_LENGTH);
394}
395
396UBool hasMultiplePrimaryWeights(
397        const RuleBasedCollator &coll, uint32_t variableTop,
398        const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) {
399    ces.removeAllElements();
400    coll.internalGetCEs(s, ces, errorCode);
401    if (U_FAILURE(errorCode)) { return FALSE; }
402    UBool seenPrimary = FALSE;
403    for (int32_t i = 0; i < ces.size(); ++i) {
404        int64_t ce = ces.elementAti(i);
405        uint32_t p = (uint32_t)(ce >> 32);
406        if (p > variableTop) {
407            // not primary ignorable
408            if (seenPrimary) {
409                return TRUE;
410            }
411            seenPrimary = TRUE;
412        }
413    }
414    return FALSE;
415}
416
417}  // namespace
418
419BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
420    // Initialize indexCharacters.
421    UVector indexCharacters(errorCode);
422    indexCharacters.setDeleter(uprv_deleteUObject);
423    initLabels(indexCharacters, errorCode);
424    if (U_FAILURE(errorCode)) { return NULL; }
425
426    // Variables for hasMultiplePrimaryWeights().
427    UVector64 ces(errorCode);
428    uint32_t variableTop;
429    if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
430        variableTop = collatorPrimaryOnly_->getVariableTop(errorCode);
431    } else {
432        variableTop = 0;
433    }
434    UBool hasInvisibleBuckets = FALSE;
435
436    // Helper arrays for Chinese Pinyin collation.
437    Bucket *asciiBuckets[26] = {
438        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
439        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
440    };
441    Bucket *pinyinBuckets[26] = {
442        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
443        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
444    };
445    UBool hasPinyin = FALSE;
446
447    LocalPointer<UVector> bucketList(new UVector(errorCode));
448    if (bucketList.isNull()) {
449        errorCode = U_MEMORY_ALLOCATION_ERROR;
450        return NULL;
451    }
452    bucketList->setDeleter(uprv_deleteUObject);
453
454    // underflow bucket
455    Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
456    if (bucket == NULL) {
457        errorCode = U_MEMORY_ALLOCATION_ERROR;
458        return NULL;
459    }
460    bucketList->addElement(bucket, errorCode);
461    if (U_FAILURE(errorCode)) { return NULL; }
462
463    UnicodeString temp;
464
465    // fix up the list, adding underflow, additions, overflow
466    // Insert inflow labels as needed.
467    int32_t scriptIndex = -1;
468    const UnicodeString *scriptUpperBoundary = &emptyString_;
469    for (int32_t i = 0; i < indexCharacters.size(); ++i) {
470        UnicodeString &current = *getString(indexCharacters, i);
471        if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
472            // We crossed the script boundary into a new script.
473            const UnicodeString &inflowBoundary = *scriptUpperBoundary;
474            UBool skippedScript = FALSE;
475            for (;;) {
476                scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
477                if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
478                    break;
479                }
480                skippedScript = TRUE;
481            }
482            if (skippedScript && bucketList->size() > 1) {
483                // We are skipping one or more scripts,
484                // and we are not just getting out of the underflow label.
485                bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
486                if (bucket == NULL) {
487                    errorCode = U_MEMORY_ALLOCATION_ERROR;
488                    return NULL;
489                }
490                bucketList->addElement(bucket, errorCode);
491            }
492        }
493        // Add a bucket with the current label.
494        bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
495        if (bucket == NULL) {
496            errorCode = U_MEMORY_ALLOCATION_ERROR;
497            return NULL;
498        }
499        bucketList->addElement(bucket, errorCode);
500        // Remember ASCII and Pinyin buckets for Pinyin redirects.
501        UChar c;
502        if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) {  // A-Z
503            asciiBuckets[c - 0x41] = bucket;
504        } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
505                0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
506            pinyinBuckets[c - 0x41] = bucket;
507            hasPinyin = TRUE;
508        }
509        // Check for multiple primary weights.
510        if (!current.startsWith(BASE, BASE_LENGTH) &&
511                hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current,
512                                          ces, errorCode) &&
513                current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
514            // "AE-ligature" or "Sch" etc.
515            for (int32_t i = bucketList->size() - 2;; --i) {
516                Bucket *singleBucket = getBucket(*bucketList, i);
517                if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
518                    // There is no single-character bucket since the last
519                    // underflow or inflow label.
520                    break;
521                }
522                if (singleBucket->displayBucket_ == NULL &&
523                        !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop,
524                                                   singleBucket->lowerBoundary_,
525                                                   ces, errorCode)) {
526                    // Add an invisible bucket that redirects strings greater than the expansion
527                    // to the previous single-character bucket.
528                    // For example, after ... Q R S Sch we add Sch\uFFFF->S
529                    // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
530                    bucket = new Bucket(emptyString_,
531                        UnicodeString(current).append((UChar)0xFFFF),
532                        U_ALPHAINDEX_NORMAL);
533                    if (bucket == NULL) {
534                        errorCode = U_MEMORY_ALLOCATION_ERROR;
535                        return NULL;
536                    }
537                    bucket->displayBucket_ = singleBucket;
538                    bucketList->addElement(bucket, errorCode);
539                    hasInvisibleBuckets = TRUE;
540                    break;
541                }
542            }
543        }
544    }
545    if (U_FAILURE(errorCode)) { return NULL; }
546    if (bucketList->size() == 1) {
547        // No real labels, show only the underflow label.
548        BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
549        if (bl == NULL) {
550            errorCode = U_MEMORY_ALLOCATION_ERROR;
551            return NULL;
552        }
553        bucketList.orphan();
554        return bl;
555    }
556    // overflow bucket
557    bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
558    if (bucket == NULL) {
559        errorCode = U_MEMORY_ALLOCATION_ERROR;
560        return NULL;
561    }
562    bucketList->addElement(bucket, errorCode); // final
563
564    if (hasPinyin) {
565        // Redirect Pinyin buckets.
566        Bucket *asciiBucket = NULL;
567        for (int32_t i = 0; i < 26; ++i) {
568            if (asciiBuckets[i] != NULL) {
569                asciiBucket = asciiBuckets[i];
570            }
571            if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
572                pinyinBuckets[i]->displayBucket_ = asciiBucket;
573                hasInvisibleBuckets = TRUE;
574            }
575        }
576    }
577
578    if (U_FAILURE(errorCode)) { return NULL; }
579    if (!hasInvisibleBuckets) {
580        BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
581        if (bl == NULL) {
582            errorCode = U_MEMORY_ALLOCATION_ERROR;
583            return NULL;
584        }
585        bucketList.orphan();
586        return bl;
587    }
588    // Merge inflow buckets that are visually adjacent.
589    // Iterate backwards: Merge inflow into overflow rather than the other way around.
590    int32_t i = bucketList->size() - 1;
591    Bucket *nextBucket = getBucket(*bucketList, i);
592    while (--i > 0) {
593        bucket = getBucket(*bucketList, i);
594        if (bucket->displayBucket_ != NULL) {
595            continue;  // skip invisible buckets
596        }
597        if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
598            if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
599                bucket->displayBucket_ = nextBucket;
600                continue;
601            }
602        }
603        nextBucket = bucket;
604    }
605
606    LocalPointer<UVector> publicBucketList(new UVector(errorCode));
607    if (bucketList.isNull()) {
608        errorCode = U_MEMORY_ALLOCATION_ERROR;
609        return NULL;
610    }
611    // Do not call publicBucketList->setDeleter():
612    // This vector shares its objects with the bucketList.
613    for (int32_t i = 0; i < bucketList->size(); ++i) {
614        bucket = getBucket(*bucketList, i);
615        if (bucket->displayBucket_ == NULL) {
616            publicBucketList->addElement(bucket, errorCode);
617        }
618    }
619    if (U_FAILURE(errorCode)) { return NULL; }
620    BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
621    if (bl == NULL) {
622        errorCode = U_MEMORY_ALLOCATION_ERROR;
623        return NULL;
624    }
625    bucketList.orphan();
626    publicBucketList.orphan();
627    return bl;
628}
629
630/**
631 * Creates an index, and buckets and sorts the list of records into the index.
632 */
633void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
634    if (U_FAILURE(errorCode) || buckets_ != NULL) {
635        return;
636    }
637    buckets_ = createBucketList(errorCode);
638    if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
639        return;
640    }
641
642    // Sort the records by name.
643    // Stable sort preserves input order of collation duplicates.
644    inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
645
646    // Now, we traverse all of the input, which is now sorted.
647    // If the item doesn't go in the current bucket, we find the next bucket that contains it.
648    // This makes the process order n*log(n), since we just sort the list and then do a linear process.
649    // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
650    // we need to improve it for that case.
651
652    Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
653    int32_t bucketIndex = 1;
654    Bucket *nextBucket;
655    const UnicodeString *upperBoundary;
656    if (bucketIndex < buckets_->bucketList_->size()) {
657        nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
658        upperBoundary = &nextBucket->lowerBoundary_;
659    } else {
660        nextBucket = NULL;
661        upperBoundary = NULL;
662    }
663    for (int32_t i = 0; i < inputList_->size(); ++i) {
664        Record *r = getRecord(*inputList_, i);
665        // if the current bucket isn't the right one, find the one that is
666        // We have a special flag for the last bucket so that we don't look any further
667        while (upperBoundary != NULL &&
668                collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
669            currentBucket = nextBucket;
670            // now reset the boundary that we compare against
671            if (bucketIndex < buckets_->bucketList_->size()) {
672                nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
673                upperBoundary = &nextBucket->lowerBoundary_;
674            } else {
675                upperBoundary = NULL;
676            }
677        }
678        // now put the record into the bucket.
679        Bucket *bucket = currentBucket;
680        if (bucket->displayBucket_ != NULL) {
681            bucket = bucket->displayBucket_;
682        }
683        if (bucket->records_ == NULL) {
684            bucket->records_ = new UVector(errorCode);
685            if (bucket->records_ == NULL) {
686                errorCode = U_MEMORY_ALLOCATION_ERROR;
687                return;
688            }
689        }
690        bucket->records_->addElement(r, errorCode);
691    }
692}
693
694void AlphabeticIndex::clearBuckets() {
695    if (buckets_ != NULL) {
696        delete buckets_;
697        buckets_ = NULL;
698        internalResetBucketIterator();
699    }
700}
701
702void AlphabeticIndex::internalResetBucketIterator() {
703    labelsIterIndex_ = -1;
704    currentBucket_ = NULL;
705}
706
707
708void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
709    LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
710    if (U_FAILURE(status)) {
711        return;
712    }
713
714    UnicodeSet exemplars;
715    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
716    if (U_SUCCESS(status)) {
717        initialLabels_->addAll(exemplars);
718        return;
719    }
720    status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR
721
722    // The locale data did not include explicit Index characters.
723    // Synthesize a set of them from the locale's standard exemplar characters.
724    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
725    if (U_FAILURE(status)) {
726        return;
727    }
728
729    // question: should we add auxiliary exemplars?
730    if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
731        exemplars.add(0x61, 0x7A);
732    }
733    if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
734        // cut down to small list
735        exemplars.remove(0xAC00, 0xD7A3).
736            add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
737            add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
738            add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
739            add(0xD30C).add(0xD558);
740    }
741    if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
742        // cut down to small list
743        // make use of the fact that Ethiopic is allocated in 8's, where
744        // the base is 0 mod 8.
745        UnicodeSet ethiopic(
746            UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
747        UnicodeSetIterator it(ethiopic);
748        while (it.next() && !it.isString()) {
749            if ((it.getCodepoint() & 0x7) != 0) {
750                exemplars.remove(it.getCodepoint());
751            }
752        }
753    }
754
755    // Upper-case any that aren't already so.
756    //   (We only do this for synthesized index characters.)
757    UnicodeSetIterator it(exemplars);
758    UnicodeString upperC;
759    while (it.next()) {
760        const UnicodeString &exemplarC = it.getString();
761        upperC = exemplarC;
762        upperC.toUpper(locale);
763        initialLabels_->add(upperC);
764    }
765}
766
767UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
768    UnicodeSet contractions;
769    collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode);
770    if (U_FAILURE(errorCode) || contractions.isEmpty()) { return FALSE; }
771    initialLabels_->addAll(contractions);
772    UnicodeSetIterator iter(contractions);
773    while (iter.next()) {
774        const UnicodeString &s = iter.getString();
775        U_ASSERT (s.startsWith(BASE, BASE_LENGTH));
776        UChar c = s.charAt(s.length() - 1);
777        if (0x41 <= c && c <= 0x5A) {  // A-Z
778            // There are Pinyin labels, add ASCII A-Z labels as well.
779            initialLabels_->add(0x41, 0x5A);  // A-Z
780            break;
781        }
782    }
783    return TRUE;
784}
785
786
787/*
788 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
789 */
790static const UChar CGJ = 0x034F;
791UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
792    UnicodeString result;
793    if (item.length() == 0) {
794        return result;
795    }
796    int32_t i = 0;
797    for (;;) {
798        UChar32  cp = item.char32At(i);
799        result.append(cp);
800        i = item.moveIndex32(i, 1);
801        if (i >= item.length()) {
802            break;
803        }
804        result.append(CGJ);
805    }
806    return result;
807}
808
809
810UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
811    return FALSE;
812}
813
814
815UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
816    return FALSE;
817}
818
819
820const RuleBasedCollator &AlphabeticIndex::getCollator() const {
821    return *collator_;
822}
823
824
825const UnicodeString &AlphabeticIndex::getInflowLabel() const {
826    return inflowLabel_;
827}
828
829const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
830    return overflowLabel_;
831}
832
833
834const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
835    return underflowLabel_;
836}
837
838
839AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
840    inflowLabel_ = label;
841    clearBuckets();
842    return *this;
843}
844
845
846AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
847    overflowLabel_ = label;
848    clearBuckets();
849    return *this;
850}
851
852
853AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
854    underflowLabel_ = label;
855    clearBuckets();
856    return *this;
857}
858
859
860int32_t AlphabeticIndex::getMaxLabelCount() const {
861    return maxLabelCount_;
862}
863
864
865AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
866    if (U_FAILURE(status)) {
867        return *this;
868    }
869    if (maxLabelCount <= 0) {
870        status = U_ILLEGAL_ARGUMENT_ERROR;
871        return *this;
872    }
873    maxLabelCount_ = maxLabelCount;
874    clearBuckets();
875    return *this;
876}
877
878
879//
880//  init() - Common code for constructors.
881//
882
883void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
884    if (U_FAILURE(status)) { return; }
885    if (locale == NULL && collator_ == NULL) {
886        status = U_ILLEGAL_ARGUMENT_ERROR;
887        return;
888    }
889
890    initialLabels_         = new UnicodeSet();
891    if (initialLabels_ == NULL) {
892        status = U_MEMORY_ALLOCATION_ERROR;
893        return;
894    }
895
896    inflowLabel_.setTo((UChar)0x2026);    // Ellipsis
897    overflowLabel_ = inflowLabel_;
898    underflowLabel_ = inflowLabel_;
899
900    if (collator_ == NULL) {
901        Collator *coll = Collator::createInstance(*locale, status);
902        if (U_FAILURE(status)) {
903            delete coll;
904            return;
905        }
906        if (coll == NULL) {
907            status = U_MEMORY_ALLOCATION_ERROR;
908            return;
909        }
910        collator_ = dynamic_cast<RuleBasedCollator *>(coll);
911        if (collator_ == NULL) {
912            delete coll;
913            status = U_UNSUPPORTED_ERROR;
914            return;
915        }
916    }
917    collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());
918    if (collatorPrimaryOnly_ == NULL) {
919        status = U_MEMORY_ALLOCATION_ERROR;
920        return;
921    }
922    collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
923    firstCharsInScripts_ = firstStringsInScript(status);
924    if (U_FAILURE(status)) { return; }
925    firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
926    // Guard against a degenerate collator where
927    // some script boundary strings are primary ignorable.
928    for (;;) {
929        if (U_FAILURE(status)) { return; }
930        if (firstCharsInScripts_->isEmpty()) {
931            // AlphabeticIndex requires some non-ignorable script boundary strings.
932            status = U_ILLEGAL_ARGUMENT_ERROR;
933            return;
934        }
935        if (collatorPrimaryOnly_->compare(
936                *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
937                emptyString_, status) == UCOL_EQUAL) {
938            firstCharsInScripts_->removeElementAt(0);
939        } else {
940            break;
941        }
942    }
943
944    // Chinese index characters, which are specific to each of the several Chinese tailorings,
945    // take precedence over the single locale data exemplar set per language.
946    if (!addChineseIndexCharacters(status) && locale != NULL) {
947        addIndexExemplars(*locale, status);
948    }
949}
950
951
952//
953//  Comparison function for UVector<UnicodeString *> sorting with a collator.
954//
955static int32_t U_CALLCONV
956collatorComparator(const void *context, const void *left, const void *right) {
957    const UElement *leftElement = static_cast<const UElement *>(left);
958    const UElement *rightElement = static_cast<const UElement *>(right);
959    const UnicodeString *leftString  = static_cast<const UnicodeString *>(leftElement->pointer);
960    const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
961
962    if (leftString == rightString) {
963        // Catches case where both are NULL
964        return 0;
965    }
966    if (leftString == NULL) {
967        return 1;
968    };
969    if (rightString == NULL) {
970        return -1;
971    }
972    const Collator *col = static_cast<const Collator *>(context);
973    UErrorCode errorCode = U_ZERO_ERROR;
974    return col->compare(*leftString, *rightString, errorCode);
975}
976
977//
978//  Comparison function for UVector<Record *> sorting with a collator.
979//
980static int32_t U_CALLCONV
981recordCompareFn(const void *context, const void *left, const void *right) {
982    const UElement *leftElement = static_cast<const UElement *>(left);
983    const UElement *rightElement = static_cast<const UElement *>(right);
984    const AlphabeticIndex::Record *leftRec  = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
985    const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
986    const Collator *col = static_cast<const Collator *>(context);
987    UErrorCode errorCode = U_ZERO_ERROR;
988    return col->compare(leftRec->name_, rightRec->name_, errorCode);
989}
990
991UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
992    if (U_FAILURE(status)) {
993        return NULL;
994    }
995    LocalPointer<UVector> dest(new UVector(status));
996    if (dest.isNull()) {
997        status = U_MEMORY_ALLOCATION_ERROR;
998        return NULL;
999    }
1000    dest->setDeleter(uprv_deleteUObject);
1001    // Fetch the script-first-primary contractions which are defined in the root collator.
1002    // They all start with U+FDD1.
1003    UnicodeSet set;
1004    collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status);
1005    if (U_FAILURE(status)) {
1006        return NULL;
1007    }
1008    if (set.isEmpty()) {
1009        status = U_UNSUPPORTED_ERROR;
1010        return NULL;
1011    }
1012    UnicodeSetIterator iter(set);
1013    while (iter.next()) {
1014        const UnicodeString &boundary = iter.getString();
1015        uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1));
1016        if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) {
1017            // Ignore boundaries for the special reordering groups.
1018            // Take only those for "real scripts" (where the sample character is a Letter,
1019            // and the one for unassigned implicit weights (Cn).
1020            continue;
1021        }
1022        UnicodeString *s = new UnicodeString(boundary);
1023        if (s == NULL) {
1024            status = U_MEMORY_ALLOCATION_ERROR;
1025            return NULL;
1026        }
1027        dest->addElement(s, status);
1028    }
1029    return dest.orphan();
1030}
1031
1032
1033namespace {
1034
1035/**
1036 * Returns true if one index character string is "better" than the other.
1037 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
1038 * better, and otherwise binary-less-than is better.
1039 */
1040UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
1041                                const UnicodeString &one, const UnicodeString &other) {
1042    // This is called with primary-equal strings, but never with one.equals(other).
1043    UErrorCode status = U_ZERO_ERROR;
1044    UnicodeString n1 = nfkdNormalizer.normalize(one, status);
1045    UnicodeString n2 = nfkdNormalizer.normalize(other, status);
1046    if (U_FAILURE(status)) { return FALSE; }
1047    int32_t result = n1.countChar32() - n2.countChar32();
1048    if (result != 0) {
1049        return result < 0;
1050    }
1051    result = n1.compareCodePointOrder(n2);
1052    if (result != 0) {
1053        return result < 0;
1054    }
1055    return one.compareCodePointOrder(other) < 0;
1056}
1057
1058}  // namespace
1059
1060//
1061//  Constructor & Destructor for AlphabeticIndex::Record
1062//
1063//     Records are internal only, instances are not directly surfaced in the public API.
1064//     This class is mostly struct-like, with all public fields.
1065
1066AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
1067        : name_(name), data_(data) {}
1068
1069AlphabeticIndex::Record::~Record() {
1070}
1071
1072
1073AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
1074    if (U_FAILURE(status)) {
1075        return *this;
1076    }
1077    if (inputList_ == NULL) {
1078        inputList_ = new UVector(status);
1079        if (inputList_ == NULL) {
1080            status = U_MEMORY_ALLOCATION_ERROR;
1081            return *this;
1082        }
1083        inputList_->setDeleter(alphaIndex_deleteRecord);
1084    }
1085    Record *r = new Record(name, data);
1086    if (r == NULL) {
1087        status = U_MEMORY_ALLOCATION_ERROR;
1088        return *this;
1089    }
1090    inputList_->addElement(r, status);
1091    clearBuckets();
1092    //std::string ss;
1093    //std::string ss2;
1094    //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1095    //             "   sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1096    return *this;
1097}
1098
1099
1100AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
1101    if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
1102        inputList_->removeAllElements();
1103        clearBuckets();
1104    }
1105    return *this;
1106}
1107
1108int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
1109    initBuckets(status);
1110    if (U_FAILURE(status)) {
1111        return 0;
1112    }
1113    return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
1114}
1115
1116
1117int32_t AlphabeticIndex::getBucketIndex() const {
1118    return labelsIterIndex_;
1119}
1120
1121
1122UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
1123    if (U_FAILURE(status)) {
1124        return FALSE;
1125    }
1126    if (buckets_ == NULL && currentBucket_ != NULL) {
1127        status = U_ENUM_OUT_OF_SYNC_ERROR;
1128        return FALSE;
1129    }
1130    initBuckets(status);
1131    if (U_FAILURE(status)) {
1132        return FALSE;
1133    }
1134    ++labelsIterIndex_;
1135    if (labelsIterIndex_ >= buckets_->getBucketCount()) {
1136        labelsIterIndex_ = buckets_->getBucketCount();
1137        return FALSE;
1138    }
1139    currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
1140    resetRecordIterator();
1141    return TRUE;
1142}
1143
1144const UnicodeString &AlphabeticIndex::getBucketLabel() const {
1145    if (currentBucket_ != NULL) {
1146        return currentBucket_->label_;
1147    } else {
1148        return emptyString_;
1149    }
1150}
1151
1152
1153UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
1154    if (currentBucket_ != NULL) {
1155        return currentBucket_->labelType_;
1156    } else {
1157        return U_ALPHAINDEX_NORMAL;
1158    }
1159}
1160
1161
1162int32_t AlphabeticIndex::getBucketRecordCount() const {
1163    if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
1164        return currentBucket_->records_->size();
1165    } else {
1166        return 0;
1167    }
1168}
1169
1170
1171AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
1172    if (U_FAILURE(status)) {
1173        return *this;
1174    }
1175    internalResetBucketIterator();
1176    return *this;
1177}
1178
1179
1180UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
1181    if (U_FAILURE(status)) {
1182        return FALSE;
1183    }
1184    if (currentBucket_ == NULL) {
1185        // We are trying to iterate over the items in a bucket, but there is no
1186        // current bucket from the enumeration of buckets.
1187        status = U_INVALID_STATE_ERROR;
1188        return FALSE;
1189    }
1190    if (buckets_ == NULL) {
1191        status = U_ENUM_OUT_OF_SYNC_ERROR;
1192        return FALSE;
1193    }
1194    if (currentBucket_->records_ == NULL) {
1195        return FALSE;
1196    }
1197    ++itemsIterIndex_;
1198    if (itemsIterIndex_ >= currentBucket_->records_->size()) {
1199        itemsIterIndex_  = currentBucket_->records_->size();
1200        return FALSE;
1201    }
1202    return TRUE;
1203}
1204
1205
1206const UnicodeString &AlphabeticIndex::getRecordName() const {
1207    const UnicodeString *retStr = &emptyString_;
1208    if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1209        itemsIterIndex_ >= 0 &&
1210        itemsIterIndex_ < currentBucket_->records_->size()) {
1211            Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1212            retStr = &item->name_;
1213    }
1214    return *retStr;
1215}
1216
1217const void *AlphabeticIndex::getRecordData() const {
1218    const void *retPtr = NULL;
1219    if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1220        itemsIterIndex_ >= 0 &&
1221        itemsIterIndex_ < currentBucket_->records_->size()) {
1222            Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1223            retPtr = item->data_;
1224    }
1225    return retPtr;
1226}
1227
1228
1229AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
1230    itemsIterIndex_ = -1;
1231    return *this;
1232}
1233
1234
1235
1236AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
1237                                const UnicodeString &lowerBoundary,
1238                                UAlphabeticIndexLabelType type)
1239        : label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
1240          displayBucket_(NULL), displayIndex_(-1),
1241          records_(NULL) {
1242}
1243
1244
1245AlphabeticIndex::Bucket::~Bucket() {
1246    delete records_;
1247}
1248
1249U_NAMESPACE_END
1250
1251#endif  // !UCONFIG_NO_COLLATION
1252