1/* GENERATED SOURCE. DO NOT MODIFY. */
2/*
3 *******************************************************************************
4 * Copyright (C) 2008-2016, Google Inc, International Business Machines Corporation
5 * and others. All Rights Reserved.
6 *******************************************************************************
7 */
8package android.icu.text;
9
10import java.util.ArrayList;
11import java.util.Collections;
12import java.util.Comparator;
13import java.util.Iterator;
14import java.util.List;
15import java.util.Locale;
16
17import android.icu.lang.UCharacter;
18import android.icu.text.AlphabeticIndex.Bucket;
19import android.icu.text.AlphabeticIndex.Bucket.LabelType;
20import android.icu.util.LocaleData;
21import android.icu.util.ULocale;
22
23/**
24 * AlphabeticIndex supports the creation of a UI index appropriate for a given language.
25 * It can support either direct use, or use with a client that doesn't support localized collation.
26 * The following is an example of what an index might look like in a UI:
27 *
28 * <pre>
29 *  <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z  ...</b>
30 *
31 *  <b>A</b>
32 *     Addison
33 *     Albertson
34 *     Azensky
35 *  <b>B</b>
36 *     Baecker
37 *  ...
38 * </pre>
39 *
40 * The class can generate a list of labels for use as a UI "index", that is, a list of
41 * clickable characters (or character sequences) that allow the user to see a segment
42 * (bucket) of a larger "target" list. That is, each label corresponds to a bucket in
43 * the target list, where everything in the bucket is greater than or equal to the character
44 * (according to the locale's collation). Strings can be added to the index;
45 * they will be in sorted order in the right bucket.
46 * <p>
47 * The class also supports having buckets for strings before the first (underflow),
48 * after the last (overflow), and between scripts (inflow). For example, if the index
49 * is constructed with labels for Russian and English, Greek characters would fall
50 * into an inflow bucket between the other two scripts.
51 *
52 * <p><em>Note:</em> If you expect to have a lot of ASCII or Latin characters
53 * as well as characters from the user's language,
54 * then it is a good idea to call addLabels(ULocale.English).
55 *
56 * <h2>Direct Use</h2>
57 * <p>The following shows an example of building an index directly.
58 *  The "show..." methods below are just to illustrate usage.
59 *
60 * <pre>
61 * // Create a simple index where the values for the strings are Integers, and add the strings
62 *
63 * AlphabeticIndex&lt;Integer&gt; index = new AlphabeticIndex&lt;Integer&gt;(desiredLocale).addLabels(additionalLocale);
64 * int counter = 0;
65 * for (String item : test) {
66 *     index.addRecord(item, counter++);
67 * }
68 * ...
69 * // Show index at top. We could skip or gray out empty buckets
70 *
71 * for (AlphabeticIndex.Bucket&lt;Integer&gt; bucket : index) {
72 *     if (showAll || bucket.size() != 0) {
73 *         showLabelAtTop(UI, bucket.getLabel());
74 *     }
75 * }
76 *  ...
77 * // Show the buckets with their contents, skipping empty buckets
78 *
79 * for (AlphabeticIndex.Bucket&lt;Integer&gt; bucket : index) {
80 *     if (bucket.size() != 0) {
81 *         showLabelInList(UI, bucket.getLabel());
82 *         for (AlphabeticIndex.Record&lt;Integer&gt; item : bucket) {
83 *             showIndexedItem(UI, item.getName(), item.getData());
84 *         }
85 * </pre>
86 *
87 * The caller can build different UIs using this class.
88 * For example, an index character could be omitted or grayed-out
89 * if its bucket is empty. Small buckets could also be combined based on size, such as:
90 *
91 * <pre>
92 * <b>... A-F G-N O-Z ...</b>
93 * </pre>
94 *
95 * <h2>Client Support</h2>
96 * <p>Callers can also use the {@link AlphabeticIndex.ImmutableIndex}, or the AlphabeticIndex itself,
97 * to support sorting on a client that doesn't support AlphabeticIndex functionality.
98 *
99 * <p>The ImmutableIndex is both immutable and thread-safe.
100 * The corresponding AlphabeticIndex methods are not thread-safe because
101 * they "lazily" build the index buckets.
102 * <ul>
103 * <li>ImmutableIndex.getBucket(index) provides random access to all
104 *     buckets and their labels and label types.
105 * <li>AlphabeticIndex.getBucketLabels() or the bucket iterator on either class
106 *     can be used to get a list of the labels,
107 *     such as "...", "A", "B",..., and send that list to the client.
108 * <li>When the client has a new name, it sends that name to the server.
109 * The server needs to call the following methods,
110 * and communicate the bucketIndex and collationKey back to the client.
111 *
112 * <pre>
113 * int bucketIndex = index.getBucketIndex(name);
114 * String label = immutableIndex.getBucket(bucketIndex).getLabel();  // optional
115 * RawCollationKey collationKey = collator.getRawCollationKey(name, null);
116 * </pre>
117 *
118 * <li>The client would put the name (and associated information) into its bucket for bucketIndex. The collationKey is a
119 * sequence of bytes that can be compared with a binary compare, and produce the right localized result.</li>
120 * </ul>
121 *
122 * @author Mark Davis
123 */
124public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {
125    /**
126     * Prefix string for Chinese index buckets.
127     * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
128     */
129    private static final String BASE = "\uFDD0";
130
131    private static final char CGJ = '\u034F';
132
133    private static final Comparator<String> binaryCmp = new UTF16.StringComparator(true, false, 0);
134
135    private final RuleBasedCollator collatorOriginal;
136    private final RuleBasedCollator collatorPrimaryOnly;
137    private RuleBasedCollator collatorExternal;
138
139    // Comparator for records, so that the Record class can be static.
140    private final Comparator<Record<V>> recordComparator = new Comparator<Record<V>>() {
141        public int compare(Record<V> o1, Record<V> o2) {
142            return collatorOriginal.compare(o1.name, o2.name);
143        }
144    };
145
146    private final List<String> firstCharsInScripts;
147
148    // We accumulate these as we build up the input parameters
149    private final UnicodeSet initialLabels = new UnicodeSet();
150    private List<Record<V>> inputList;
151
152    // Lazy evaluated: null means that we have not built yet.
153    private BucketList<V> buckets;
154
155    private String overflowLabel = "\u2026";
156    private String underflowLabel = "\u2026";
157    private String inflowLabel = "\u2026";
158
159    /**
160     * Immutable, thread-safe version of {@link AlphabeticIndex}.
161     * This class provides thread-safe methods for bucketing,
162     * and random access to buckets and their properties,
163     * but does not offer adding records to the index.
164     *
165     * @param <V> The Record value type is unused. It can be omitted for this class
166     * if it was omitted for the AlphabeticIndex that built it.
167     */
168    public static final class ImmutableIndex<V> implements Iterable<Bucket<V>> {
169        private final BucketList<V> buckets;
170        private final Collator collatorPrimaryOnly;
171
172        private ImmutableIndex(BucketList<V> bucketList, Collator collatorPrimaryOnly) {
173            this.buckets = bucketList;
174            this.collatorPrimaryOnly = collatorPrimaryOnly;
175        }
176
177        /**
178         * Returns the number of index buckets and labels, including underflow/inflow/overflow.
179         *
180         * @return the number of index buckets
181         */
182        public int getBucketCount() {
183            return buckets.getBucketCount();
184        }
185
186        /**
187         * Finds the index bucket for the given name and returns the number of that bucket.
188         * Use {@link #getBucket(int)} to get the bucket's properties.
189         *
190         * @param name the string to be sorted into an index bucket
191         * @return the bucket number for the name
192         */
193        public int getBucketIndex(CharSequence name) {
194            return buckets.getBucketIndex(name, collatorPrimaryOnly);
195        }
196
197        /**
198         * Returns the index-th bucket. Returns null if the index is out of range.
199         *
200         * @param index bucket number
201         * @return the index-th bucket
202         */
203        public Bucket<V> getBucket(int index) {
204            if (0 <= index && index < buckets.getBucketCount()) {
205                return buckets.immutableVisibleList.get(index);
206            } else {
207                return null;
208            }
209        }
210
211        /**
212         * {@inheritDoc}
213         */
214        public Iterator<Bucket<V>> iterator() {
215            return buckets.iterator();
216        }
217    }
218
219    /**
220     * Create the index object.
221     *
222     * @param locale
223     *            The locale for the index.
224     */
225    public AlphabeticIndex(ULocale locale) {
226        this(locale, null);
227    }
228
229    /**
230     * Create the index object.
231     *
232     * @param locale
233     *            The locale for the index.
234     */
235    public AlphabeticIndex(Locale locale) {
236        this(ULocale.forLocale(locale), null);
237    }
238
239    /**
240     * Create an AlphabeticIndex that uses a specific collator.
241     *
242     * <p>The index will be created with no labels; the addLabels() function must be called
243     * after creation to add the desired labels to the index.
244     *
245     * <p>The index will work directly with the supplied collator. If the caller will need to
246     * continue working with the collator it should be cloned first, so that the
247     * collator provided to the AlphabeticIndex remains unchanged after creation of the index.
248     *
249     * @param collator The collator to use to order the contents of this index.
250     */
251    public AlphabeticIndex(RuleBasedCollator collator) {
252        this(null, collator);
253    }
254
255    /**
256     * Internal constructor containing implementation used by public constructors.
257     */
258    private AlphabeticIndex(ULocale locale, RuleBasedCollator collator) {
259        collatorOriginal = collator != null ? collator : (RuleBasedCollator) Collator.getInstance(locale);
260        try {
261            collatorPrimaryOnly = collatorOriginal.cloneAsThawed();
262        } catch (Exception e) {
263            // should never happen
264            throw new IllegalStateException("Collator cannot be cloned", e);
265        }
266        collatorPrimaryOnly.setStrength(Collator.PRIMARY);
267        collatorPrimaryOnly.freeze();
268
269        firstCharsInScripts = getFirstCharactersInScripts();
270        Collections.sort(firstCharsInScripts, collatorPrimaryOnly);
271        // Guard against a degenerate collator where
272        // some script boundary strings are primary ignorable.
273        for (;;) {
274            if (firstCharsInScripts.isEmpty()) {
275                throw new IllegalArgumentException(
276                        "AlphabeticIndex requires some non-ignorable script boundary strings");
277            }
278            if (collatorPrimaryOnly.compare(firstCharsInScripts.get(0), "") == 0) {
279                firstCharsInScripts.remove(0);
280            } else {
281                break;
282            }
283        }
284
285        // Chinese index characters, which are specific to each of the several Chinese tailorings,
286        // take precedence over the single locale data exemplar set per language.
287        if (!addChineseIndexCharacters() && locale != null) {
288            addIndexExemplars(locale);
289        }
290    }
291
292    /**
293     * Add more index characters (aside from what are in the locale)
294     * @param additions additional characters to add to the index, such as A-Z.
295     * @return this, for chaining
296     */
297    public AlphabeticIndex<V> addLabels(UnicodeSet additions) {
298        initialLabels.addAll(additions);
299        buckets = null;
300        return this;
301    }
302
303    /**
304     * Add more index characters (aside from what are in the locale)
305     * @param additions additional characters to add to the index, such as those in Swedish.
306     * @return this, for chaining
307     */
308    public AlphabeticIndex<V> addLabels(ULocale... additions) {
309        for (ULocale addition : additions) {
310            addIndexExemplars(addition);
311        }
312        buckets = null;
313        return this;
314    }
315
316    /**
317     * Add more index characters (aside from what are in the locale)
318     * @param additions additional characters to add to the index, such as those in Swedish.
319     * @return this, for chaining
320     */
321    public AlphabeticIndex<V> addLabels(Locale... additions) {
322        for (Locale addition : additions) {
323            addIndexExemplars(ULocale.forLocale(addition));
324        }
325        buckets = null;
326        return this;
327    }
328
329    /**
330     * Set the overflow label
331     * @param overflowLabel see class description
332     * @return this, for chaining
333     */
334    public AlphabeticIndex<V> setOverflowLabel(String overflowLabel) {
335        this.overflowLabel = overflowLabel;
336        buckets = null;
337        return this;
338    }
339
340    /**
341     * Get the default label used in the IndexCharacters' locale for underflow, eg the last item in: X Y Z ...
342     *
343     * @return underflow label
344     */
345    public String getUnderflowLabel() {
346        return underflowLabel; // TODO get localized version
347    }
348
349
350    /**
351     * Set the underflowLabel label
352     * @param underflowLabel see class description
353     * @return this, for chaining
354     */
355    public AlphabeticIndex<V> setUnderflowLabel(String underflowLabel) {
356        this.underflowLabel = underflowLabel;
357        buckets = null;
358        return this;
359    }
360
361    /**
362     * Get the default label used in the IndexCharacters' locale for overflow, eg the first item in: ... A B C
363     *
364     * @return overflow label
365     */
366    public String getOverflowLabel() {
367        return overflowLabel; // TODO get localized version
368    }
369
370
371    /**
372     * Set the inflowLabel label
373     * @param inflowLabel see class description
374     * @return this, for chaining
375     */
376    public AlphabeticIndex<V> setInflowLabel(String inflowLabel) {
377        this.inflowLabel = inflowLabel;
378        buckets = null;
379        return this;
380    }
381
382    /**
383     * Get the default label used for abbreviated buckets <i>between</i> other labels. For example, consider the labels
384     * for Latin and Greek are used: X Y Z ... &#x0391; &#x0392; &#x0393;.
385     *
386     * @return inflow label
387     */
388    public String getInflowLabel() {
389        return inflowLabel; // TODO get localized version
390    }
391
392
393    /**
394     * Get the limit on the number of labels in the index. The number of buckets can be slightly larger: see getBucketCount().
395     *
396     * @return maxLabelCount maximum number of labels.
397     */
398    public int getMaxLabelCount() {
399        return maxLabelCount;
400    }
401
402    /**
403     * Set a limit on the number of labels in the index. The number of buckets can be slightly larger: see
404     * getBucketCount().
405     *
406     * @param maxLabelCount Set the maximum number of labels. Currently, if the number is exceeded, then every
407     *         nth item is removed to bring the count down. A more sophisticated mechanism may be available in the
408     *         future.
409     * @return this, for chaining
410     */
411    public AlphabeticIndex<V> setMaxLabelCount(int maxLabelCount) {
412        this.maxLabelCount = maxLabelCount;
413        buckets = null;
414        return this;
415    }
416
417    /**
418     * Determine the best labels to use. This is based on the exemplars, but we also process to make sure that they are unique,
419     * and sort differently, and that the overall list is small enough.
420     */
421    private List<String> initLabels() {
422        Normalizer2 nfkdNormalizer = Normalizer2.getNFKDInstance();
423        List<String> indexCharacters = new ArrayList<String>();
424
425        String firstScriptBoundary = firstCharsInScripts.get(0);
426        String overflowBoundary = firstCharsInScripts.get(firstCharsInScripts.size() - 1);
427
428        // We make a sorted array of elements.
429        // Some of the input may be redundant.
430        // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
431        // We filter out those cases.
432        for (String item : initialLabels) {
433            boolean checkDistinct;
434            if (!UTF16.hasMoreCodePointsThan(item, 1)) {
435                checkDistinct = false;
436            } else if(item.charAt(item.length() - 1) == '*' &&
437                    item.charAt(item.length() - 2) != '*') {
438                // Use a label if it is marked with one trailing star,
439                // even if the label string sorts the same when all contractions are suppressed.
440                item = item.substring(0, item.length() - 1);
441                checkDistinct = false;
442            } else {
443                checkDistinct = true;
444            }
445            if (collatorPrimaryOnly.compare(item, firstScriptBoundary) < 0) {
446                // Ignore a primary-ignorable or non-alphabetic index character.
447            } else if (collatorPrimaryOnly.compare(item, overflowBoundary) >= 0) {
448                // Ignore an index character that will land in the overflow bucket.
449            } else if (checkDistinct && collatorPrimaryOnly.compare(item, separated(item)) == 0) {
450                // Ignore a multi-code point index character that does not sort distinctly
451                // from the sequence of its separate characters.
452            } else {
453                int insertionPoint = Collections.binarySearch(indexCharacters, item, collatorPrimaryOnly);
454                if (insertionPoint < 0) {
455                    indexCharacters.add(~insertionPoint, item);
456                } else {
457                    String itemAlreadyIn = indexCharacters.get(insertionPoint);
458                    if (isOneLabelBetterThanOther(nfkdNormalizer, item, itemAlreadyIn)) {
459                        indexCharacters.set(insertionPoint, item);
460                    }
461                }
462            }
463        }
464
465        // if the result is still too large, cut down to maxLabelCount elements, by removing every nth element
466
467        final int size = indexCharacters.size() - 1;
468        if (size > maxLabelCount) {
469            int count = 0;
470            int old = -1;
471            for (Iterator<String> it = indexCharacters.iterator(); it.hasNext();) {
472                ++count;
473                it.next();
474                final int bump = count * maxLabelCount / size;
475                if (bump == old) {
476                    it.remove();
477                } else {
478                    old = bump;
479                }
480            }
481        }
482
483        return indexCharacters;
484    }
485
486    private static String fixLabel(String current) {
487        if (!current.startsWith(BASE)) {
488            return current;
489        }
490        int rest = current.charAt(BASE.length());
491        if (0x2800 < rest && rest <= 0x28FF) { // stroke count
492            return (rest-0x2800) + "\u5283";
493        }
494        return current.substring(BASE.length());
495    }
496
497    /**
498     * This method is called to get the index exemplars. Normally these come from the locale directly,
499     * but if they aren't available, we have to synthesize them.
500     */
501    private void addIndexExemplars(ULocale locale) {
502        UnicodeSet exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_INDEX);
503        if (exemplars != null) {
504            initialLabels.addAll(exemplars);
505            return;
506        }
507
508        // The locale data did not include explicit Index characters.
509        // Synthesize a set of them from the locale's standard exemplar characters.
510        exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_STANDARD);
511
512        exemplars = exemplars.cloneAsThawed();
513        // question: should we add auxiliary exemplars?
514        if (exemplars.containsSome('a', 'z') || exemplars.size() == 0) {
515            exemplars.addAll('a', 'z');
516        }
517        if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
518            // cut down to small list
519            exemplars.remove(0xAC00, 0xD7A3).
520                add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
521                add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
522                add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
523                add(0xD30C).add(0xD558);
524        }
525        if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
526            // cut down to small list
527            // make use of the fact that Ethiopic is allocated in 8's, where
528            // the base is 0 mod 8.
529            UnicodeSet ethiopic = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
530            UnicodeSetIterator it = new UnicodeSetIterator(ethiopic);
531            while (it.next() && it.codepoint != UnicodeSetIterator.IS_STRING) {
532                if ((it.codepoint & 0x7) != 0) {
533                    exemplars.remove(it.codepoint);
534                }
535            }
536        }
537
538        // Upper-case any that aren't already so.
539        //   (We only do this for synthesized index characters.)
540        for (String item : exemplars) {
541            initialLabels.add(UCharacter.toUpperCase(locale, item));
542        }
543    }
544
545    /**
546     * Add Chinese index characters from the tailoring.
547     */
548    private boolean addChineseIndexCharacters() {
549        UnicodeSet contractions = new UnicodeSet();
550        try {
551            collatorPrimaryOnly.internalAddContractions(BASE.charAt(0), contractions);
552        } catch (Exception e) {
553            return false;
554        }
555        if (contractions.isEmpty()) { return false; }
556        initialLabels.addAll(contractions);
557        for (String s : contractions) {
558            assert(s.startsWith(BASE));
559            char c = s.charAt(s.length() - 1);
560            if (0x41 <= c && c <= 0x5A) {  // A-Z
561                // There are Pinyin labels, add ASCII A-Z labels as well.
562                initialLabels.add(0x41, 0x5A);  // A-Z
563                break;
564            }
565        }
566        return true;
567    }
568
569    /**
570     * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
571     * <p>This is used to test whether contractions sort differently from their components.
572     */
573    private String separated(String item) {
574        StringBuilder result = new StringBuilder();
575        // add a CGJ except within surrogates
576        char last = item.charAt(0);
577        result.append(last);
578        for (int i = 1; i < item.length(); ++i) {
579            char ch = item.charAt(i);
580            if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
581                result.append(CGJ);
582            }
583            result.append(ch);
584            last = ch;
585        }
586        return result.toString();
587    }
588
589    /**
590     * Builds an immutable, thread-safe version of this instance, without data records.
591     *
592     * @return an immutable index instance
593     */
594    public ImmutableIndex<V> buildImmutableIndex() {
595        // The current AlphabeticIndex Java code never modifies the bucket list once built.
596        // If it contains no records, we can use it.
597        // addRecord() sets buckets=null rather than inserting the new record into it.
598        BucketList<V> immutableBucketList;
599        if (inputList != null && !inputList.isEmpty()) {
600            // We need a bucket list with no records.
601            immutableBucketList = createBucketList();
602        } else {
603            if (buckets == null) {
604                buckets = createBucketList();
605            }
606            immutableBucketList = buckets;
607        }
608        return new ImmutableIndex<V>(immutableBucketList, collatorPrimaryOnly);
609    }
610
611    /**
612     * Get the labels.
613     *
614     * @return The list of bucket labels, after processing.
615     */
616    public List<String> getBucketLabels() {
617        initBuckets();
618        ArrayList<String> result = new ArrayList<String>();
619        for (Bucket<V> bucket : buckets) {
620            result.add(bucket.getLabel());
621        }
622        return result;
623    }
624
625    /**
626     * Get a clone of the collator used internally. Note that for performance reasons, the clone is only done once, and
627     * then stored. The next time it is accessed, the same instance is returned.
628     * <p>
629     * <b><i>Don't use this method across threads if you are changing the settings on the collator, at least not without
630     * synchronizing.</i></b>
631     *
632     * @return a clone of the collator used internally
633     */
634    public RuleBasedCollator getCollator() {
635        if (collatorExternal == null) {
636            try {
637                collatorExternal = (RuleBasedCollator) (collatorOriginal.clone());
638            } catch (Exception e) {
639                // should never happen
640                throw new IllegalStateException("Collator cannot be cloned", e);
641            }
642        }
643        return collatorExternal;
644    }
645
646    /**
647     * Add a record (name and data) to the index. The name will be used to sort the items into buckets, and to sort
648     * within the bucket. Two records may have the same name. When they do, the sort order is according to the order added:
649     * the first added comes first.
650     *
651     * @param name
652     *            Name, such as a name
653     * @param data
654     *            Data, such as an address or link
655     * @return this, for chaining
656     */
657    public AlphabeticIndex<V> addRecord(CharSequence name, V data) {
658        // TODO instead of invalidating, just add to unprocessed list.
659        buckets = null; // invalidate old bucketlist
660        if (inputList == null) {
661            inputList = new ArrayList<Record<V>>();
662        }
663        inputList.add(new Record<V>(name, data));
664        return this;
665    }
666
667    /**
668     * Get the bucket number for the given name. This routine permits callers to implement their own bucket handling
669     * mechanisms, including client-server handling. For example, when a new name is created on the client, it can ask
670     * the server for the bucket for that name, and the sortkey (using getCollator). Once the client has that
671     * information, it can put the name into the right bucket, and sort it within that bucket, without having access to
672     * the index or collator.
673     * <p>
674     * Note that the bucket number (and sort key) are only valid for the settings of the current AlphabeticIndex; if
675     * those are changed, then the bucket number and sort key must be regenerated.
676     *
677     * @param name
678     *            Name, such as a name
679     * @return the bucket index for the name
680     */
681    public int getBucketIndex(CharSequence name) {
682        initBuckets();
683        return buckets.getBucketIndex(name, collatorPrimaryOnly);
684    }
685
686    /**
687     * Clear the index.
688     *
689     * @return this, for chaining
690     */
691    public AlphabeticIndex<V> clearRecords() {
692        if (inputList != null && !inputList.isEmpty()) {
693            inputList.clear();
694            buckets = null;
695        }
696        return this;
697    }
698
699    /**
700     * Return the number of buckets in the index. This will be the same as the number of labels, plus buckets for the underflow, overflow, and inflow(s).
701     *
702     * @return number of buckets
703     */
704    public int getBucketCount() {
705        initBuckets();
706        return buckets.getBucketCount();
707    }
708
709    /**
710     * Return the number of records in the index: that is, the total number of distinct &lt;name,data&gt; pairs added with addRecord(...), over all the buckets.
711     *
712     * @return total number of records in buckets
713     */
714    public int getRecordCount() {
715        return inputList != null ? inputList.size() : 0;
716    }
717
718    /**
719     * Return an iterator over the buckets.
720     *
721     * @return iterator over buckets.
722     */
723    public Iterator<Bucket<V>> iterator() {
724        initBuckets();
725        return buckets.iterator();
726    }
727
728    /**
729     * Creates an index, and buckets and sorts the list of records into the index.
730     */
731    private void initBuckets() {
732        if (buckets != null) {
733            return;
734        }
735        buckets = createBucketList();
736        if (inputList == null || inputList.isEmpty()) {
737            return;
738        }
739
740        // Sort the records by name.
741        // Stable sort preserves input order of collation duplicates.
742        Collections.sort(inputList, recordComparator);
743
744        // Now, we traverse all of the input, which is now sorted.
745        // If the item doesn't go in the current bucket, we find the next bucket that contains it.
746        // This makes the process order n*log(n), since we just sort the list and then do a linear process.
747        // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
748        // we need to improve it for that case.
749
750        Iterator<Bucket<V>> bucketIterator = buckets.fullIterator();
751        Bucket<V> currentBucket = bucketIterator.next();
752        Bucket<V> nextBucket;
753        String upperBoundary;
754        if (bucketIterator.hasNext()) {
755            nextBucket = bucketIterator.next();
756            upperBoundary = nextBucket.lowerBoundary;
757        } else {
758            nextBucket = null;
759            upperBoundary = null;
760        }
761        for (Record<V> r : inputList) {
762            // if the current bucket isn't the right one, find the one that is
763            // We have a special flag for the last bucket so that we don't look any further
764            while (upperBoundary != null &&
765                    collatorPrimaryOnly.compare(r.name, upperBoundary) >= 0) {
766                currentBucket = nextBucket;
767                // now reset the boundary that we compare against
768                if (bucketIterator.hasNext()) {
769                    nextBucket = bucketIterator.next();
770                    upperBoundary = nextBucket.lowerBoundary;
771                } else {
772                    upperBoundary = null;
773                }
774            }
775            // now put the record into the bucket.
776            Bucket<V> bucket = currentBucket;
777            if (bucket.displayBucket != null) {
778                bucket = bucket.displayBucket;
779            }
780            if (bucket.records == null) {
781                bucket.records = new ArrayList<Record<V>>();
782            }
783            bucket.records.add(r);
784        }
785    }
786
787    private int maxLabelCount = 99;
788
789    /**
790     * Returns true if one index character string is "better" than the other.
791     * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
792     * better, and otherwise binary-less-than is better.
793     */
794    private static boolean isOneLabelBetterThanOther(Normalizer2 nfkdNormalizer, String one, String other) {
795        // This is called with primary-equal strings, but never with one.equals(other).
796        String n1 = nfkdNormalizer.normalize(one);
797        String n2 = nfkdNormalizer.normalize(other);
798        int result = n1.codePointCount(0, n1.length()) - n2.codePointCount(0, n2.length());
799        if (result != 0) {
800            return result < 0;
801        }
802        result = binaryCmp.compare(n1, n2);
803        if (result != 0) {
804            return result < 0;
805        }
806        return binaryCmp.compare(one, other) < 0;
807    }
808
809    /**
810     * A (name, data) pair, to be sorted by name into one of the index buckets.
811     * The user data is not used by the index implementation.
812     */
813    public static class Record<V> {
814        private final CharSequence name;
815        private final V data;
816
817        private Record(CharSequence name, V data) {
818            this.name = name;
819            this.data = data;
820        }
821
822        /**
823         * Get the name
824         *
825         * @return the name
826         */
827        public CharSequence getName() {
828            return name;
829        }
830
831        /**
832         * Get the data
833         *
834         * @return the data
835         */
836        public V getData() {
837            return data;
838        }
839
840        /**
841         * Standard toString()
842         */
843        public String toString() {
844            return name + "=" + data;
845        }
846    }
847
848    /**
849     * An index "bucket" with a label string and type.
850     * It is referenced by {@link AlphabeticIndex#getBucketIndex(CharSequence)}
851     * and {@link AlphabeticIndex.ImmutableIndex#getBucketIndex(CharSequence)},
852     * returned by {@link AlphabeticIndex.ImmutableIndex#getBucket(int)},
853     * and {@link AlphabeticIndex#addRecord(CharSequence, Object)} adds a record
854     * into a bucket according to the record's name.
855     *
856     * @param <V>
857     *            Data type
858     */
859    public static class Bucket<V> implements Iterable<Record<V>> {
860        private final String label;
861        private final String lowerBoundary;
862        private final LabelType labelType;
863        private Bucket<V> displayBucket;
864        private int displayIndex;
865        private List<Record<V>> records;
866
867        /**
868         * Type of the label
869         */
870        public enum LabelType {
871            /**
872             * Normal
873             */
874            NORMAL,
875            /**
876             * Underflow (before the first)
877             */
878            UNDERFLOW,
879            /**
880             * Inflow (between scripts)
881             */
882            INFLOW,
883            /**
884             * Overflow (after the last)
885             */
886            OVERFLOW
887        }
888
889        /**
890         * Set up the bucket.
891         *
892         * @param label
893         *            label for the bucket
894         * @param labelType
895         *            is an underflow, overflow, or inflow bucket
896         */
897        private Bucket(String label, String lowerBoundary, LabelType labelType) {
898            this.label = label;
899            this.lowerBoundary = lowerBoundary;
900            this.labelType = labelType;
901        }
902
903        /**
904         * Get the label
905         *
906         * @return label for the bucket
907         */
908        public String getLabel() {
909            return label;
910        }
911
912        /**
913         * Is a normal, underflow, overflow, or inflow bucket
914         *
915         * @return is an underflow, overflow, or inflow bucket
916         */
917        public LabelType getLabelType() {
918            return labelType;
919        }
920
921        /**
922         * Get the number of records in the bucket.
923         *
924         * @return number of records in bucket
925         */
926        public int size() {
927            return records == null ? 0 : records.size();
928        }
929
930        /**
931         * Iterator over the records in the bucket
932         */
933        public Iterator<Record<V>> iterator() {
934            if (records == null) {
935                return Collections.<Record<V>>emptyList().iterator();
936            }
937            return records.iterator();
938        }
939
940        /**
941         * Standard toString()
942         */
943        @Override
944        public String toString() {
945            return "{" +
946            "labelType=" + labelType
947            + ", " +
948            "lowerBoundary=" + lowerBoundary
949            + ", " +
950            "label=" + label
951            + "}"
952            ;
953        }
954    }
955
956    private BucketList<V> createBucketList() {
957        // Initialize indexCharacters.
958        List<String> indexCharacters = initLabels();
959
960        // Variables for hasMultiplePrimaryWeights().
961        long variableTop;
962        if (collatorPrimaryOnly.isAlternateHandlingShifted()) {
963            variableTop = collatorPrimaryOnly.getVariableTop() & 0xffffffffL;
964        } else {
965            variableTop = 0;
966        }
967        boolean hasInvisibleBuckets = false;
968
969        // Helper arrays for Chinese Pinyin collation.
970        @SuppressWarnings({ "rawtypes", "unchecked" })
971        Bucket<V>[] asciiBuckets = new Bucket[26];
972        @SuppressWarnings({ "rawtypes", "unchecked" })
973        Bucket<V>[] pinyinBuckets = new Bucket[26];
974        boolean hasPinyin = false;
975
976        ArrayList<Bucket<V>> bucketList = new ArrayList<Bucket<V>>();
977
978        // underflow bucket
979        bucketList.add(new Bucket<V>(getUnderflowLabel(), "", LabelType.UNDERFLOW));
980
981        // fix up the list, adding underflow, additions, overflow
982        // Insert inflow labels as needed.
983        int scriptIndex = -1;
984        String scriptUpperBoundary = "";
985        for (String current : indexCharacters) {
986            if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) >= 0) {
987                // We crossed the script boundary into a new script.
988                String inflowBoundary = scriptUpperBoundary;
989                boolean skippedScript = false;
990                for (;;) {
991                    scriptUpperBoundary = firstCharsInScripts.get(++scriptIndex);
992                    if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) < 0) {
993                        break;
994                    }
995                    skippedScript = true;
996                }
997                if (skippedScript && bucketList.size() > 1) {
998                    // We are skipping one or more scripts,
999                    // and we are not just getting out of the underflow label.
1000                    bucketList.add(new Bucket<V>(getInflowLabel(), inflowBoundary,
1001                            LabelType.INFLOW));
1002                }
1003            }
1004            // Add a bucket with the current label.
1005            Bucket<V> bucket = new Bucket<V>(fixLabel(current), current, LabelType.NORMAL);
1006            bucketList.add(bucket);
1007            // Remember ASCII and Pinyin buckets for Pinyin redirects.
1008            char c;
1009            if (current.length() == 1 && 'A' <= (c = current.charAt(0)) && c <= 'Z') {
1010                asciiBuckets[c - 'A'] = bucket;
1011            } else if (current.length() == BASE.length() + 1 && current.startsWith(BASE) &&
1012                    'A' <= (c = current.charAt(BASE.length())) && c <= 'Z') {
1013                pinyinBuckets[c - 'A'] = bucket;
1014                hasPinyin = true;
1015            }
1016            // Check for multiple primary weights.
1017            if (!current.startsWith(BASE) &&
1018                    hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, current) &&
1019                    !current.endsWith("\uffff")) {
1020                // "Æ" or "Sch" etc.
1021                for (int i = bucketList.size() - 2;; --i) {
1022                    Bucket<V> singleBucket = bucketList.get(i);
1023                    if (singleBucket.labelType != LabelType.NORMAL) {
1024                        // There is no single-character bucket since the last
1025                        // underflow or inflow label.
1026                        break;
1027                    }
1028                    if (singleBucket.displayBucket == null &&
1029                            !hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, singleBucket.lowerBoundary)) {
1030                        // Add an invisible bucket that redirects strings greater than the expansion
1031                        // to the previous single-character bucket.
1032                        // For example, after ... Q R S Sch we add Sch\uFFFF->S
1033                        // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
1034                        bucket = new Bucket<V>("", current + "\uFFFF", LabelType.NORMAL);
1035                        bucket.displayBucket = singleBucket;
1036                        bucketList.add(bucket);
1037                        hasInvisibleBuckets = true;
1038                        break;
1039                    }
1040                }
1041            }
1042        }
1043        if (bucketList.size() == 1) {
1044            // No real labels, show only the underflow label.
1045            return new BucketList<V>(bucketList, bucketList);
1046        }
1047        // overflow bucket
1048        bucketList.add(new Bucket<V>(getOverflowLabel(), scriptUpperBoundary, LabelType.OVERFLOW)); // final
1049
1050        if (hasPinyin) {
1051            // Redirect Pinyin buckets.
1052            Bucket<V> asciiBucket = null;
1053            for (int i = 0; i < 26; ++i) {
1054                if (asciiBuckets[i] != null) {
1055                    asciiBucket = asciiBuckets[i];
1056                }
1057                if (pinyinBuckets[i] != null && asciiBucket != null) {
1058                    pinyinBuckets[i].displayBucket = asciiBucket;
1059                    hasInvisibleBuckets = true;
1060                }
1061            }
1062        }
1063
1064        if (!hasInvisibleBuckets) {
1065            return new BucketList<V>(bucketList, bucketList);
1066        }
1067        // Merge inflow buckets that are visually adjacent.
1068        // Iterate backwards: Merge inflow into overflow rather than the other way around.
1069        int i = bucketList.size() - 1;
1070        Bucket<V> nextBucket = bucketList.get(i);
1071        while (--i > 0) {
1072            Bucket<V> bucket = bucketList.get(i);
1073            if (bucket.displayBucket != null) {
1074                continue;  // skip invisible buckets
1075            }
1076            if (bucket.labelType == LabelType.INFLOW) {
1077                if (nextBucket.labelType != LabelType.NORMAL) {
1078                    bucket.displayBucket = nextBucket;
1079                    continue;
1080                }
1081            }
1082            nextBucket = bucket;
1083        }
1084
1085        ArrayList<Bucket<V>> publicBucketList = new ArrayList<Bucket<V>>();
1086        for (Bucket<V> bucket : bucketList) {
1087            if (bucket.displayBucket == null) {
1088                publicBucketList.add(bucket);
1089            }
1090        }
1091        return new BucketList<V>(bucketList, publicBucketList);
1092    }
1093
1094    private static class BucketList<V> implements Iterable<Bucket<V>> {
1095        private final ArrayList<Bucket<V>> bucketList;
1096        private final List<Bucket<V>> immutableVisibleList;
1097
1098        private BucketList(ArrayList<Bucket<V>> bucketList, ArrayList<Bucket<V>> publicBucketList) {
1099            this.bucketList = bucketList;
1100
1101            int displayIndex = 0;
1102            for (Bucket<V> bucket : publicBucketList) {
1103                bucket.displayIndex = displayIndex++;
1104            }
1105            immutableVisibleList = Collections.unmodifiableList(publicBucketList);
1106        }
1107
1108        private int getBucketCount() {
1109            return immutableVisibleList.size();
1110        }
1111
1112        private int getBucketIndex(CharSequence name, Collator collatorPrimaryOnly) {
1113            // binary search
1114            int start = 0;
1115            int limit = bucketList.size();
1116            while ((start + 1) < limit) {
1117                int i = (start + limit) / 2;
1118                Bucket<V> bucket = bucketList.get(i);
1119                int nameVsBucket = collatorPrimaryOnly.compare(name, bucket.lowerBoundary);
1120                if (nameVsBucket < 0) {
1121                    limit = i;
1122                } else {
1123                    start = i;
1124                }
1125            }
1126            Bucket<V> bucket = bucketList.get(start);
1127            if (bucket.displayBucket != null) {
1128                bucket = bucket.displayBucket;
1129            }
1130            return bucket.displayIndex;
1131        }
1132
1133        /**
1134         * Private iterator over all the buckets, visible and invisible
1135         */
1136        private Iterator<Bucket<V>> fullIterator() {
1137            return bucketList.iterator();
1138        }
1139
1140        /**
1141         * Iterator over just the visible buckets.
1142         */
1143        public Iterator<Bucket<V>> iterator() {
1144            return immutableVisibleList.iterator(); // use immutable list to prevent remove().
1145        }
1146    }
1147
1148    private static boolean hasMultiplePrimaryWeights(
1149            RuleBasedCollator coll, long variableTop, String s) {
1150        long[] ces = coll.internalGetCEs(s);
1151        boolean seenPrimary = false;
1152        for (int i = 0; i < ces.length; ++i) {
1153            long ce = ces[i];
1154            long p = ce >>> 32;
1155            if (p > variableTop) {
1156                // not primary ignorable
1157                if (seenPrimary) {
1158                    return true;
1159                }
1160                seenPrimary = true;
1161            }
1162        }
1163        return false;
1164    }
1165
1166    // TODO: Surely we have at least a ticket for porting these mask values to UCharacter.java?!
1167    private static final int GC_LU_MASK = 1 << UCharacter.UPPERCASE_LETTER;
1168    private static final int GC_LL_MASK = 1 << UCharacter.LOWERCASE_LETTER;
1169    private static final int GC_LT_MASK = 1 << UCharacter.TITLECASE_LETTER;
1170    private static final int GC_LM_MASK = 1 << UCharacter.MODIFIER_LETTER;
1171    private static final int GC_LO_MASK = 1 << UCharacter.OTHER_LETTER;
1172    private static final int GC_L_MASK =
1173            GC_LU_MASK|GC_LL_MASK|GC_LT_MASK|GC_LM_MASK|GC_LO_MASK;
1174    private static final int GC_CN_MASK = 1 << UCharacter.GENERAL_OTHER_TYPES;
1175
1176    /**
1177     * Return a list of the first character in each script. Only exposed for testing.
1178     *
1179     * @return list of first characters in each script
1180     * @deprecated This API is ICU internal, only for testing.
1181     * @hide original deprecated declaration
1182     * @hide draft / provisional / internal are hidden on Android
1183     */
1184    @Deprecated
1185    public List<String> getFirstCharactersInScripts() {
1186        List<String> dest = new ArrayList<String>(200);
1187        // Fetch the script-first-primary contractions which are defined in the root collator.
1188        // They all start with U+FDD1.
1189        UnicodeSet set = new UnicodeSet();
1190        collatorPrimaryOnly.internalAddContractions(0xFDD1, set);
1191        if (set.isEmpty()) {
1192            throw new UnsupportedOperationException(
1193                    "AlphabeticIndex requires script-first-primary contractions");
1194        }
1195        for (String boundary : set) {
1196            int gcMask = 1 << UCharacter.getType(boundary.codePointAt(1));
1197            if ((gcMask & (GC_L_MASK | GC_CN_MASK)) == 0) {
1198                // Ignore boundaries for the special reordering groups.
1199                // Take only those for "real scripts" (where the sample character is a Letter,
1200                // and the one for unassigned implicit weights (Cn).
1201                continue;
1202            }
1203            dest.add(boundary);
1204        }
1205        return dest;
1206    }
1207}
1208