1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5 *******************************************************************************
6 * Copyright (C) 2008-2016, Google Inc, International Business Machines Corporation
7 * and others. All Rights Reserved.
8 *******************************************************************************
9 */
10package android.icu.text;
11
12import java.util.ArrayList;
13import java.util.Collections;
14import java.util.Comparator;
15import java.util.Iterator;
16import java.util.List;
17import java.util.Locale;
18
19import android.icu.lang.UCharacter;
20import android.icu.text.AlphabeticIndex.Bucket;
21import android.icu.text.AlphabeticIndex.Bucket.LabelType;
22import android.icu.util.LocaleData;
23import android.icu.util.ULocale;
24
25/**
26 * AlphabeticIndex supports the creation of a UI index appropriate for a given language.
27 * It can support either direct use, or use with a client that doesn't support localized collation.
28 * The following is an example of what an index might look like in a UI:
29 *
30 * <pre>
31 *  <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z  ...</b>
32 *
33 *  <b>A</b>
34 *     Addison
35 *     Albertson
36 *     Azensky
37 *  <b>B</b>
38 *     Baecker
39 *  ...
40 * </pre>
41 *
42 * The class can generate a list of labels for use as a UI "index", that is, a list of
43 * clickable characters (or character sequences) that allow the user to see a segment
44 * (bucket) of a larger "target" list. That is, each label corresponds to a bucket in
45 * the target list, where everything in the bucket is greater than or equal to the character
46 * (according to the locale's collation). Strings can be added to the index;
47 * they will be in sorted order in the right bucket.
48 * <p>
49 * The class also supports having buckets for strings before the first (underflow),
50 * after the last (overflow), and between scripts (inflow). For example, if the index
51 * is constructed with labels for Russian and English, Greek characters would fall
52 * into an inflow bucket between the other two scripts.
53 *
54 * <p><em>Note:</em> If you expect to have a lot of ASCII or Latin characters
55 * as well as characters from the user's language,
56 * then it is a good idea to call addLabels(ULocale.English).
57 *
58 * <h2>Direct Use</h2>
59 * <p>The following shows an example of building an index directly.
60 *  The "show..." methods below are just to illustrate usage.
61 *
62 * <pre>
63 * // Create a simple index where the values for the strings are Integers, and add the strings
64 *
65 * AlphabeticIndex&lt;Integer&gt; index = new AlphabeticIndex&lt;Integer&gt;(desiredLocale).addLabels(additionalLocale);
66 * int counter = 0;
67 * for (String item : test) {
68 *     index.addRecord(item, counter++);
69 * }
70 * ...
71 * // Show index at top. We could skip or gray out empty buckets
72 *
73 * for (AlphabeticIndex.Bucket&lt;Integer&gt; bucket : index) {
74 *     if (showAll || bucket.size() != 0) {
75 *         showLabelAtTop(UI, bucket.getLabel());
76 *     }
77 * }
78 *  ...
79 * // Show the buckets with their contents, skipping empty buckets
80 *
81 * for (AlphabeticIndex.Bucket&lt;Integer&gt; bucket : index) {
82 *     if (bucket.size() != 0) {
83 *         showLabelInList(UI, bucket.getLabel());
84 *         for (AlphabeticIndex.Record&lt;Integer&gt; item : bucket) {
85 *             showIndexedItem(UI, item.getName(), item.getData());
86 *         }
87 * </pre>
88 *
89 * The caller can build different UIs using this class.
90 * For example, an index character could be omitted or grayed-out
91 * if its bucket is empty. Small buckets could also be combined based on size, such as:
92 *
93 * <pre>
94 * <b>... A-F G-N O-Z ...</b>
95 * </pre>
96 *
97 * <h2>Client Support</h2>
98 * <p>Callers can also use the {@link AlphabeticIndex.ImmutableIndex}, or the AlphabeticIndex itself,
99 * to support sorting on a client that doesn't support AlphabeticIndex functionality.
100 *
101 * <p>The ImmutableIndex is both immutable and thread-safe.
102 * The corresponding AlphabeticIndex methods are not thread-safe because
103 * they "lazily" build the index buckets.
104 * <ul>
105 * <li>ImmutableIndex.getBucket(index) provides random access to all
106 *     buckets and their labels and label types.
107 * <li>AlphabeticIndex.getBucketLabels() or the bucket iterator on either class
108 *     can be used to get a list of the labels,
109 *     such as "...", "A", "B",..., and send that list to the client.
110 * <li>When the client has a new name, it sends that name to the server.
111 * The server needs to call the following methods,
112 * and communicate the bucketIndex and collationKey back to the client.
113 *
114 * <pre>
115 * int bucketIndex = index.getBucketIndex(name);
116 * String label = immutableIndex.getBucket(bucketIndex).getLabel();  // optional
117 * RawCollationKey collationKey = collator.getRawCollationKey(name, null);
118 * </pre>
119 *
120 * <li>The client would put the name (and associated information) into its bucket for bucketIndex. The collationKey is a
121 * sequence of bytes that can be compared with a binary compare, and produce the right localized result.</li>
122 * </ul>
123 *
124 * @author Mark Davis
125 */
126public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {
127    /**
128     * Prefix string for Chinese index buckets.
129     * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
130     */
131    private static final String BASE = "\uFDD0";
132
133    private static final char CGJ = '\u034F';
134
135    private static final Comparator<String> binaryCmp = new UTF16.StringComparator(true, false, 0);
136
137    private final RuleBasedCollator collatorOriginal;
138    private final RuleBasedCollator collatorPrimaryOnly;
139    private RuleBasedCollator collatorExternal;
140
141    // Comparator for records, so that the Record class can be static.
142    private final Comparator<Record<V>> recordComparator = new Comparator<Record<V>>() {
143        @Override
144        public int compare(Record<V> o1, Record<V> o2) {
145            return collatorOriginal.compare(o1.name, o2.name);
146        }
147    };
148
149    private final List<String> firstCharsInScripts;
150
151    // We accumulate these as we build up the input parameters
152    private final UnicodeSet initialLabels = new UnicodeSet();
153    private List<Record<V>> inputList;
154
155    // Lazy evaluated: null means that we have not built yet.
156    private BucketList<V> buckets;
157
158    private String overflowLabel = "\u2026";
159    private String underflowLabel = "\u2026";
160    private String inflowLabel = "\u2026";
161
162    /**
163     * Immutable, thread-safe version of {@link AlphabeticIndex}.
164     * This class provides thread-safe methods for bucketing,
165     * and random access to buckets and their properties,
166     * but does not offer adding records to the index.
167     *
168     * @param <V> The Record value type is unused. It can be omitted for this class
169     * if it was omitted for the AlphabeticIndex that built it.
170     */
171    public static final class ImmutableIndex<V> implements Iterable<Bucket<V>> {
172        private final BucketList<V> buckets;
173        private final Collator collatorPrimaryOnly;
174
175        private ImmutableIndex(BucketList<V> bucketList, Collator collatorPrimaryOnly) {
176            this.buckets = bucketList;
177            this.collatorPrimaryOnly = collatorPrimaryOnly;
178        }
179
180        /**
181         * Returns the number of index buckets and labels, including underflow/inflow/overflow.
182         *
183         * @return the number of index buckets
184         */
185        public int getBucketCount() {
186            return buckets.getBucketCount();
187        }
188
189        /**
190         * Finds the index bucket for the given name and returns the number of that bucket.
191         * Use {@link #getBucket(int)} to get the bucket's properties.
192         *
193         * @param name the string to be sorted into an index bucket
194         * @return the bucket number for the name
195         */
196        public int getBucketIndex(CharSequence name) {
197            return buckets.getBucketIndex(name, collatorPrimaryOnly);
198        }
199
200        /**
201         * Returns the index-th bucket. Returns null if the index is out of range.
202         *
203         * @param index bucket number
204         * @return the index-th bucket
205         */
206        public Bucket<V> getBucket(int index) {
207            if (0 <= index && index < buckets.getBucketCount()) {
208                return buckets.immutableVisibleList.get(index);
209            } else {
210                return null;
211            }
212        }
213
214        /**
215         * {@inheritDoc}
216         */
217        @Override
218        public Iterator<Bucket<V>> iterator() {
219            return buckets.iterator();
220        }
221    }
222
223    /**
224     * Create the index object.
225     *
226     * @param locale
227     *            The locale for the index.
228     */
229    public AlphabeticIndex(ULocale locale) {
230        this(locale, null);
231    }
232
233    /**
234     * Create the index object.
235     *
236     * @param locale
237     *            The locale for the index.
238     */
239    public AlphabeticIndex(Locale locale) {
240        this(ULocale.forLocale(locale), null);
241    }
242
243    /**
244     * Create an AlphabeticIndex that uses a specific collator.
245     *
246     * <p>The index will be created with no labels; the addLabels() function must be called
247     * after creation to add the desired labels to the index.
248     *
249     * <p>The index will work directly with the supplied collator. If the caller will need to
250     * continue working with the collator it should be cloned first, so that the
251     * collator provided to the AlphabeticIndex remains unchanged after creation of the index.
252     *
253     * @param collator The collator to use to order the contents of this index.
254     */
255    public AlphabeticIndex(RuleBasedCollator collator) {
256        this(null, collator);
257    }
258
259    /**
260     * Internal constructor containing implementation used by public constructors.
261     */
262    private AlphabeticIndex(ULocale locale, RuleBasedCollator collator) {
263        collatorOriginal = collator != null ? collator : (RuleBasedCollator) Collator.getInstance(locale);
264        try {
265            collatorPrimaryOnly = collatorOriginal.cloneAsThawed();
266        } catch (Exception e) {
267            // should never happen
268            throw new IllegalStateException("Collator cannot be cloned", e);
269        }
270        collatorPrimaryOnly.setStrength(Collator.PRIMARY);
271        collatorPrimaryOnly.freeze();
272
273        firstCharsInScripts = getFirstCharactersInScripts();
274        Collections.sort(firstCharsInScripts, collatorPrimaryOnly);
275        // Guard against a degenerate collator where
276        // some script boundary strings are primary ignorable.
277        for (;;) {
278            if (firstCharsInScripts.isEmpty()) {
279                throw new IllegalArgumentException(
280                        "AlphabeticIndex requires some non-ignorable script boundary strings");
281            }
282            if (collatorPrimaryOnly.compare(firstCharsInScripts.get(0), "") == 0) {
283                firstCharsInScripts.remove(0);
284            } else {
285                break;
286            }
287        }
288
289        // Chinese index characters, which are specific to each of the several Chinese tailorings,
290        // take precedence over the single locale data exemplar set per language.
291        if (!addChineseIndexCharacters() && locale != null) {
292            addIndexExemplars(locale);
293        }
294    }
295
296    /**
297     * Add more index characters (aside from what are in the locale)
298     * @param additions additional characters to add to the index, such as A-Z.
299     * @return this, for chaining
300     */
301    public AlphabeticIndex<V> addLabels(UnicodeSet additions) {
302        initialLabels.addAll(additions);
303        buckets = null;
304        return this;
305    }
306
307    /**
308     * Add more index characters (aside from what are in the locale)
309     * @param additions additional characters to add to the index, such as those in Swedish.
310     * @return this, for chaining
311     */
312    public AlphabeticIndex<V> addLabels(ULocale... additions) {
313        for (ULocale addition : additions) {
314            addIndexExemplars(addition);
315        }
316        buckets = null;
317        return this;
318    }
319
320    /**
321     * Add more index characters (aside from what are in the locale)
322     * @param additions additional characters to add to the index, such as those in Swedish.
323     * @return this, for chaining
324     */
325    public AlphabeticIndex<V> addLabels(Locale... additions) {
326        for (Locale addition : additions) {
327            addIndexExemplars(ULocale.forLocale(addition));
328        }
329        buckets = null;
330        return this;
331    }
332
333    /**
334     * Set the overflow label
335     * @param overflowLabel see class description
336     * @return this, for chaining
337     */
338    public AlphabeticIndex<V> setOverflowLabel(String overflowLabel) {
339        this.overflowLabel = overflowLabel;
340        buckets = null;
341        return this;
342    }
343
344    /**
345     * Get the default label used in the IndexCharacters' locale for underflow, eg the last item in: X Y Z ...
346     *
347     * @return underflow label
348     */
349    public String getUnderflowLabel() {
350        return underflowLabel; // TODO get localized version
351    }
352
353
354    /**
355     * Set the underflowLabel label
356     * @param underflowLabel see class description
357     * @return this, for chaining
358     */
359    public AlphabeticIndex<V> setUnderflowLabel(String underflowLabel) {
360        this.underflowLabel = underflowLabel;
361        buckets = null;
362        return this;
363    }
364
365    /**
366     * Get the default label used in the IndexCharacters' locale for overflow, eg the first item in: ... A B C
367     *
368     * @return overflow label
369     */
370    public String getOverflowLabel() {
371        return overflowLabel; // TODO get localized version
372    }
373
374
375    /**
376     * Set the inflowLabel label
377     * @param inflowLabel see class description
378     * @return this, for chaining
379     */
380    public AlphabeticIndex<V> setInflowLabel(String inflowLabel) {
381        this.inflowLabel = inflowLabel;
382        buckets = null;
383        return this;
384    }
385
386    /**
387     * Get the default label used for abbreviated buckets <i>between</i> other labels. For example, consider the labels
388     * for Latin and Greek are used: X Y Z ... &#x0391; &#x0392; &#x0393;.
389     *
390     * @return inflow label
391     */
392    public String getInflowLabel() {
393        return inflowLabel; // TODO get localized version
394    }
395
396
397    /**
398     * Get the limit on the number of labels in the index. The number of buckets can be slightly larger: see getBucketCount().
399     *
400     * @return maxLabelCount maximum number of labels.
401     */
402    public int getMaxLabelCount() {
403        return maxLabelCount;
404    }
405
406    /**
407     * Set a limit on the number of labels in the index. The number of buckets can be slightly larger: see
408     * getBucketCount().
409     *
410     * @param maxLabelCount Set the maximum number of labels. Currently, if the number is exceeded, then every
411     *         nth item is removed to bring the count down. A more sophisticated mechanism may be available in the
412     *         future.
413     * @return this, for chaining
414     */
415    public AlphabeticIndex<V> setMaxLabelCount(int maxLabelCount) {
416        this.maxLabelCount = maxLabelCount;
417        buckets = null;
418        return this;
419    }
420
421    /**
422     * Determine the best labels to use. This is based on the exemplars, but we also process to make sure that they are unique,
423     * and sort differently, and that the overall list is small enough.
424     */
425    private List<String> initLabels() {
426        Normalizer2 nfkdNormalizer = Normalizer2.getNFKDInstance();
427        List<String> indexCharacters = new ArrayList<String>();
428
429        String firstScriptBoundary = firstCharsInScripts.get(0);
430        String overflowBoundary = firstCharsInScripts.get(firstCharsInScripts.size() - 1);
431
432        // We make a sorted array of elements.
433        // Some of the input may be redundant.
434        // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
435        // We filter out those cases.
436        for (String item : initialLabels) {
437            boolean checkDistinct;
438            if (!UTF16.hasMoreCodePointsThan(item, 1)) {
439                checkDistinct = false;
440            } else if(item.charAt(item.length() - 1) == '*' &&
441                    item.charAt(item.length() - 2) != '*') {
442                // Use a label if it is marked with one trailing star,
443                // even if the label string sorts the same when all contractions are suppressed.
444                item = item.substring(0, item.length() - 1);
445                checkDistinct = false;
446            } else {
447                checkDistinct = true;
448            }
449            if (collatorPrimaryOnly.compare(item, firstScriptBoundary) < 0) {
450                // Ignore a primary-ignorable or non-alphabetic index character.
451            } else if (collatorPrimaryOnly.compare(item, overflowBoundary) >= 0) {
452                // Ignore an index character that will land in the overflow bucket.
453            } else if (checkDistinct && collatorPrimaryOnly.compare(item, separated(item)) == 0) {
454                // Ignore a multi-code point index character that does not sort distinctly
455                // from the sequence of its separate characters.
456            } else {
457                int insertionPoint = Collections.binarySearch(indexCharacters, item, collatorPrimaryOnly);
458                if (insertionPoint < 0) {
459                    indexCharacters.add(~insertionPoint, item);
460                } else {
461                    String itemAlreadyIn = indexCharacters.get(insertionPoint);
462                    if (isOneLabelBetterThanOther(nfkdNormalizer, item, itemAlreadyIn)) {
463                        indexCharacters.set(insertionPoint, item);
464                    }
465                }
466            }
467        }
468
469        // if the result is still too large, cut down to maxLabelCount elements, by removing every nth element
470
471        final int size = indexCharacters.size() - 1;
472        if (size > maxLabelCount) {
473            int count = 0;
474            int old = -1;
475            for (Iterator<String> it = indexCharacters.iterator(); it.hasNext();) {
476                ++count;
477                it.next();
478                final int bump = count * maxLabelCount / size;
479                if (bump == old) {
480                    it.remove();
481                } else {
482                    old = bump;
483                }
484            }
485        }
486
487        return indexCharacters;
488    }
489
490    private static String fixLabel(String current) {
491        if (!current.startsWith(BASE)) {
492            return current;
493        }
494        int rest = current.charAt(BASE.length());
495        if (0x2800 < rest && rest <= 0x28FF) { // stroke count
496            return (rest-0x2800) + "\u5283";
497        }
498        return current.substring(BASE.length());
499    }
500
501    /**
502     * This method is called to get the index exemplars. Normally these come from the locale directly,
503     * but if they aren't available, we have to synthesize them.
504     */
505    private void addIndexExemplars(ULocale locale) {
506        UnicodeSet exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_INDEX);
507        if (exemplars != null) {
508            initialLabels.addAll(exemplars);
509            return;
510        }
511
512        // The locale data did not include explicit Index characters.
513        // Synthesize a set of them from the locale's standard exemplar characters.
514        exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_STANDARD);
515
516        exemplars = exemplars.cloneAsThawed();
517        // question: should we add auxiliary exemplars?
518        if (exemplars.containsSome('a', 'z') || exemplars.size() == 0) {
519            exemplars.addAll('a', 'z');
520        }
521        if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
522            // cut down to small list
523            exemplars.remove(0xAC00, 0xD7A3).
524                add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
525                add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
526                add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
527                add(0xD30C).add(0xD558);
528        }
529        if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
530            // cut down to small list
531            // make use of the fact that Ethiopic is allocated in 8's, where
532            // the base is 0 mod 8.
533            UnicodeSet ethiopic = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
534            UnicodeSetIterator it = new UnicodeSetIterator(ethiopic);
535            while (it.next() && it.codepoint != UnicodeSetIterator.IS_STRING) {
536                if ((it.codepoint & 0x7) != 0) {
537                    exemplars.remove(it.codepoint);
538                }
539            }
540        }
541
542        // Upper-case any that aren't already so.
543        //   (We only do this for synthesized index characters.)
544        for (String item : exemplars) {
545            initialLabels.add(UCharacter.toUpperCase(locale, item));
546        }
547    }
548
549    /**
550     * Add Chinese index characters from the tailoring.
551     */
552    private boolean addChineseIndexCharacters() {
553        UnicodeSet contractions = new UnicodeSet();
554        try {
555            collatorPrimaryOnly.internalAddContractions(BASE.charAt(0), contractions);
556        } catch (Exception e) {
557            return false;
558        }
559        if (contractions.isEmpty()) { return false; }
560        initialLabels.addAll(contractions);
561        for (String s : contractions) {
562            assert(s.startsWith(BASE));
563            char c = s.charAt(s.length() - 1);
564            if (0x41 <= c && c <= 0x5A) {  // A-Z
565                // There are Pinyin labels, add ASCII A-Z labels as well.
566                initialLabels.add(0x41, 0x5A);  // A-Z
567                break;
568            }
569        }
570        return true;
571    }
572
573    /**
574     * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
575     * <p>This is used to test whether contractions sort differently from their components.
576     */
577    private String separated(String item) {
578        StringBuilder result = new StringBuilder();
579        // add a CGJ except within surrogates
580        char last = item.charAt(0);
581        result.append(last);
582        for (int i = 1; i < item.length(); ++i) {
583            char ch = item.charAt(i);
584            if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
585                result.append(CGJ);
586            }
587            result.append(ch);
588            last = ch;
589        }
590        return result.toString();
591    }
592
593    /**
594     * Builds an immutable, thread-safe version of this instance, without data records.
595     *
596     * @return an immutable index instance
597     */
598    public ImmutableIndex<V> buildImmutableIndex() {
599        // The current AlphabeticIndex Java code never modifies the bucket list once built.
600        // If it contains no records, we can use it.
601        // addRecord() sets buckets=null rather than inserting the new record into it.
602        BucketList<V> immutableBucketList;
603        if (inputList != null && !inputList.isEmpty()) {
604            // We need a bucket list with no records.
605            immutableBucketList = createBucketList();
606        } else {
607            if (buckets == null) {
608                buckets = createBucketList();
609            }
610            immutableBucketList = buckets;
611        }
612        return new ImmutableIndex<V>(immutableBucketList, collatorPrimaryOnly);
613    }
614
615    /**
616     * Get the labels.
617     *
618     * @return The list of bucket labels, after processing.
619     */
620    public List<String> getBucketLabels() {
621        initBuckets();
622        ArrayList<String> result = new ArrayList<String>();
623        for (Bucket<V> bucket : buckets) {
624            result.add(bucket.getLabel());
625        }
626        return result;
627    }
628
629    /**
630     * Get a clone of the collator used internally. Note that for performance reasons, the clone is only done once, and
631     * then stored. The next time it is accessed, the same instance is returned.
632     * <p>
633     * <b><i>Don't use this method across threads if you are changing the settings on the collator, at least not without
634     * synchronizing.</i></b>
635     *
636     * @return a clone of the collator used internally
637     */
638    public RuleBasedCollator getCollator() {
639        if (collatorExternal == null) {
640            try {
641                collatorExternal = (RuleBasedCollator) (collatorOriginal.clone());
642            } catch (Exception e) {
643                // should never happen
644                throw new IllegalStateException("Collator cannot be cloned", e);
645            }
646        }
647        return collatorExternal;
648    }
649
650    /**
651     * Add a record (name and data) to the index. The name will be used to sort the items into buckets, and to sort
652     * within the bucket. Two records may have the same name. When they do, the sort order is according to the order added:
653     * the first added comes first.
654     *
655     * @param name
656     *            Name, such as a name
657     * @param data
658     *            Data, such as an address or link
659     * @return this, for chaining
660     */
661    public AlphabeticIndex<V> addRecord(CharSequence name, V data) {
662        // TODO instead of invalidating, just add to unprocessed list.
663        buckets = null; // invalidate old bucketlist
664        if (inputList == null) {
665            inputList = new ArrayList<Record<V>>();
666        }
667        inputList.add(new Record<V>(name, data));
668        return this;
669    }
670
671    /**
672     * Get the bucket number for the given name. This routine permits callers to implement their own bucket handling
673     * mechanisms, including client-server handling. For example, when a new name is created on the client, it can ask
674     * the server for the bucket for that name, and the sortkey (using getCollator). Once the client has that
675     * information, it can put the name into the right bucket, and sort it within that bucket, without having access to
676     * the index or collator.
677     * <p>
678     * Note that the bucket number (and sort key) are only valid for the settings of the current AlphabeticIndex; if
679     * those are changed, then the bucket number and sort key must be regenerated.
680     *
681     * @param name
682     *            Name, such as a name
683     * @return the bucket index for the name
684     */
685    public int getBucketIndex(CharSequence name) {
686        initBuckets();
687        return buckets.getBucketIndex(name, collatorPrimaryOnly);
688    }
689
690    /**
691     * Clear the index.
692     *
693     * @return this, for chaining
694     */
695    public AlphabeticIndex<V> clearRecords() {
696        if (inputList != null && !inputList.isEmpty()) {
697            inputList.clear();
698            buckets = null;
699        }
700        return this;
701    }
702
703    /**
704     * Return the number of buckets in the index. This will be the same as the number of labels, plus buckets for the underflow, overflow, and inflow(s).
705     *
706     * @return number of buckets
707     */
708    public int getBucketCount() {
709        initBuckets();
710        return buckets.getBucketCount();
711    }
712
713    /**
714     * Return the number of records in the index: that is, the total number of distinct &lt;name,data&gt; pairs added with addRecord(...), over all the buckets.
715     *
716     * @return total number of records in buckets
717     */
718    public int getRecordCount() {
719        return inputList != null ? inputList.size() : 0;
720    }
721
722    /**
723     * Return an iterator over the buckets.
724     *
725     * @return iterator over buckets.
726     */
727    @Override
728    public Iterator<Bucket<V>> iterator() {
729        initBuckets();
730        return buckets.iterator();
731    }
732
733    /**
734     * Creates an index, and buckets and sorts the list of records into the index.
735     */
736    private void initBuckets() {
737        if (buckets != null) {
738            return;
739        }
740        buckets = createBucketList();
741        if (inputList == null || inputList.isEmpty()) {
742            return;
743        }
744
745        // Sort the records by name.
746        // Stable sort preserves input order of collation duplicates.
747        Collections.sort(inputList, recordComparator);
748
749        // Now, we traverse all of the input, which is now sorted.
750        // If the item doesn't go in the current bucket, we find the next bucket that contains it.
751        // This makes the process order n*log(n), since we just sort the list and then do a linear process.
752        // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
753        // we need to improve it for that case.
754
755        Iterator<Bucket<V>> bucketIterator = buckets.fullIterator();
756        Bucket<V> currentBucket = bucketIterator.next();
757        Bucket<V> nextBucket;
758        String upperBoundary;
759        if (bucketIterator.hasNext()) {
760            nextBucket = bucketIterator.next();
761            upperBoundary = nextBucket.lowerBoundary;
762        } else {
763            nextBucket = null;
764            upperBoundary = null;
765        }
766        for (Record<V> r : inputList) {
767            // if the current bucket isn't the right one, find the one that is
768            // We have a special flag for the last bucket so that we don't look any further
769            while (upperBoundary != null &&
770                    collatorPrimaryOnly.compare(r.name, upperBoundary) >= 0) {
771                currentBucket = nextBucket;
772                // now reset the boundary that we compare against
773                if (bucketIterator.hasNext()) {
774                    nextBucket = bucketIterator.next();
775                    upperBoundary = nextBucket.lowerBoundary;
776                } else {
777                    upperBoundary = null;
778                }
779            }
780            // now put the record into the bucket.
781            Bucket<V> bucket = currentBucket;
782            if (bucket.displayBucket != null) {
783                bucket = bucket.displayBucket;
784            }
785            if (bucket.records == null) {
786                bucket.records = new ArrayList<Record<V>>();
787            }
788            bucket.records.add(r);
789        }
790    }
791
792    private int maxLabelCount = 99;
793
794    /**
795     * Returns true if one index character string is "better" than the other.
796     * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
797     * better, and otherwise binary-less-than is better.
798     */
799    private static boolean isOneLabelBetterThanOther(Normalizer2 nfkdNormalizer, String one, String other) {
800        // This is called with primary-equal strings, but never with one.equals(other).
801        String n1 = nfkdNormalizer.normalize(one);
802        String n2 = nfkdNormalizer.normalize(other);
803        int result = n1.codePointCount(0, n1.length()) - n2.codePointCount(0, n2.length());
804        if (result != 0) {
805            return result < 0;
806        }
807        result = binaryCmp.compare(n1, n2);
808        if (result != 0) {
809            return result < 0;
810        }
811        return binaryCmp.compare(one, other) < 0;
812    }
813
814    /**
815     * A (name, data) pair, to be sorted by name into one of the index buckets.
816     * The user data is not used by the index implementation.
817     */
818    public static class Record<V> {
819        private final CharSequence name;
820        private final V data;
821
822        private Record(CharSequence name, V data) {
823            this.name = name;
824            this.data = data;
825        }
826
827        /**
828         * Get the name
829         *
830         * @return the name
831         */
832        public CharSequence getName() {
833            return name;
834        }
835
836        /**
837         * Get the data
838         *
839         * @return the data
840         */
841        public V getData() {
842            return data;
843        }
844
845        /**
846         * Standard toString()
847         */
848        @Override
849        public String toString() {
850            return name + "=" + data;
851        }
852    }
853
854    /**
855     * An index "bucket" with a label string and type.
856     * It is referenced by {@link AlphabeticIndex#getBucketIndex(CharSequence)}
857     * and {@link AlphabeticIndex.ImmutableIndex#getBucketIndex(CharSequence)},
858     * returned by {@link AlphabeticIndex.ImmutableIndex#getBucket(int)},
859     * and {@link AlphabeticIndex#addRecord(CharSequence, Object)} adds a record
860     * into a bucket according to the record's name.
861     *
862     * @param <V>
863     *            Data type
864     */
865    public static class Bucket<V> implements Iterable<Record<V>> {
866        private final String label;
867        private final String lowerBoundary;
868        private final LabelType labelType;
869        private Bucket<V> displayBucket;
870        private int displayIndex;
871        private List<Record<V>> records;
872
873        /**
874         * Type of the label
875         */
876        public enum LabelType {
877            /**
878             * Normal
879             */
880            NORMAL,
881            /**
882             * Underflow (before the first)
883             */
884            UNDERFLOW,
885            /**
886             * Inflow (between scripts)
887             */
888            INFLOW,
889            /**
890             * Overflow (after the last)
891             */
892            OVERFLOW
893        }
894
895        /**
896         * Set up the bucket.
897         *
898         * @param label
899         *            label for the bucket
900         * @param labelType
901         *            is an underflow, overflow, or inflow bucket
902         */
903        private Bucket(String label, String lowerBoundary, LabelType labelType) {
904            this.label = label;
905            this.lowerBoundary = lowerBoundary;
906            this.labelType = labelType;
907        }
908
909        /**
910         * Get the label
911         *
912         * @return label for the bucket
913         */
914        public String getLabel() {
915            return label;
916        }
917
918        /**
919         * Is a normal, underflow, overflow, or inflow bucket
920         *
921         * @return is an underflow, overflow, or inflow bucket
922         */
923        public LabelType getLabelType() {
924            return labelType;
925        }
926
927        /**
928         * Get the number of records in the bucket.
929         *
930         * @return number of records in bucket
931         */
932        public int size() {
933            return records == null ? 0 : records.size();
934        }
935
936        /**
937         * Iterator over the records in the bucket
938         */
939        @Override
940        public Iterator<Record<V>> iterator() {
941            if (records == null) {
942                return Collections.<Record<V>>emptyList().iterator();
943            }
944            return records.iterator();
945        }
946
947        /**
948         * Standard toString()
949         */
950        @Override
951        public String toString() {
952            return "{" +
953            "labelType=" + labelType
954            + ", " +
955            "lowerBoundary=" + lowerBoundary
956            + ", " +
957            "label=" + label
958            + "}"
959            ;
960        }
961    }
962
963    private BucketList<V> createBucketList() {
964        // Initialize indexCharacters.
965        List<String> indexCharacters = initLabels();
966
967        // Variables for hasMultiplePrimaryWeights().
968        long variableTop;
969        if (collatorPrimaryOnly.isAlternateHandlingShifted()) {
970            variableTop = collatorPrimaryOnly.getVariableTop() & 0xffffffffL;
971        } else {
972            variableTop = 0;
973        }
974        boolean hasInvisibleBuckets = false;
975
976        // Helper arrays for Chinese Pinyin collation.
977        @SuppressWarnings({ "rawtypes", "unchecked" })
978        Bucket<V>[] asciiBuckets = new Bucket[26];
979        @SuppressWarnings({ "rawtypes", "unchecked" })
980        Bucket<V>[] pinyinBuckets = new Bucket[26];
981        boolean hasPinyin = false;
982
983        ArrayList<Bucket<V>> bucketList = new ArrayList<Bucket<V>>();
984
985        // underflow bucket
986        bucketList.add(new Bucket<V>(getUnderflowLabel(), "", LabelType.UNDERFLOW));
987
988        // fix up the list, adding underflow, additions, overflow
989        // Insert inflow labels as needed.
990        int scriptIndex = -1;
991        String scriptUpperBoundary = "";
992        for (String current : indexCharacters) {
993            if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) >= 0) {
994                // We crossed the script boundary into a new script.
995                String inflowBoundary = scriptUpperBoundary;
996                boolean skippedScript = false;
997                for (;;) {
998                    scriptUpperBoundary = firstCharsInScripts.get(++scriptIndex);
999                    if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) < 0) {
1000                        break;
1001                    }
1002                    skippedScript = true;
1003                }
1004                if (skippedScript && bucketList.size() > 1) {
1005                    // We are skipping one or more scripts,
1006                    // and we are not just getting out of the underflow label.
1007                    bucketList.add(new Bucket<V>(getInflowLabel(), inflowBoundary,
1008                            LabelType.INFLOW));
1009                }
1010            }
1011            // Add a bucket with the current label.
1012            Bucket<V> bucket = new Bucket<V>(fixLabel(current), current, LabelType.NORMAL);
1013            bucketList.add(bucket);
1014            // Remember ASCII and Pinyin buckets for Pinyin redirects.
1015            char c;
1016            if (current.length() == 1 && 'A' <= (c = current.charAt(0)) && c <= 'Z') {
1017                asciiBuckets[c - 'A'] = bucket;
1018            } else if (current.length() == BASE.length() + 1 && current.startsWith(BASE) &&
1019                    'A' <= (c = current.charAt(BASE.length())) && c <= 'Z') {
1020                pinyinBuckets[c - 'A'] = bucket;
1021                hasPinyin = true;
1022            }
1023            // Check for multiple primary weights.
1024            if (!current.startsWith(BASE) &&
1025                    hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, current) &&
1026                    !current.endsWith("\uffff")) {
1027                // "Æ" or "Sch" etc.
1028                for (int i = bucketList.size() - 2;; --i) {
1029                    Bucket<V> singleBucket = bucketList.get(i);
1030                    if (singleBucket.labelType != LabelType.NORMAL) {
1031                        // There is no single-character bucket since the last
1032                        // underflow or inflow label.
1033                        break;
1034                    }
1035                    if (singleBucket.displayBucket == null &&
1036                            !hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, singleBucket.lowerBoundary)) {
1037                        // Add an invisible bucket that redirects strings greater than the expansion
1038                        // to the previous single-character bucket.
1039                        // For example, after ... Q R S Sch we add Sch\uFFFF->S
1040                        // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
1041                        bucket = new Bucket<V>("", current + "\uFFFF", LabelType.NORMAL);
1042                        bucket.displayBucket = singleBucket;
1043                        bucketList.add(bucket);
1044                        hasInvisibleBuckets = true;
1045                        break;
1046                    }
1047                }
1048            }
1049        }
1050        if (bucketList.size() == 1) {
1051            // No real labels, show only the underflow label.
1052            return new BucketList<V>(bucketList, bucketList);
1053        }
1054        // overflow bucket
1055        bucketList.add(new Bucket<V>(getOverflowLabel(), scriptUpperBoundary, LabelType.OVERFLOW)); // final
1056
1057        if (hasPinyin) {
1058            // Redirect Pinyin buckets.
1059            Bucket<V> asciiBucket = null;
1060            for (int i = 0; i < 26; ++i) {
1061                if (asciiBuckets[i] != null) {
1062                    asciiBucket = asciiBuckets[i];
1063                }
1064                if (pinyinBuckets[i] != null && asciiBucket != null) {
1065                    pinyinBuckets[i].displayBucket = asciiBucket;
1066                    hasInvisibleBuckets = true;
1067                }
1068            }
1069        }
1070
1071        if (!hasInvisibleBuckets) {
1072            return new BucketList<V>(bucketList, bucketList);
1073        }
1074        // Merge inflow buckets that are visually adjacent.
1075        // Iterate backwards: Merge inflow into overflow rather than the other way around.
1076        int i = bucketList.size() - 1;
1077        Bucket<V> nextBucket = bucketList.get(i);
1078        while (--i > 0) {
1079            Bucket<V> bucket = bucketList.get(i);
1080            if (bucket.displayBucket != null) {
1081                continue;  // skip invisible buckets
1082            }
1083            if (bucket.labelType == LabelType.INFLOW) {
1084                if (nextBucket.labelType != LabelType.NORMAL) {
1085                    bucket.displayBucket = nextBucket;
1086                    continue;
1087                }
1088            }
1089            nextBucket = bucket;
1090        }
1091
1092        ArrayList<Bucket<V>> publicBucketList = new ArrayList<Bucket<V>>();
1093        for (Bucket<V> bucket : bucketList) {
1094            if (bucket.displayBucket == null) {
1095                publicBucketList.add(bucket);
1096            }
1097        }
1098        return new BucketList<V>(bucketList, publicBucketList);
1099    }
1100
1101    private static class BucketList<V> implements Iterable<Bucket<V>> {
1102        private final ArrayList<Bucket<V>> bucketList;
1103        private final List<Bucket<V>> immutableVisibleList;
1104
1105        private BucketList(ArrayList<Bucket<V>> bucketList, ArrayList<Bucket<V>> publicBucketList) {
1106            this.bucketList = bucketList;
1107
1108            int displayIndex = 0;
1109            for (Bucket<V> bucket : publicBucketList) {
1110                bucket.displayIndex = displayIndex++;
1111            }
1112            immutableVisibleList = Collections.unmodifiableList(publicBucketList);
1113        }
1114
1115        private int getBucketCount() {
1116            return immutableVisibleList.size();
1117        }
1118
1119        private int getBucketIndex(CharSequence name, Collator collatorPrimaryOnly) {
1120            // binary search
1121            int start = 0;
1122            int limit = bucketList.size();
1123            while ((start + 1) < limit) {
1124                int i = (start + limit) / 2;
1125                Bucket<V> bucket = bucketList.get(i);
1126                int nameVsBucket = collatorPrimaryOnly.compare(name, bucket.lowerBoundary);
1127                if (nameVsBucket < 0) {
1128                    limit = i;
1129                } else {
1130                    start = i;
1131                }
1132            }
1133            Bucket<V> bucket = bucketList.get(start);
1134            if (bucket.displayBucket != null) {
1135                bucket = bucket.displayBucket;
1136            }
1137            return bucket.displayIndex;
1138        }
1139
1140        /**
1141         * Private iterator over all the buckets, visible and invisible
1142         */
1143        private Iterator<Bucket<V>> fullIterator() {
1144            return bucketList.iterator();
1145        }
1146
1147        /**
1148         * Iterator over just the visible buckets.
1149         */
1150        @Override
1151        public Iterator<Bucket<V>> iterator() {
1152            return immutableVisibleList.iterator(); // use immutable list to prevent remove().
1153        }
1154    }
1155
1156    private static boolean hasMultiplePrimaryWeights(
1157            RuleBasedCollator coll, long variableTop, String s) {
1158        long[] ces = coll.internalGetCEs(s);
1159        boolean seenPrimary = false;
1160        for (int i = 0; i < ces.length; ++i) {
1161            long ce = ces[i];
1162            long p = ce >>> 32;
1163            if (p > variableTop) {
1164                // not primary ignorable
1165                if (seenPrimary) {
1166                    return true;
1167                }
1168                seenPrimary = true;
1169            }
1170        }
1171        return false;
1172    }
1173
1174    // TODO: Surely we have at least a ticket for porting these mask values to UCharacter.java?!
1175    private static final int GC_LU_MASK = 1 << UCharacter.UPPERCASE_LETTER;
1176    private static final int GC_LL_MASK = 1 << UCharacter.LOWERCASE_LETTER;
1177    private static final int GC_LT_MASK = 1 << UCharacter.TITLECASE_LETTER;
1178    private static final int GC_LM_MASK = 1 << UCharacter.MODIFIER_LETTER;
1179    private static final int GC_LO_MASK = 1 << UCharacter.OTHER_LETTER;
1180    private static final int GC_L_MASK =
1181            GC_LU_MASK|GC_LL_MASK|GC_LT_MASK|GC_LM_MASK|GC_LO_MASK;
1182    private static final int GC_CN_MASK = 1 << UCharacter.GENERAL_OTHER_TYPES;
1183
1184    /**
1185     * Return a list of the first character in each script. Only exposed for testing.
1186     *
1187     * @return list of first characters in each script
1188     * @deprecated This API is ICU internal, only for testing.
1189     * @hide original deprecated declaration
1190     * @hide draft / provisional / internal are hidden on Android
1191     */
1192    @Deprecated
1193    public List<String> getFirstCharactersInScripts() {
1194        List<String> dest = new ArrayList<String>(200);
1195        // Fetch the script-first-primary contractions which are defined in the root collator.
1196        // They all start with U+FDD1.
1197        UnicodeSet set = new UnicodeSet();
1198        collatorPrimaryOnly.internalAddContractions(0xFDD1, set);
1199        if (set.isEmpty()) {
1200            throw new UnsupportedOperationException(
1201                    "AlphabeticIndex requires script-first-primary contractions");
1202        }
1203        for (String boundary : set) {
1204            int gcMask = 1 << UCharacter.getType(boundary.codePointAt(1));
1205            if ((gcMask & (GC_L_MASK | GC_CN_MASK)) == 0) {
1206                // Ignore boundaries for the special reordering groups.
1207                // Take only those for "real scripts" (where the sample character is a Letter,
1208                // and the one for unassigned implicit weights (Cn).
1209                continue;
1210            }
1211            dest.add(boundary);
1212        }
1213        return dest;
1214    }
1215}
1216