1/*
2*******************************************************************************
3*
4*   Copyright (C) 2011 International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*/
9
10#ifndef INDEXCHARS_H
11#define INDEXCHARS_H
12
13#include "unicode/utypes.h"
14#include "unicode/uobject.h"
15#include "unicode/locid.h"
16
17/**
18 * \file
19 * \brief C++ API: Index Characters
20 */
21
22
23U_CDECL_BEGIN
24
25/**
26 * Constants for Alphabetic Index Label Types.
27 * The form of these enum constants anticipates having a plain C API
28 * for Alphabetic Indexes that will also use them.
29 * @draft ICU 4.8
30 */
31typedef enum UAlphabeticIndexLabelType {
32         /**
33          *  Normal Label, typically the starting letter of the names
34          *  in the bucket with this label.
35          * @draft ICU 4.8
36          */
37         U_ALPHAINDEX_NORMAL    = 0,
38
39         /**
40          * Underflow Label.  The bucket with this label contains names
41          * in scripts that sort before any of the bucket labels in this index.
42          * @draft ICU 4.8
43          */
44         U_ALPHAINDEX_UNDERFLOW = 1,
45
46         /**
47          * Inflow Label.  The bucket with this label contains names
48          * in scripts that sort between two of the bucket labels in this index.
49          * Inflow labels are created when an index contains normal labels for
50          * multiple scripts, and skips other scripts that sort between some of the
51          * included scripts.
52          * @draft ICU 4.8
53          */
54         U_ALPHAINDEX_INFLOW    = 2,
55
56         /**
57          * Overflow Label. The bucket with this label contains names in scripts
58          * that sort after all of the bucket labels in this index.
59          * @draft ICU 4.8
60          */
61         U_ALPHAINDEX_OVERFLOW  = 3
62     } UAlphabeticIndexLabelType;
63
64
65struct UHashtable;
66U_CDECL_END
67
68U_NAMESPACE_BEGIN
69
70// Forward Declarations
71
72class Collator;
73class RuleBasedCollator;
74class StringEnumeration;
75class UnicodeSet;
76class UVector;
77
78
79
80/**
81 * class AlphabeticIndex supports the creation of a UI index appropriate for a given language, such as:
82 *
83 * <pre>
84 *  <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z \\u00C6 \\u00D8 \\u00C5 ...</b>
85 *
86 *  <b>A</b>
87 *     Addison
88 *     Albertson
89 *     Azensky
90 *  <b>B</b>
91 *     Baker
92 *  ...
93 * </pre>
94 *
95 * The class can generate a list of labels for use as a UI "index", that is, a list of
96 * clickable characters (or character sequences) that allow the user to see a segment
97 * (bucket) of a larger "target" list. That is, each label corresponds to a bucket in
98 * the target list, where everything in the bucket is greater than or equal to the character
99 * (according to the locale's collation). Strings can be added to the index;
100 * they will be in sorted order in the right bucket.
101 * <p>
102 * The class also supports having buckets for strings before the first (underflow),
103 * after the last (overflow), and between scripts (inflow). For example, if the index
104 * is constructed with labels for Russian and English, Greek characters would fall
105 * into an inflow bucket between the other two scripts.
106 * <p>
107 * The AlphabeticIndex class is not intended for public subclassing.
108 * <p>
109 * <i>Example</i>
110 * <p>
111 * The "show..." methods below are just to illustrate usage.
112 *
113 * <pre>
114 * // Create a simple index.  "Item" is assumed to be an application
115 * // defined type that the application's UI and other processing knows about,
116 * //  and that has a name.
117 *
118 * UErrorCode status = U_ZERO_ERROR;
119 * AlphabeticIndex *index = new AlphabeticIndex(desiredLocale, status);
120 * index->addLabels(additionalLocale, status);
121 * for (Item *item in some source of Items ) {
122 *     index->addRecord(item->name(), item, status);
123 * }
124 * ...
125 * // Show index at top. We could skip or gray out empty buckets
126 *
127 * while (index->nextBucket(status)) {
128 *     if (showAll || index->getBucketRecordCount() != 0) {
129 *         showLabelAtTop(UI, index->getBucketLabel());
130 *     }
131 * }
132 *  ...
133 * // Show the buckets with their contents, skipping empty buckets
134 *
135 * index->resetBucketIterator(status);
136 * while (index->nextBucket(status)) {
137 *    if (index->getBucketRecordCount() != 0) {
138 *        showLabelInList(UI, index->getBucketLabel());
139 *        while (index->nextRecord(status)) {
140 *            showIndexedItem(UI, static_cast<const Item *>(index->getRecordData()));
 *        }
 *    }
 * }
141 * </pre>
142 *
143 * The caller can build different UIs using this class.
144 * For example, an index character could be omitted or grayed-out
145 * if its bucket is empty. Small buckets could also be combined based on size, such as:
146 *
147 * <pre>
148 * <b>... A-F G-N O-Z ...</b>
149 * </pre>
150 *
151 * <p>
152 * <b>Notes:</b>
153 * <ul>
154 * <li>Additional collation parameters can be passed in as part of the locale name.
155 *     For example, German plus numeric
156 *     sorting would be "de@kn-true".
157 * </ul>
158 *
159 * @draft ICU 4.8 This API might change or be removed in a future release.
160 */
161
162
163class U_I18N_API AlphabeticIndex: public UObject {
164
165  public:
166
167    /**
168     * Construct an AlphabeticIndex object for the specified locale.  If the locale's
169     * data does not include index characters, a set of them will be
170     * synthesized based on the locale's exemplar characters.  The locale
171     * determines the sorting order for both the index characters and the
172     * user item names appearing under each Index character.
173     *
174     * @param locale the desired locale.
175     * @param status Error code, will be set with the reason if the construction
176     *               of the AlphabeticIndex object fails.
177     * @draft ICU 4.8
178     */
179     AlphabeticIndex(const Locale &locale, UErrorCode &status);
180
181
182
183    /**
184     * Add Labels to this Index.  The labels are additions to those
185     * that are already in the index; they do not replace the existing
186     * ones.
187     * @param additions The additional characters to add to the index, such as A-Z.
188     * @param status Error code, will be set with the reason if the
189     *               operation fails.
190     * @return this, for chaining
191     * @draft ICU 4.8
192     */
193     virtual AlphabeticIndex &addLabels(const UnicodeSet &additions, UErrorCode &status);
194
195    /**
196     * Add the index characters from a Locale to the index.  The labels
197     * are added to those that are already in the index; they do not replace the
198     * existing index characters.  The collation order for this index is not
199     * changed; it remains that of the locale that was originally specified
200     * when creating this Index.
201     *
202     * @param locale The locale whose index characters are to be added.
203     * @param status Error code, will be set with the reason if the
204     *               operation fails.
205     * @return this, for chaining
206     * @draft ICU 4.8
207     */
208     virtual AlphabeticIndex &addLabels(const Locale &locale, UErrorCode &status);
209
210     /**
211      * Destructor
212      * @draft ICU 4.8
213      */
214     virtual ~AlphabeticIndex();
215
216
217    /**
218     * Get the Collator that establishes the ordering of the items in this index.
219     * Ownership of the collator remains with the AlphabeticIndex instance.
220     *
221     * The returned collator is a reference to the internal collator used by this
222     * index.  It may be safely used to compare the names of items or to get
223     * sort keys for names.  However if any settings need to be changed,
224     * or other non-const methods called, a cloned copy must be made first.
225     *
226     * @return The collator
227     * @draft ICU 4.8
228     */
229    virtual const RuleBasedCollator &getCollator() const;
230
231
232   /**
233     * Get the default label used for abbreviated buckets <i>between</i> other index characters.
234     * For example, consider the labels when Latin and Greek are used:
235     *     X Y Z ... &#x0391; &#x0392; &#x0393;.
236     *
237     * @return inflow label
238     * @draft ICU 4.8
239     */
240    virtual const UnicodeString &getInflowLabel() const;
241
242   /**
243     * Set the default label used for abbreviated buckets <i>between</i> other index characters.
244     * An inflow label will be automatically inserted if two otherwise-adjacent label characters
245     * are from different scripts, e.g. Latin and Cyrillic, and a third script, e.g. Greek,
246     * sorts between the two.  The default inflow character is an ellipsis (...)
247     *
248     * @param inflowLabel the new Inflow label.
249     * @param status Error code, will be set with the reason if the operation fails.
250     * @return this
251     * @draft ICU 4.8
252     */
253    virtual AlphabeticIndex &setInflowLabel(const UnicodeString &inflowLabel, UErrorCode &status);
254
255
256
257   /**
258     * Get the special label used for items that sort after the last normal label,
259     * and that would not otherwise have an appropriate label.
260     *
261     * @return the overflow label
262     * @draft ICU 4.8
263     */
264    virtual const UnicodeString &getOverflowLabel() const;
265
266
267   /**
268     * Set the label used for items that sort after the last normal label,
269     * and that would not otherwise have an appropriate label.
270     *
271     * @param overflowLabel the new overflow label.
272     * @param status Error code, will be set with the reason if the operation fails.
273     * @return this
274     * @draft ICU 4.8
275     */
276    virtual AlphabeticIndex &setOverflowLabel(const UnicodeString &overflowLabel, UErrorCode &status);
277
278   /**
279     * Get the special label used for items that sort before the first normal label,
280     * and that would not otherwise have an appropriate label.
281     *
282     * @return underflow label
283     * @draft ICU 4.8
284     */
285    virtual const UnicodeString &getUnderflowLabel() const;
286
287   /**
288     * Set the label used for items that sort before the first normal label,
289     * and that would not otherwise have an appropriate label.
290     *
291     * @param underflowLabel the new underflow label.
292     * @param status Error code, will be set with the reason if the operation fails.
293     * @return this
294     * @draft ICU 4.8
295     */
296    virtual AlphabeticIndex &setUnderflowLabel(const UnicodeString &underflowLabel, UErrorCode &status);
297
298
299    /**
300     * Get the limit on the number of labels permitted in the index.
301     * The number does not include over, under and inflow labels.
302     *
303     * @return the maximum number of labels permitted.
304     * @draft ICU 4.8
305     */
306    virtual int32_t getMaxLabelCount() const;
307
308    /**
309     * Set a limit on the number of labels permitted in the index.
310     * The number does not include over, under and inflow labels.
311     * Currently, if the number is exceeded, then every
312     * nth item is removed to bring the count down.
313     * A more sophisticated mechanism may be available in the future.
314     *
315     * @param maxLabelCount the maximum number of labels.
316     * @param status error code
317     * @return This, for chaining
318     * @draft ICU 4.8
319     */
320    virtual AlphabeticIndex &setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status);
321
322
323    /**
324     * Get the Unicode character (or tailored string) that defines an overflow bucket;
325     * that is anything greater than or equal to that string should go in that bucket,
326     * instead of with the last character. Normally that is the first character of the script
327     * after lowerLimit. Thus in X Y Z ... <i>Devanagari-ka</i>, the overflow character for Z
328     * would be the <i>Greek-alpha</i>.
329     *
330     * @param lowerLimit   The character below the overflow (or inflow) bucket
331     * @param status error code
332     * @return string that defines top of the overflow bucket for lowerLimit, or an empty string if there is none
333     * @internal
334     */
335    virtual const UnicodeString &getOverflowComparisonString(const UnicodeString &lowerLimit,
336                                                             UErrorCode &status);
337
338
339    /**
340     * Add a record to the index.  Each record will be associated with an index Bucket
341     *  based on the record's name.  The list of records for each bucket will be sorted
342     *  based on the collation ordering of the names in the index's locale.
343     *  Records with duplicate names are permitted; they will be kept in the order
344     *  that they were added.
345     *
346     * @param name The display name for the Record.  The Record will be placed in
347     *             a bucket based on this name.
348     * @param data An optional pointer to user data associated with this
349     *             item.  When iterating the contents of a bucket, both the
350     *             data pointer and the name will be available for each Record.
351     * @param status  Error code, will be set with the reason if the operation fails.
352     * @return        This, for chaining.
353     * @draft ICU 4.8
354     */
355    virtual AlphabeticIndex &addRecord(const UnicodeString &name, const void *data, UErrorCode &status);
356
357    /**
358     * Remove all Records from the Index.  The set of Buckets, which define the headings under
359     * which records are classified, is not altered.
360     *
361     * @param status  Error code, will be set with the reason if the operation fails.
362     * @return        This, for chaining.
363     * @draft ICU 4.8
364     */
365    virtual AlphabeticIndex &clearRecords(UErrorCode &status);
366
367
368    /**  Get the number of labels in this index.
369     *      Note: may trigger lazy index construction.
370     *
371     * @param status  Error code, will be set with the reason if the operation fails.
372     * @return        The number of labels in this index, including any under, over or
373     *                in-flow labels.
374     * @draft ICU 4.8
375     */
376    virtual int32_t  getBucketCount(UErrorCode &status);
377
378
379    /**  Get the total number of Records in this index, that is, the number
380     *   of <name, data> pairs added.
381     *
382     * @param status  Error code, will be set with the reason if the operation fails.
383     * @return        The number of records in this index, that is, the total number
384     *                of (name, data) items added with addRecord().
385     * @draft ICU 4.8
386     */
387    virtual int32_t  getRecordCount(UErrorCode &status);
388
389
390
391    /**
392     *   Given the name of a record, return the zero-based index of the Bucket
393     *   in which the item should appear.  The name need not be in the index.
394     *   A Record will not be added to the index by this function.
395     *   Bucket numbers are zero-based, in Bucket iteration order.
396     *
397     * @param itemName  The name whose bucket position in the index is to be determined.
398     * @param status  Error code, will be set with the reason if the operation fails.
399     * @return The bucket number for this name.
400     * @draft ICU 4.8
401     *
402     */
403    virtual int32_t  getBucketIndex(const UnicodeString &itemName, UErrorCode &status);
404
405
406    /**
407     *   Get the zero based index of the current Bucket from an iteration
408     *   over the Buckets of this index.  Return -1 if no iteration is in progress.
409     *   @return  the index of the current Bucket
410     *   @draft ICU 4.8
411     */
412    virtual int32_t  getBucketIndex() const;
413
414
415    /**
416     *   Advance the iteration over the Buckets of this index.  Return FALSE if
417     *   there are no more Buckets.
418     *
419     *   @param status  Error code, will be set with the reason if the operation fails.
420     *   U_ENUM_OUT_OF_SYNC_ERROR will be reported if the index is modified while
421     *   an enumeration of its contents is in progress.
422     *
423     *   @return TRUE if success, FALSE if at end of iteration
424     *   @draft ICU 4.8
425     */
426    virtual UBool nextBucket(UErrorCode &status);
427
428    /**
429     *   Return the name of the Label of the current bucket from an iteration over the buckets.
430     *   If the iteration is before the first Bucket (nextBucket() has not been called),
431     *   or after the last, return an empty string.
432     *
433     *   @return the bucket label.
434     *   @draft ICU 4.8
435     */
436    virtual const UnicodeString &getBucketLabel() const;
437
438    /**
439     *  Return the type of the label for the current Bucket (selected by the
440     *  iteration over Buckets.)
441     *
442     * @return the label type.
443     * @draft ICU 4.8
444     */
445    virtual UAlphabeticIndexLabelType getBucketLabelType() const;
446
447    /**
448      * Get the number of <name, data> Records in the current Bucket.
449      * If the current bucket iteration position is before the first label or after the
450      * last, return 0.
451      *
452      *  @return the number of Records.
453      *  @draft ICU 4.8
454      */
455    virtual int32_t getBucketRecordCount() const;
456
457
458    /**
459     *  Reset the Bucket iteration for this index.  The next call to nextBucket()
460     *  will restart the iteration at the first label.
461     *
462     * @param status  Error code, will be set with the reason if the operation fails.
463     * @return        this, for chaining.
464     * @draft ICU 4.8
465     */
466    virtual AlphabeticIndex &resetBucketIterator(UErrorCode &status);
467
468    /**
469     * Advance to the next record in the current Bucket.
470     * When nextBucket() is called, Record iteration is reset to just before the
471     * first Record in the new Bucket.
472     *
473     *   @param status  Error code, will be set with the reason if the operation fails.
474     *   U_ENUM_OUT_OF_SYNC_ERROR will be reported if the index is modified while
475     *   an enumeration of its contents is in progress.
476     *   @return TRUE if successful, FALSE when the iteration advances past the last item.
477     *   @draft ICU 4.8
478     */
479    virtual UBool nextRecord(UErrorCode &status);
480
481    /**
482     * Get the name of the current Record.
483     * Return an empty string if the Record iteration position is before first
484     * or after the last.
485     *
486     *  @return The name of the current index item.
487     *  @draft ICU 4.8
488     */
489    virtual const UnicodeString &getRecordName() const;
490
491
492    /**
493     * Return the data pointer of the Record currently being iterated over.
494     * Return NULL if the current iteration position is before the first item in this Bucket,
495     * or after the last.
496     *
497     *  @return The current Record's data pointer.
498     *  @draft ICU 4.8
499     */
500    virtual const void *getRecordData() const;
501
502
503    /**
504     * Reset the Record iterator position to before the first Record in the current Bucket.
505     *
506     *  @return This, for chaining.
507     *  @draft ICU 4.8
508     */
509    virtual AlphabeticIndex &resetRecordIterator();
510
511private:
512    // No ICU "poor man's RTTI" for this class nor its subclasses.
513    virtual UClassID getDynamicClassID() const;
514
515     /**
516      * No Copy constructor.
517      * @internal
518      */
519     AlphabeticIndex(const AlphabeticIndex &other);
520
521     /**
522      *   No assignment.
523      */
524     AlphabeticIndex &operator =(const AlphabeticIndex & /*other*/) { return *this;};
525
526    /**
527     * No Equality operators.
528     * @internal
529     */
530     virtual UBool operator==(const AlphabeticIndex& other) const;
531
532    /**
533     * No Inequality operator (declared private, like operator==).
534     * @internal
535     */
536     virtual UBool operator!=(const AlphabeticIndex& other) const;
537
538     // Common initialization, for use from all constructors.
539     void init(UErrorCode &status);
540
541     // Initialize & destruct static constants used by this class.
542     static void staticInit(UErrorCode &status);
543
544     // Pinyin stuff.  If the input name is Chinese, add the Pinyin prefix to the dest string.
545     void hackName(UnicodeString &dest, const UnicodeString &name, const Collator *coll);
546     void initPinyinBounds(const Collator *coll, UErrorCode &status);
547
548   public:
549     /**
550      *   Delete all shared (static) data associated with an AlphabeticIndex.
551      *   Internal function, not intended for direct use.
552      *   @internal
553      */
554     static void staticCleanup();
555   private:
556
557     // Add index characters from the specified locale to the dest set.
558     // Does not remove any previous contents from dest.
559     static void getIndexExemplars(UnicodeSet &dest, const Locale &locale, UErrorCode &status);
560
561     UVector *firstStringsInScript(UErrorCode &status);
562
563     static UnicodeString separated(const UnicodeString &item);
564
565     static UnicodeSet *getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status);
566
567     void buildIndex(UErrorCode &status);
568     void buildBucketList(UErrorCode &status);
569     void bucketRecords(UErrorCode &status);
570
571
572  public:
573
574    //  The following internal items are declared public only to allow access from
575    //  implementation code written in plain C.  They are not intended for
576    //  public use.
577
578    /**
579     * A record, or item, in the index.
580     * @internal
581     */
582     struct Record: public UMemory {
583         AlphabeticIndex     *alphaIndex_;
584         const UnicodeString  name_;
585         UnicodeString        sortingName_;  // Usually the same as name_; different for Pinyin.
586         const void           *data_;
587         int32_t              serialNumber_;  // Defines sorting order for names that compare equal.
588         Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data);
589         ~Record();
590     };
591
592     /**
593       * Holds all user records before they are distributed into buckets.
594       * Type of contents is (Record *)
595       * @internal
596       */
597     UVector  *inputRecords_;
598
599     /**
600      * A Bucket holds an index label and references to everything belonging to that label.
601      * For implementation use only.  Declared public because pure C implementation code needs access.
602      * @internal
603      */
604     struct Bucket: public UMemory {
605         UnicodeString     label_;
606         UnicodeString     lowerBoundary_;
607         UAlphabeticIndexLabelType labelType_;
608         UVector           *records_; // Records are owned by inputRecords_ vector.
609
610         Bucket(const UnicodeString &label,   // Parameter strings are copied.
611                const UnicodeString &lowerBoundary,
612                UAlphabeticIndexLabelType type, UErrorCode &status);
613         ~Bucket();
614     };
615
616  public:
617
618    /**
619      * Language Types.  For internal ICU use only.
620      * @internal
621      */
622    enum ELangType {
623        /** @internal */
624        kNormal,
625        /** @internal */
626        kSimplified,
627        /** @internal */
628        kTraditional
629    };
630
631    /**
632      * Get the Language Type for this Index.  Based on the locale.
633      * @internal
634      */
635    static ELangType  langTypeFromLocale(const Locale &loc);
636
637
638   private:
639
640     // Holds the contents of this index, buckets of user items.
641     // UVector elements are of type (Bucket *)
642     UVector *bucketList_;
643
644     int32_t  labelsIterIndex_;      // Index of next item to return.
645     int32_t  itemsIterIndex_;
646     Bucket   *currentBucket_;       // While an iteration of the index is underway,
647                                     //   point to the bucket for the current label.
648                                     // NULL when no iteration underway.
649
650     UBool    indexBuildRequired_;   //  Caller has made changes to the index that
651                                     //  require rebuilding & bucketing before the
652                                     //  contents can be iterated.
653
654     int32_t    maxLabelCount_;      // Limit on # of labels permitted in the index.
655
656     UHashtable *alreadyIn_;         // Key=UnicodeString, value=UnicodeSet
657
658     UnicodeSet *initialLabels_;     // Initial (unprocessed) set of Labels.  Union
659                                     //   of those explicitly set by the user plus
660                                     //   those from locales.  Raw values, before
661                                     //   crunching into bucket labels.
662
663     UVector    *labels_;            // List of Labels, after processing, sorting.
664                                     //   Contents are (UnicodeString *)
665
666     UnicodeSet *noDistinctSorting_; // As the set of labels is built, strings may
667                                     // be discarded from the exemplars. This contains
668                                     // some of the discards, and is
669                                     // intended for debugging.
670
671     UnicodeSet *notAlphabetic_;     // As the set of labels is built, strings may
672                                     // be discarded from the exemplars. This contains
673                                     // some of the discards, and is
674                                     // intended for debugging.
675
676
677     UVector    *firstScriptCharacters_;  // The first character from each script,
678                                          //   in collation order.
679
680     Locale    locale_;
681     Collator  *collator_;
682     Collator  *collatorPrimaryOnly_;
683
684     UnicodeString  inflowLabel_;
685     UnicodeString  overflowLabel_;
686     UnicodeString  underflowLabel_;
687     UnicodeString  overflowComparisonString_;
688
689     ELangType      langType_;        // The language type, simplified Chinese, Traditional Chinese,
690                                      //  or not Chinese (Normal).  Part of the Pinyin support
691
692     typedef const UChar PinyinLookup[24][3];
693     static PinyinLookup   HACK_PINYIN_LOOKUP_SHORT;
694     static PinyinLookup   HACK_PINYIN_LOOKUP_LONG;
695
696     // These will be lazily set to the short or long tables based on which
697     //   Chinese collation has been configured into the ICU library.
698     static PinyinLookup   *HACK_PINYIN_LOOKUP;
699     static const UChar    *PINYIN_LOWER_BOUNDS;
700
701
702
703     int32_t    recordCounter_;         // Counts Records created.  For minting record serial numbers.
704
705// Constants.  Lazily initialized the first time an AlphabeticIndex object is created.
706
707     static UnicodeSet *ALPHABETIC;
708     static UnicodeSet *CORE_LATIN;
709     static UnicodeSet *ETHIOPIC;
710     static UnicodeSet *HANGUL;
711     static UnicodeSet *IGNORE_SCRIPTS;
712     static UnicodeSet *TO_TRY;
713     static UnicodeSet *UNIHAN;
714     static const UnicodeString *EMPTY_STRING;
715
716};
717
718U_NAMESPACE_END
719#endif
720
721