1/*
2******************************************************************************
3* Copyright (C) 1996-2014, International Business Machines Corporation and
4* others. All Rights Reserved.
5******************************************************************************
6*/
7
8/**
9 * \file
10 * \brief C++ API: The RuleBasedCollator class implements the Collator abstract base class.
11 */
12
13/**
14* File tblcoll.h
15*
16* Created by: Helena Shih
17*
18* Modification History:
19*
20*  Date        Name        Description
21*  2/5/97      aliu        Added streamIn and streamOut methods.  Added
22*                          constructor which reads RuleBasedCollator object from
23*                          a binary file.  Added writeToFile method which streams
24*                          RuleBasedCollator out to a binary file.  The streamIn
25*                          and streamOut methods use istream and ostream objects
26*                          in binary mode.
27*  2/12/97     aliu        Modified to use TableCollationData sub-object to
28*                          hold invariant data.
29*  2/13/97     aliu        Moved several methods into this class from Collation.
30*                          Added a private RuleBasedCollator(Locale&) constructor,
31*                          to be used by Collator::createDefault().  General
32*                          clean up.
33*  2/20/97     helena      Added clone, operator==, operator!=, operator=, and copy
34*                          constructor and getDynamicClassID.
35*  3/5/97      aliu        Modified constructFromFile() to add parameter
36*                          specifying whether or not binary loading is to be
37*                          attempted.  This is required for dynamic rule loading.
38* 05/07/97     helena      Added memory allocation error detection.
39*  6/17/97     helena      Added IDENTICAL strength for compare, changed getRules to
40*                          use MergeCollation::getPattern.
41*  6/20/97     helena      Java class name change.
42*  8/18/97     helena      Added internal API documentation.
43* 09/03/97     helena      Added createCollationKeyValues().
44* 02/10/98     damiba      Added compare with "length" parameter
45* 08/05/98     erm         Synched with 1.2 version of RuleBasedCollator.java
46* 04/23/99     stephen     Removed EDecompositionMode, merged with
47*                          Normalizer::EMode
48* 06/14/99     stephen     Removed kResourceBundleSuffix
49* 11/02/99     helena      Collator performance enhancements.  Eliminates the
50*                          UnicodeString construction and special case for NO_OP.
51* 11/23/99     srl         More performance enhancements. Updates to NormalizerIterator
52*                          internal state management.
53* 12/15/99     aliu        Update to support Thai collation.  Move NormalizerIterator
54*                          to implementation file.
55* 01/29/01     synwee      Modified into a C++ wrapper which calls C API
56*                          (ucol.h)
57* 2012-2014    markus      Rewritten in C++ again.
58*/
59
60#ifndef TBLCOLL_H
61#define TBLCOLL_H
62
63#include "unicode/utypes.h"
64
65#if !UCONFIG_NO_COLLATION
66
67#include "unicode/coll.h"
68#include "unicode/locid.h"
69#include "unicode/uiter.h"
70#include "unicode/ucol.h"
71
72U_NAMESPACE_BEGIN
73
74struct CollationData;
75struct CollationSettings;
76struct CollationTailoring;
77/**
78* @stable ICU 2.0
79*/
80class StringSearch;
81/**
82* @stable ICU 2.0
83*/
84class CollationElementIterator;
85class CollationKey;
86class SortKeyByteSink;
87class UnicodeSet;
88class UnicodeString;
89class UVector64;
90
91/**
92 * The RuleBasedCollator class provides the implementation of
93 * Collator, using data-driven tables. The user can create a customized
94 * table-based collation.
95 * <p>
96 * For more information about the collation service see
97 * <a href="http://userguide.icu-project.org/collation">the User Guide</a>.
98 * <p>
99 * Collation service provides correct sorting orders for most locales supported in ICU.
100 * If specific data for a locale is not available, the order eventually falls back
101 * to the <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>.
102 * <p>
103 * Sort ordering may be customized by providing your own set of rules. For more on
104 * this subject see the <a href="http://userguide.icu-project.org/collation/customization">
105 * Collation Customization</a> section of the User Guide.
106 * <p>
107 * Note, RuleBasedCollator is not to be subclassed.
108 * @see        Collator
109 */
110class U_I18N_API RuleBasedCollator : public Collator {
111public:
112    /**
113     * RuleBasedCollator constructor. This takes the table rules and builds a
114     * collation table out of them. Please see RuleBasedCollator class
115     * description for more details on the collation rule syntax.
116     * @param rules the collation rules to build the collation table from.
117     * @param status reporting a success or an error.
118     * @stable ICU 2.0
119     */
120    RuleBasedCollator(const UnicodeString& rules, UErrorCode& status);
121
122    /**
123     * RuleBasedCollator constructor. This takes the table rules and builds a
124     * collation table out of them. Please see RuleBasedCollator class
125     * description for more details on the collation rule syntax.
126     * @param rules the collation rules to build the collation table from.
127     * @param collationStrength strength for comparison
128     * @param status reporting a success or an error.
129     * @stable ICU 2.0
130     */
131    RuleBasedCollator(const UnicodeString& rules,
132                       ECollationStrength collationStrength,
133                       UErrorCode& status);
134
135    /**
136     * RuleBasedCollator constructor. This takes the table rules and builds a
137     * collation table out of them. Please see RuleBasedCollator class
138     * description for more details on the collation rule syntax.
139     * @param rules the collation rules to build the collation table from.
140     * @param decompositionMode the normalisation mode
141     * @param status reporting a success or an error.
142     * @stable ICU 2.0
143     */
144    RuleBasedCollator(const UnicodeString& rules,
145                    UColAttributeValue decompositionMode,
146                    UErrorCode& status);
147
148    /**
149     * RuleBasedCollator constructor. This takes the table rules and builds a
150     * collation table out of them. Please see RuleBasedCollator class
151     * description for more details on the collation rule syntax.
152     * @param rules the collation rules to build the collation table from.
153     * @param collationStrength strength for comparison
154     * @param decompositionMode the normalisation mode
155     * @param status reporting a success or an error.
156     * @stable ICU 2.0
157     */
158    RuleBasedCollator(const UnicodeString& rules,
159                    ECollationStrength collationStrength,
160                    UColAttributeValue decompositionMode,
161                    UErrorCode& status);
162
163#ifndef U_HIDE_INTERNAL_API
164    /**
165     * TODO: document & propose as public API
166     * @internal
167     */
168    RuleBasedCollator(const UnicodeString &rules,
169                      UParseError &parseError, UnicodeString &reason,
170                      UErrorCode &errorCode);
171#endif  /* U_HIDE_INTERNAL_API */
172
173    /**
174     * Copy constructor.
175     * @param other the RuleBasedCollator object to be copied
176     * @stable ICU 2.0
177     */
178    RuleBasedCollator(const RuleBasedCollator& other);
179
180
181    /** Opens a collator from a collator binary image created using
182    *  cloneBinary. Binary image used in instantiation of the
183    *  collator remains owned by the user and should stay around for
184    *  the lifetime of the collator. The API also takes a base collator
185    *  which usually should be the root collator.
186    *  @param bin binary image owned by the user and required through the
187    *             lifetime of the collator
188    *  @param length size of the image. If negative, the API will try to
189    *                figure out the length of the image
190    *  @param base fallback collator, usually root. The base is required to be
191    *              present through the lifetime of the collator. Currently
192    *              it cannot be NULL.
193    *  @param status for catching errors
194    *  @return newly created collator
195    *  @see cloneBinary
196    *  @stable ICU 3.4
197    */
198    RuleBasedCollator(const uint8_t *bin, int32_t length,
199                    const RuleBasedCollator *base,
200                    UErrorCode &status);
201
202    /**
203     * Destructor.
204     * @stable ICU 2.0
205     */
206    virtual ~RuleBasedCollator();
207
208    /**
209     * Assignment operator.
210     * @param other other RuleBasedCollator object to copy from.
211     * @stable ICU 2.0
212     */
213    RuleBasedCollator& operator=(const RuleBasedCollator& other);
214
215    /**
216     * Returns true if argument is the same as this object.
217     * @param other Collator object to be compared.
218     * @return true if the argument is the same as this object.
219     * @stable ICU 2.0
220     */
221    virtual UBool operator==(const Collator& other) const;
222
223    /**
224     * Makes a copy of this object.
225     * @return a copy of this object, owned by the caller
226     * @stable ICU 2.0
227     */
228    virtual Collator* clone(void) const;
229
230    /**
231     * Creates a collation element iterator for the source string. The caller of
232     * this method is responsible for the memory management of the return
233     * pointer.
234     * @param source the string over which the CollationElementIterator will
235     *        iterate.
236     * @return the collation element iterator of the source string using this as
237     *         the base Collator.
238     * @stable ICU 2.2
239     */
240    virtual CollationElementIterator* createCollationElementIterator(
241                                           const UnicodeString& source) const;
242
243    /**
244     * Creates a collation element iterator for the source. The caller of this
245     * method is responsible for the memory management of the returned pointer.
246     * @param source the CharacterIterator which produces the characters over
247     *        which the CollationElementIterator will iterate.
248     * @return the collation element iterator of the source using this as the
249     *         base Collator.
250     * @stable ICU 2.2
251     */
252    virtual CollationElementIterator* createCollationElementIterator(
253                                         const CharacterIterator& source) const;
254
255    // Make deprecated versions of Collator::compare() visible.
256    using Collator::compare;
257
258    /**
259    * The comparison function compares the character data stored in two
260    * different strings. Returns information about whether a string is less
261    * than, greater than or equal to another string.
262    * @param source the source string to be compared with.
263    * @param target the string that is to be compared with the source string.
264    * @param status possible error code
265    * @return Returns an enum value. UCOL_GREATER if source is greater
266    * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less
267    * than target
268    * @stable ICU 2.6
269    **/
270    virtual UCollationResult compare(const UnicodeString& source,
271                                     const UnicodeString& target,
272                                     UErrorCode &status) const;
273
274    /**
275    * Does the same thing as compare but limits the comparison to a specified
276    * length
277    * @param source the source string to be compared with.
278    * @param target the string that is to be compared with the source string.
279    * @param length the length the comparison is limited to
280    * @param status possible error code
281    * @return Returns an enum value. UCOL_GREATER if source (up to the specified
282    *         length) is greater than target; UCOL_EQUAL if source (up to specified
283    *         length) is equal to target; UCOL_LESS if source (up to the specified
284    *         length) is less than target.
285    * @stable ICU 2.6
286    */
287    virtual UCollationResult compare(const UnicodeString& source,
288                                     const UnicodeString& target,
289                                     int32_t length,
290                                     UErrorCode &status) const;
291
292    /**
293    * The comparison function compares the character data stored in two
294    * different string arrays. Returns information about whether a string array
295    * is less than, greater than or equal to another string array.
296    * @param source the source string array to be compared with.
297    * @param sourceLength the length of the source string array.  If this value
298    *        is equal to -1, the string array is null-terminated.
299    * @param target the string that is to be compared with the source string.
300    * @param targetLength the length of the target string array.  If this value
301    *        is equal to -1, the string array is null-terminated.
302    * @param status possible error code
303    * @return Returns an enum value. UCOL_GREATER if source is greater
304    * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less
305    * than target
306    * @stable ICU 2.6
307    */
308    virtual UCollationResult compare(const UChar* source, int32_t sourceLength,
309                                     const UChar* target, int32_t targetLength,
310                                     UErrorCode &status) const;
311
312    /**
313     * Compares two strings using the Collator.
314     * Returns whether the first one compares less than/equal to/greater than
315     * the second one.
316     * This version takes UCharIterator input.
317     * @param sIter the first ("source") string iterator
318     * @param tIter the second ("target") string iterator
319     * @param status ICU status
320     * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
321     * @stable ICU 4.2
322     */
323    virtual UCollationResult compare(UCharIterator &sIter,
324                                     UCharIterator &tIter,
325                                     UErrorCode &status) const;
326
327    /**
328     * Compares two UTF-8 strings using the Collator.
329     * Returns whether the first one compares less than/equal to/greater than
330     * the second one.
331     * This version takes UTF-8 input.
332     * Note that a StringPiece can be implicitly constructed
333     * from a std::string or a NUL-terminated const char * string.
334     * @param source the first UTF-8 string
335     * @param target the second UTF-8 string
336     * @param status ICU status
337     * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
338     * @stable ICU 51
339     */
340    virtual UCollationResult compareUTF8(const StringPiece &source,
341                                         const StringPiece &target,
342                                         UErrorCode &status) const;
343
344    /**
345    * Transforms a specified region of the string into a series of characters
346    * that can be compared with CollationKey.compare. Use a CollationKey when
347    * you need to do repeated comparisons on the same string. For a single
348    * comparison the compare method will be faster.
349    * @param source the source string.
350    * @param key the transformed key of the source string.
351    * @param status the error code status.
352    * @return the transformed key.
353    * @see CollationKey
354    * @stable ICU 2.0
355    */
356    virtual CollationKey& getCollationKey(const UnicodeString& source,
357                                          CollationKey& key,
358                                          UErrorCode& status) const;
359
360    /**
361    * Transforms a specified region of the string into a series of characters
362    * that can be compared with CollationKey.compare. Use a CollationKey when
363    * you need to do repeated comparisons on the same string. For a single
364    * comparison the compare method will be faster.
365    * @param source the source string.
366    * @param sourceLength the length of the source string.
367    * @param key the transformed key of the source string.
368    * @param status the error code status.
369    * @return the transformed key.
370    * @see CollationKey
371    * @stable ICU 2.0
372    */
373    virtual CollationKey& getCollationKey(const UChar *source,
374                                          int32_t sourceLength,
375                                          CollationKey& key,
376                                          UErrorCode& status) const;
377
378    /**
379     * Generates the hash code for the rule-based collation object.
380     * @return the hash code.
381     * @stable ICU 2.0
382     */
383    virtual int32_t hashCode() const;
384
385    /**
386    * Gets the locale of the Collator
387    * @param type can be either requested, valid or actual locale. For more
388    *             information see the definition of ULocDataLocaleType in
389    *             uloc.h
390    * @param status the error code status.
391    * @return locale where the collation data lives. If the collator
392    *         was instantiated from rules, locale is empty.
393    * @deprecated ICU 2.8 likely to change in ICU 3.0, based on feedback
394    */
395    virtual Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
396
397    /**
398     * Gets the tailoring rules for this collator.
399     * @return the collation tailoring from which this collator was created
400     * @stable ICU 2.0
401     */
402    const UnicodeString& getRules() const;
403
404    /**
405     * Gets the version information for a Collator.
406     * @param info the version # information, the result will be filled in
407     * @stable ICU 2.0
408     */
409    virtual void getVersion(UVersionInfo info) const;
410
411#ifndef U_HIDE_DEPRECATED_API
412    /**
413     * Returns the maximum length of any expansion sequences that end with the
414     * specified comparison order.
415     *
416     * This is specific to the kind of collation element values and sequences
417     * returned by the CollationElementIterator.
418     * Call CollationElementIterator::getMaxExpansion() instead.
419     *
420     * @param order a collation order returned by CollationElementIterator::previous
421     *              or CollationElementIterator::next.
422     * @return maximum size of the expansion sequences ending with the collation
423     *         element, or 1 if the collation element does not occur at the end of
424     *         any expansion sequence
425     * @see CollationElementIterator#getMaxExpansion
426     * @deprecated ICU 51 Use CollationElementIterator::getMaxExpansion() instead.
427     */
428    int32_t getMaxExpansion(int32_t order) const;
429#endif  /* U_HIDE_DEPRECATED_API */
430
431    /**
432     * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. This
433     * method is to implement a simple version of RTTI, since not all C++
434     * compilers support genuine RTTI. Polymorphic operator==() and clone()
435     * methods call this method.
436     * @return The class ID for this object. All objects of a given class have
437     *         the same class ID. Objects of other classes have different class
438     *         IDs.
439     * @stable ICU 2.0
440     */
441    virtual UClassID getDynamicClassID(void) const;
442
443    /**
444     * Returns the class ID for this class. This is useful only for comparing to
445     * a return value from getDynamicClassID(). For example:
446     * <pre>
447     * Base* polymorphic_pointer = createPolymorphicObject();
448     * if (polymorphic_pointer->getDynamicClassID() ==
449     *                                          Derived::getStaticClassID()) ...
450     * </pre>
451     * @return The class ID for all objects of this class.
452     * @stable ICU 2.0
453     */
454    static UClassID U_EXPORT2 getStaticClassID(void);
455
456#ifndef U_HIDE_DEPRECATED_API
457    /**
458     * Do not use this method: The caller and the ICU library might use different heaps.
459     * Use cloneBinary() instead which writes to caller-provided memory.
460     *
461     * Returns a binary format of this collator.
462     * @param length Returns the length of the data, in bytes
463     * @param status the error code status.
464     * @return memory, owned by the caller, of size 'length' bytes.
465     * @deprecated ICU 52. Use cloneBinary() instead.
466     */
467    uint8_t *cloneRuleData(int32_t &length, UErrorCode &status) const;
468#endif  /* U_HIDE_DEPRECATED_API */
469
470    /** Creates a binary image of a collator. This binary image can be stored and
471    *  later used to instantiate a collator using ucol_openBinary.
472    *  This API supports preflighting.
473    *  @param buffer a fill-in buffer to receive the binary image
474    *  @param capacity capacity of the destination buffer
475    *  @param status for catching errors
476    *  @return size of the image
477    *  @see ucol_openBinary
478    *  @stable ICU 3.4
479    */
480    int32_t cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) const;
481
482    /**
483     * Returns current rules. Delta defines whether full rules are returned or
484     * just the tailoring.
485     *
486     * getRules(void) should normally be used instead.
487     * See http://userguide.icu-project.org/collation/customization#TOC-Building-on-Existing-Locales
488     * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES.
489     * @param buffer UnicodeString to store the result rules
490     * @stable ICU 2.2
491     * @see UCOL_FULL_RULES
492     */
493    void getRules(UColRuleOption delta, UnicodeString &buffer) const;
494
495    /**
496     * Universal attribute setter
497     * @param attr attribute type
498     * @param value attribute value
499     * @param status to indicate whether the operation went on smoothly or there were errors
500     * @stable ICU 2.2
501     */
502    virtual void setAttribute(UColAttribute attr, UColAttributeValue value,
503                              UErrorCode &status);
504
505    /**
506     * Universal attribute getter.
507     * @param attr attribute type
508     * @param status to indicate whether the operation went on smoothly or there were errors
509     * @return attribute value
510     * @stable ICU 2.2
511     */
512    virtual UColAttributeValue getAttribute(UColAttribute attr,
513                                            UErrorCode &status) const;
514
515    /**
516     * Sets the variable top to the top of the specified reordering group.
517     * The variable top determines the highest-sorting character
518     * which is affected by UCOL_ALTERNATE_HANDLING.
519     * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect.
520     * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION,
521     *              UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY;
522     *              or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group
523     * @param errorCode Standard ICU error code. Its input value must
524     *                  pass the U_SUCCESS() test, or else the function returns
525     *                  immediately. Check for U_FAILURE() on output or use with
526     *                  function chaining. (See User Guide for details.)
527     * @return *this
528     * @see getMaxVariable
529     * @draft ICU 53
530     */
531    virtual Collator &setMaxVariable(UColReorderCode group, UErrorCode &errorCode);
532
533    /**
534     * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING.
535     * @return the maximum variable reordering group.
536     * @see setMaxVariable
537     * @draft ICU 53
538     */
539    virtual UColReorderCode getMaxVariable() const;
540
541    /**
542     * Sets the variable top to the primary weight of the specified string.
543     *
544     * Beginning with ICU 53, the variable top is pinned to
545     * the top of one of the supported reordering groups,
546     * and it must not be beyond the last of those groups.
547     * See setMaxVariable().
548     * @param varTop one or more (if contraction) UChars to which the variable top should be set
549     * @param len length of variable top string. If -1 it is considered to be zero terminated.
550     * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br>
551     *    U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br>
552     *    U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond
553     *    the last reordering group supported by setMaxVariable()
554     * @return variable top primary weight
555     * @deprecated ICU 53 Call setMaxVariable() instead.
556     */
557    virtual uint32_t setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status);
558
559    /**
560     * Sets the variable top to the primary weight of the specified string.
561     *
562     * Beginning with ICU 53, the variable top is pinned to
563     * the top of one of the supported reordering groups,
564     * and it must not be beyond the last of those groups.
565     * See setMaxVariable().
566     * @param varTop a UnicodeString size 1 or more (if contraction) of UChars to which the variable top should be set
567     * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br>
568     *    U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br>
569     *    U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond
570     *    the last reordering group supported by setMaxVariable()
571     * @return variable top primary weight
572     * @deprecated ICU 53 Call setMaxVariable() instead.
573     */
574    virtual uint32_t setVariableTop(const UnicodeString &varTop, UErrorCode &status);
575
576    /**
577     * Sets the variable top to the specified primary weight.
578     *
579     * Beginning with ICU 53, the variable top is pinned to
580     * the top of one of the supported reordering groups,
581     * and it must not be beyond the last of those groups.
582     * See setMaxVariable().
583     * @param varTop primary weight, as returned by setVariableTop or ucol_getVariableTop
584     * @param status error code
585     * @deprecated ICU 53 Call setMaxVariable() instead.
586     */
587    virtual void setVariableTop(uint32_t varTop, UErrorCode &status);
588
589    /**
590     * Gets the variable top value of a Collator.
591     * @param status error code (not changed by function). If error code is set, the return value is undefined.
592     * @return the variable top primary weight
593     * @see getMaxVariable
594     * @stable ICU 2.0
595     */
596    virtual uint32_t getVariableTop(UErrorCode &status) const;
597
598    /**
599     * Get a UnicodeSet that contains all the characters and sequences tailored in
600     * this collator.
601     * @param status      error code of the operation
602     * @return a pointer to a UnicodeSet object containing all the
603     *         code points and sequences that may sort differently than
604     *         in the root collator. The object must be disposed of by using delete
605     * @stable ICU 2.4
606     */
607    virtual UnicodeSet *getTailoredSet(UErrorCode &status) const;
608
609    /**
610     * Get the sort key as an array of bytes from a UnicodeString.
611     * @param source string to be processed.
612     * @param result buffer to store result in. If NULL, number of bytes needed
613     *        will be returned.
614     * @param resultLength length of the result buffer. If it is not large enough, the
615     *        buffer will be filled to capacity.
616     * @return Number of bytes needed for storing the sort key
617     * @stable ICU 2.0
618     */
619    virtual int32_t getSortKey(const UnicodeString& source, uint8_t *result,
620                               int32_t resultLength) const;
621
622    /**
623     * Get the sort key as an array of bytes from a UChar buffer.
624     * @param source string to be processed.
625     * @param sourceLength length of string to be processed. If -1, the string
626     *        is 0 terminated and length will be decided by the function.
627     * @param result buffer to store result in. If NULL, number of bytes needed
628     *        will be returned.
629     * @param resultLength length of the result buffer. If it is not large enough, the
630     *        buffer will be filled to capacity.
631     * @return Number of bytes needed for storing the sort key
632     * @stable ICU 2.2
633     */
634    virtual int32_t getSortKey(const UChar *source, int32_t sourceLength,
635                               uint8_t *result, int32_t resultLength) const;
636
637    /**
638     * Retrieves the reordering codes for this collator.
639     * @param dest The array to fill with the script ordering.
640     * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
641     *  will only return the length of the result without writing any of the result string (pre-flighting).
642     * @param status A reference to an error code value, which must not indicate
643     * a failure before the function call.
644     * @return The length of the script ordering array.
645     * @see ucol_setReorderCodes
646     * @see Collator#getEquivalentReorderCodes
647     * @see Collator#setReorderCodes
648     * @stable ICU 4.8
649     */
650     virtual int32_t getReorderCodes(int32_t *dest,
651                                     int32_t destCapacity,
652                                     UErrorCode& status) const;
653
654    /**
655     * Sets the ordering of scripts for this collator.
656     * @param reorderCodes An array of script codes in the new order. This can be NULL if the
657     * length is also set to 0. An empty array will clear any reordering codes on the collator.
658     * @param reorderCodesLength The length of reorderCodes.
659     * @param status error code
660     * @see Collator#getReorderCodes
661     * @see Collator#getEquivalentReorderCodes
662     * @stable ICU 4.8
663     */
664     virtual void setReorderCodes(const int32_t* reorderCodes,
665                                  int32_t reorderCodesLength,
666                                  UErrorCode& status) ;
667
668    /**
669     * Implements ucol_strcollUTF8().
670     * @internal
671     */
672    virtual UCollationResult internalCompareUTF8(
673            const char *left, int32_t leftLength,
674            const char *right, int32_t rightLength,
675            UErrorCode &errorCode) const;
676
677    /** Gets the short definition string for a collator. This internal API harvests the collator's
678     *  locale and the attribute set and produces a string that can be used for opening
679     *  a collator with the same attributes using the ucol_openFromShortString API.
680     *  This string will be normalized.
681     *  The structure and the syntax of the string are defined in the "Naming collators"
682     *  section of the User Guide:
683     *  http://userguide.icu-project.org/collation/concepts#TOC-Collator-naming-scheme
684     *  This function supports preflighting.
685     *
686     *  This is internal, and intended to be used with delegate converters.
687     *
688     *  @param locale a locale that will appear as the collator's locale in the resulting
689     *                short string definition. If NULL, the locale will be harvested
690     *                from the collator.
691     *  @param buffer space to hold the resulting string
692     *  @param capacity capacity of the buffer
693     *  @param status for returning errors. Preflighting errors are reported here as well.
694     *  @return length of the resulting string
695     *  @see ucol_openFromShortString
696     *  @see ucol_normalizeShortDefinitionString
697     *  @see ucol_getShortDefinitionString
698     *  @internal
699     */
700    virtual int32_t internalGetShortDefinitionString(const char *locale,
701                                                     char *buffer,
702                                                     int32_t capacity,
703                                                     UErrorCode &status) const;
704
705    /**
706     * Implements ucol_nextSortKeyPart().
        * The parameter notes below are read off the signature and the name of the C API;
        * confirm the details against the ucol_nextSortKeyPart() documentation.
        * @param iter iterator over the input text
        * @param state two uint32_t values of iteration state. Declared as uint32_t[2], which
        *        as a parameter type is a plain uint32_t pointer; the [2] only documents the
        *        expected element count.
        * @param dest output buffer for sort key bytes
        * @param count NOTE(review): presumably the capacity of dest in bytes -- confirm
        * @param errorCode error code, taken by non-const reference
        * @return NOTE(review): presumably the number of bytes written to dest -- confirm
707     * @internal
708     */
709    virtual int32_t internalNextSortKeyPart(
710            UCharIterator *iter, uint32_t state[2],
711            uint8_t *dest, int32_t count, UErrorCode &errorCode) const;
712
713#ifndef U_HIDE_INTERNAL_API
714    /**
715     * Default constructor. Only for use in ucol_openRules().
        * Declared only when U_HIDE_INTERNAL_API is not defined.
        * @see internalBuildTailoring
716     * @internal
717     */
718    RuleBasedCollator();
719
720    /**
721     * Implements ucol_getLocaleByType().
722     * Needed because the lifetime of the locale ID string must match that of the collator.
723     * getLocale() returns a copy of a Locale, with minimal lifetime in a C wrapper.
        * @param type the kind of locale to report.
        *        NOTE(review): presumably selects among the requested/valid/actual locales
        *        that setLocales() deals with -- confirm
        * @param errorCode error code, taken by non-const reference
        * @return the locale ID string; as stated above, its lifetime matches that of the collator
724     * @internal
725     */
726    const char *internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const;
727
728    /**
729     * Implements ucol_getContractionsAndExpansions().
730     * Gets this collator's sets of contraction strings and/or
731     * characters and strings that map to multiple collation elements (expansions).
732     * If addPrefixes is TRUE, then contractions that are expressed as
733     * prefix/pre-context rules are included.
        * The results are delivered through the two set parameters; the method itself
        * returns nothing.
734     * @param contractions if not NULL, the set to hold the contractions
735     * @param expansions if not NULL, the set to hold the expansions
736     * @param addPrefixes if TRUE, include prefix contextual mappings
737     * @param errorCode in/out ICU error code
738     * @internal
739     */
740    void internalGetContractionsAndExpansions(
741            UnicodeSet *contractions, UnicodeSet *expansions,
742            UBool addPrefixes, UErrorCode &errorCode) const;
743
744    /**
745     * Adds the contractions that start with character c to the set.
746     * Ignores prefixes. Used by AlphabeticIndex.
        * @param c the character that the collected contractions start with
        * @param set the set that the contractions are added to
        * @param errorCode error code, taken by non-const reference
747     * @internal
748     */
749    void internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const;
750
751    /**
752     * Implements from-rule constructors, and ucol_openRules().
        * Not const: unlike most of the internal methods in this class, this one may
        * modify the collator.
        * @param rules the collation rule string
        * @param strength the collation strength, passed as a plain int32_t
        * @param decompositionMode the decomposition mode, as a UColAttributeValue
        * @param outParseError output for parse error details
        * @param outReason output string.
        *        NOTE(review): presumably a description of why the build failed -- confirm
        * @param errorCode error code, taken by non-const reference
        * NOTE(review): outParseError and outReason are pointers; whether NULL is accepted
        * is not visible in this header -- confirm in the implementation.
753     * @internal
754     */
755    void internalBuildTailoring(
756            const UnicodeString &rules,
757            int32_t strength,
758            UColAttributeValue decompositionMode,
759            UParseError *outParseError, UnicodeString *outReason,
760            UErrorCode &errorCode);
761
762    /**
        * Converts a UCollator pointer into a RuleBasedCollator pointer.
        * The result of fromUCollator(uc) is passed through dynamic_cast, which yields a
        * null pointer when the object is not a RuleBasedCollator.
        * @param uc the UCollator pointer to convert
        * @return the RuleBasedCollator, or a null pointer if the dynamic_cast fails
        * @internal
        */
763    static inline RuleBasedCollator *rbcFromUCollator(UCollator *uc) {
764        return dynamic_cast<RuleBasedCollator *>(fromUCollator(uc));
765    }
766    /**
        * Const overload: converts a const UCollator pointer into a const RuleBasedCollator
        * pointer. The result of fromUCollator(uc) is passed through dynamic_cast, which
        * yields a null pointer when the object is not a RuleBasedCollator.
        * @param uc the UCollator pointer to convert
        * @return the RuleBasedCollator, or a null pointer if the dynamic_cast fails
        * @internal
        */
767    static inline const RuleBasedCollator *rbcFromUCollator(const UCollator *uc) {
768        return dynamic_cast<const RuleBasedCollator *>(fromUCollator(uc));
769    }
770
771    /**
772     * Appends the CEs for the string to the vector.
        * @param str the string whose CEs are wanted
        * @param ces the vector that the CEs are appended to
        * @param errorCode error code, taken by non-const reference
773     * @internal for tests & tools
774     */
775    void internalGetCEs(const UnicodeString &str, UVector64 &ces, UErrorCode &errorCode) const;
776#endif  // U_HIDE_INTERNAL_API
777
778protected:
779   /**
780    * Used internally by registration to define the requested, valid and actual locales.
781    * @param requestedLocale the requested locale
782    * @param validLocale the valid locale
783    * @param actualLocale the actual locale
784    * @internal
785    */
786    virtual void setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale);
787
788private:
789    friend class CollationElementIterator;  // friend: may use the private members declared below
790    friend class Collator;  // friend: may use the private members declared below
791
792    RuleBasedCollator(const CollationTailoring *t, const Locale &vl);  // private constructor from a tailoring; NOTE(review): vl is presumably the valid locale (cf. the validLocale member) -- confirm
793
794    /**
795     * Enumeration of attributes that are relevant for short definition strings
796     * (e.g., ucol_getShortDefinitionString()).
797     * Effectively extends UColAttribute.
        * Attribute values serve as bit indexes into explicitlySetAttributes; see
        * attributeHasBeenSetExplicitly(), whose documented precondition is
        * 0 <= attribute < ATTR_LIMIT.
798     */
799    enum Attributes {
800        ATTR_VARIABLE_TOP = UCOL_ATTRIBUTE_COUNT,  // continues numbering at UCOL_ATTRIBUTE_COUNT
801        ATTR_LIMIT  // one more than ATTR_VARIABLE_TOP; upper bound, not an attribute itself
802    };
803
804    void adoptTailoring(CollationTailoring *t);  // NOTE(review): "adopt" suggests that this takes over ownership of t -- confirm in the implementation
805
806    // Both lengths must be <0 or else both must be >=0.
       // Two overloads with the same shape: the first takes UChar strings, the second
       // takes uint8_t strings (presumably UTF-8, cf. internalCompareUTF8() -- confirm).
       // Both return a UCollationResult and take the error code by non-const reference.
807    UCollationResult doCompare(const UChar *left, int32_t leftLength,
808                               const UChar *right, int32_t rightLength,
809                               UErrorCode &errorCode) const;
810    UCollationResult doCompare(const uint8_t *left, int32_t leftLength,
811                               const uint8_t *right, int32_t rightLength,
812                               UErrorCode &errorCode) const;
813
814    void writeSortKey(const UChar *s, int32_t length,
815                      SortKeyByteSink &sink, UErrorCode &errorCode) const;  // returns nothing; going by the name and the non-const reference, the sort key goes to the caller's sink
816
817    void writeIdenticalLevel(const UChar *s, const UChar *limit,
818                             SortKeyByteSink &sink, UErrorCode &errorCode) const;  // text is given as a pointer pair (s, limit) rather than pointer + length; NOTE(review): limit is presumably exclusive -- confirm
819
820    const CollationSettings &getDefaultSettings() const;  // returns a const reference, not a copy
821
822    void setAttributeDefault(int32_t attribute) {
           // Clears the attribute's bit in explicitlySetAttributes, so that
           // attributeHasBeenSetExplicitly(attribute) reports it as not set.
           // The shift is only defined for 0 <= attribute < 32, the width of uint32_t.
823        explicitlySetAttributes &= ~((uint32_t)1 << attribute);
824    }
825    void setAttributeExplicitly(int32_t attribute) {
           // Sets the attribute's bit in explicitlySetAttributes, so that
           // attributeHasBeenSetExplicitly(attribute) reports it as set.
           // The shift is only defined for 0 <= attribute < 32, the width of uint32_t.
826        explicitlySetAttributes |= (uint32_t)1 << attribute;
827    }
828    UBool attributeHasBeenSetExplicitly(int32_t attribute) const {
           // Tests the attribute's bit in explicitlySetAttributes: the result is nonzero
           // if the bit is set and zero otherwise.
           // The range requirement below is documentation only; the assert is commented
           // out and the range is written in mathematical notation, not as a C++ expression.
829        // assert(0 <= attribute < ATTR_LIMIT);
830        return (UBool)((explicitlySetAttributes & ((uint32_t)1 << attribute)) != 0);
831    }
832
833    /**
834     * Tests whether a character is "unsafe" for use as a collation starting point.
835     *
        * Private; CollationElementIterator, which the @see below points to, is declared
        * a friend of this class.
        *
836     * @param c code point or code unit
837     * @return TRUE if c is unsafe
838     * @see CollationElementIterator#setOffset(int)
839     */
840    UBool isUnsafe(UChar32 c) const;
841
842    static void computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode);  // static: works on the tailoring t that is passed in, not on a collator instance
843    UBool initMaxExpansions(UErrorCode &errorCode) const;  // NOTE(review): presumably runs computeMaxExpansions() for this collator's tailoring on demand and reports success -- confirm
844
845    void setFastLatinOptions(CollationSettings &ownedSettings) const;  // const method; the settings object to update is passed in by non-const reference
846
847    const CollationData *data;  // NOTE(review): not marked reference-counted, unlike the two pointers below -- presumably not owned separately; confirm
848    const CollationSettings *settings;  // reference-counted
849    const CollationTailoring *tailoring;  // reference-counted
850    Locale validLocale;  // stored by value
851    uint32_t explicitlySetAttributes;  // bit set indexed by attribute value; see setAttributeDefault(), setAttributeExplicitly() and attributeHasBeenSetExplicitly()
852
853    UBool actualLocaleIsSameAsValid;  // NOTE(review): presumably maintained together with validLocale, e.g. by setLocales() -- confirm
854};
855
856U_NAMESPACE_END
857
858#endif  // !UCONFIG_NO_COLLATION
859#endif  // TBLCOLL_H
860