1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5* Copyright (C) 1996-2016, International Business Machines Corporation and
6* others. All Rights Reserved.
7******************************************************************************
8*/
9
10/**
11 * \file
12 * \brief C++ API: The RuleBasedCollator class implements the Collator abstract base class.
13 */
14
15/**
16* File tblcoll.h
17*
18* Created by: Helena Shih
19*
20* Modification History:
21*
22*  Date        Name        Description
23*  2/5/97      aliu        Added streamIn and streamOut methods.  Added
24*                          constructor which reads RuleBasedCollator object from
25*                          a binary file.  Added writeToFile method which streams
26*                          RuleBasedCollator out to a binary file.  The streamIn
27*                          and streamOut methods use istream and ostream objects
28*                          in binary mode.
29*  2/12/97     aliu        Modified to use TableCollationData sub-object to
30*                          hold invariant data.
31*  2/13/97     aliu        Moved several methods into this class from Collation.
32*                          Added a private RuleBasedCollator(Locale&) constructor,
33*                          to be used by Collator::createDefault().  General
34*                          clean up.
35*  2/20/97     helena      Added clone, operator==, operator!=, operator=, and copy
36*                          constructor and getDynamicClassID.
37*  3/5/97      aliu        Modified constructFromFile() to add parameter
38*                          specifying whether or not binary loading is to be
39*                          attempted.  This is required for dynamic rule loading.
40* 05/07/97     helena      Added memory allocation error detection.
41*  6/17/97     helena      Added IDENTICAL strength for compare, changed getRules to
42*                          use MergeCollation::getPattern.
43*  6/20/97     helena      Java class name change.
44*  8/18/97     helena      Added internal API documentation.
45* 09/03/97     helena      Added createCollationKeyValues().
46* 02/10/98     damiba      Added compare with "length" parameter
47* 08/05/98     erm         Synched with 1.2 version of RuleBasedCollator.java
48* 04/23/99     stephen     Removed EDecompositionMode, merged with
49*                          Normalizer::EMode
50* 06/14/99     stephen     Removed kResourceBundleSuffix
51* 11/02/99     helena      Collator performance enhancements.  Eliminates the
52*                          UnicodeString construction and special case for NO_OP.
53* 11/23/99     srl         More performance enhancements. Updates to NormalizerIterator
54*                          internal state management.
55* 12/15/99     aliu        Update to support Thai collation.  Move NormalizerIterator
56*                          to implementation file.
57* 01/29/01     synwee      Modified into a C++ wrapper which calls C API
58*                          (ucol.h)
59* 2012-2014    markus      Rewritten in C++ again.
60*/
61
62#ifndef TBLCOLL_H
63#define TBLCOLL_H
64
65#include "unicode/utypes.h"
66
67#if !UCONFIG_NO_COLLATION
68
69#include "unicode/coll.h"
70#include "unicode/locid.h"
71#include "unicode/uiter.h"
72#include "unicode/ucol.h"
73
74U_NAMESPACE_BEGIN
75
76struct CollationCacheEntry;
77struct CollationData;
78struct CollationSettings;
79struct CollationTailoring;
80/**
81* @stable ICU 2.0
82*/
83class StringSearch;
84/**
85* @stable ICU 2.0
86*/
87class CollationElementIterator;
88class CollationKey;
89class SortKeyByteSink;
90class UnicodeSet;
91class UnicodeString;
92class UVector64;
93
94/**
95 * The RuleBasedCollator class provides the implementation of
96 * Collator, using data-driven tables. The user can create a customized
97 * table-based collation.
98 * <p>
99 * For more information about the collation service see
100 * <a href="http://userguide.icu-project.org/collation">the User Guide</a>.
101 * <p>
102 * Collation service provides correct sorting orders for most locales supported in ICU.
103 * If specific data for a locale is not available, the order eventually falls back
104 * to the <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>.
105 * <p>
106 * Sort ordering may be customized by providing your own set of rules. For more on
107 * this subject see the <a href="http://userguide.icu-project.org/collation/customization">
108 * Collation Customization</a> section of the User Guide.
109 * <p>
110 * Note, RuleBasedCollator is not to be subclassed.
111 * @see        Collator
112 */
113class U_I18N_API RuleBasedCollator : public Collator {
114public:
115    /**
116     * RuleBasedCollator constructor. This takes the table rules and builds a
117     * collation table out of them. Please see RuleBasedCollator class
118     * description for more details on the collation rule syntax.
119     * @param rules the collation rules to build the collation table from.
120     * @param status reporting a success or an error.
121     * @stable ICU 2.0
122     */
123    RuleBasedCollator(const UnicodeString& rules, UErrorCode& status);
124
125    /**
126     * RuleBasedCollator constructor. This takes the table rules and builds a
127     * collation table out of them. Please see RuleBasedCollator class
128     * description for more details on the collation rule syntax.
129     * @param rules the collation rules to build the collation table from.
130     * @param collationStrength strength for comparison
131     * @param status reporting a success or an error.
132     * @stable ICU 2.0
133     */
134    RuleBasedCollator(const UnicodeString& rules,
135                       ECollationStrength collationStrength,
136                       UErrorCode& status);
137
138    /**
139     * RuleBasedCollator constructor. This takes the table rules and builds a
140     * collation table out of them. Please see RuleBasedCollator class
141     * description for more details on the collation rule syntax.
142     * @param rules the collation rules to build the collation table from.
143     * @param decompositionMode the normalisation mode
144     * @param status reporting a success or an error.
145     * @stable ICU 2.0
146     */
147    RuleBasedCollator(const UnicodeString& rules,
148                    UColAttributeValue decompositionMode,
149                    UErrorCode& status);
150
151    /**
152     * RuleBasedCollator constructor. This takes the table rules and builds a
153     * collation table out of them. Please see RuleBasedCollator class
154     * description for more details on the collation rule syntax.
155     * @param rules the collation rules to build the collation table from.
156     * @param collationStrength strength for comparison
157     * @param decompositionMode the normalisation mode
158     * @param status reporting a success or an error.
159     * @stable ICU 2.0
160     */
161    RuleBasedCollator(const UnicodeString& rules,
162                    ECollationStrength collationStrength,
163                    UColAttributeValue decompositionMode,
164                    UErrorCode& status);
165
166#ifndef U_HIDE_INTERNAL_API
167    /**
168     * TODO: document & propose as public API
169     * @internal
170     */
171    RuleBasedCollator(const UnicodeString &rules,
172                      UParseError &parseError, UnicodeString &reason,
173                      UErrorCode &errorCode);
174#endif  /* U_HIDE_INTERNAL_API */
175
176    /**
177     * Copy constructor.
178     * @param other the RuleBasedCollator object to be copied
179     * @stable ICU 2.0
180     */
181    RuleBasedCollator(const RuleBasedCollator& other);
182
183
184    /** Opens a collator from a collator binary image created using
185    *  cloneBinary. Binary image used in instantiation of the
186    *  collator remains owned by the user and should stay around for
187    *  the lifetime of the collator. The API also takes a base collator
188    *  which must be the root collator.
189    *  @param bin binary image owned by the user and required through the
190    *             lifetime of the collator
191    *  @param length size of the image. If negative, the API will try to
192    *                figure out the length of the image
193    *  @param base Base collator, for lookup of untailored characters.
194    *              Must be the root collator, must not be NULL.
195    *              The base is required to be present through the lifetime of the collator.
196    *  @param status for catching errors
197    *  @return newly created collator
198    *  @see cloneBinary
199    *  @stable ICU 3.4
200    */
201    RuleBasedCollator(const uint8_t *bin, int32_t length,
202                    const RuleBasedCollator *base,
203                    UErrorCode &status);
204
205    /**
206     * Destructor.
207     * @stable ICU 2.0
208     */
209    virtual ~RuleBasedCollator();
210
211    /**
212     * Assignment operator.
213     * @param other other RuleBasedCollator object to copy from.
214     * @stable ICU 2.0
215     */
216    RuleBasedCollator& operator=(const RuleBasedCollator& other);
217
218    /**
219     * Returns true if argument is the same as this object.
220     * @param other Collator object to be compared.
221     * @return true if the argument is the same as this object.
222     * @stable ICU 2.0
223     */
224    virtual UBool operator==(const Collator& other) const;
225
226    /**
227     * Makes a copy of this object.
228     * @return a copy of this object, owned by the caller
229     * @stable ICU 2.0
230     */
231    virtual Collator* clone(void) const;
232
233    /**
234     * Creates a collation element iterator for the source string. The caller of
235     * this method is responsible for the memory management of the return
236     * pointer.
237     * @param source the string over which the CollationElementIterator will
238     *        iterate.
239     * @return the collation element iterator of the source string using this as
240     *         the base Collator.
241     * @stable ICU 2.2
242     */
243    virtual CollationElementIterator* createCollationElementIterator(
244                                           const UnicodeString& source) const;
245
246    /**
247     * Creates a collation element iterator for the source. The caller of this
248     * method is responsible for the memory management of the returned pointer.
249     * @param source the CharacterIterator which produces the characters over
250     *        which the CollationElementIterator will iterate.
251     * @return the collation element iterator of the source using this as the
252     *         base Collator.
253     * @stable ICU 2.2
254     */
255    virtual CollationElementIterator* createCollationElementIterator(
256                                         const CharacterIterator& source) const;
257
258    // Make deprecated versions of Collator::compare() visible.
259    using Collator::compare;
260
261    /**
262    * The comparison function compares the character data stored in two
263    * different strings. Returns information about whether a string is less
264    * than, greater than or equal to another string.
265    * @param source the source string to be compared with.
266    * @param target the string that is to be compared with the source string.
267    * @param status possible error code
268    * @return Returns an enum value. UCOL_GREATER if source is greater
269    * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less
270    * than target
271    * @stable ICU 2.6
272    **/
273    virtual UCollationResult compare(const UnicodeString& source,
274                                     const UnicodeString& target,
275                                     UErrorCode &status) const;
276
277    /**
278    * Does the same thing as compare but limits the comparison to a specified
279    * length
280    * @param source the source string to be compared with.
281    * @param target the string that is to be compared with the source string.
282    * @param length the length the comparison is limited to
283    * @param status possible error code
284    * @return Returns an enum value. UCOL_GREATER if source (up to the specified
285    *         length) is greater than target; UCOL_EQUAL if source (up to specified
286    *         length) is equal to target; UCOL_LESS if source (up to the specified
287    *         length) is less than target.
288    * @stable ICU 2.6
289    */
290    virtual UCollationResult compare(const UnicodeString& source,
291                                     const UnicodeString& target,
292                                     int32_t length,
293                                     UErrorCode &status) const;
294
295    /**
296    * The comparison function compares the character data stored in two
297    * different string arrays. Returns information about whether a string array
298    * is less than, greater than or equal to another string array.
299    * @param source the source string array to be compared with.
300    * @param sourceLength the length of the source string array.  If this value
301    *        is equal to -1, the string array is null-terminated.
302    * @param target the string that is to be compared with the source string.
303    * @param targetLength the length of the target string array.  If this value
304    *        is equal to -1, the string array is null-terminated.
305    * @param status possible error code
306    * @return Returns an enum value. UCOL_GREATER if source is greater
307    * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less
308    * than target
309    * @stable ICU 2.6
310    */
311    virtual UCollationResult compare(const char16_t* source, int32_t sourceLength,
312                                     const char16_t* target, int32_t targetLength,
313                                     UErrorCode &status) const;
314
315    /**
316     * Compares two strings using the Collator.
317     * Returns whether the first one compares less than/equal to/greater than
318     * the second one.
319     * This version takes UCharIterator input.
320     * @param sIter the first ("source") string iterator
321     * @param tIter the second ("target") string iterator
322     * @param status ICU status
323     * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
324     * @stable ICU 4.2
325     */
326    virtual UCollationResult compare(UCharIterator &sIter,
327                                     UCharIterator &tIter,
328                                     UErrorCode &status) const;
329
330    /**
331     * Compares two UTF-8 strings using the Collator.
332     * Returns whether the first one compares less than/equal to/greater than
333     * the second one.
334     * This version takes UTF-8 input.
335     * Note that a StringPiece can be implicitly constructed
336     * from a std::string or a NUL-terminated const char * string.
337     * @param source the first UTF-8 string
338     * @param target the second UTF-8 string
339     * @param status ICU status
340     * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
341     * @stable ICU 51
342     */
343    virtual UCollationResult compareUTF8(const StringPiece &source,
344                                         const StringPiece &target,
345                                         UErrorCode &status) const;
346
347    /**
348     * Transforms the string into a series of characters
349     * that can be compared with CollationKey.compare().
350     *
351     * Note that sort keys are often less efficient than simply doing comparison.
352     * For more details, see the ICU User Guide.
353     *
354     * @param source the source string.
355     * @param key the transformed key of the source string.
356     * @param status the error code status.
357     * @return the transformed key.
358     * @see CollationKey
359     * @stable ICU 2.0
360     */
361    virtual CollationKey& getCollationKey(const UnicodeString& source,
362                                          CollationKey& key,
363                                          UErrorCode& status) const;
364
365    /**
366     * Transforms a specified region of the string into a series of characters
367     * that can be compared with CollationKey.compare.
368     *
369     * Note that sort keys are often less efficient than simply doing comparison.
370     * For more details, see the ICU User Guide.
371     *
372     * @param source the source string.
373     * @param sourceLength the length of the source string.
374     * @param key the transformed key of the source string.
375     * @param status the error code status.
376     * @return the transformed key.
377     * @see CollationKey
378     * @stable ICU 2.0
379     */
380    virtual CollationKey& getCollationKey(const char16_t *source,
381                                          int32_t sourceLength,
382                                          CollationKey& key,
383                                          UErrorCode& status) const;
384
385    /**
386     * Generates the hash code for the rule-based collation object.
387     * @return the hash code.
388     * @stable ICU 2.0
389     */
390    virtual int32_t hashCode() const;
391
392    /**
393    * Gets the locale of the Collator
394    * @param type can be either requested, valid or actual locale. For more
395    *             information see the definition of ULocDataLocaleType in
396    *             uloc.h
397    * @param status the error code status.
398    * @return locale where the collation data lives. If the collator
399    *         was instantiated from rules, locale is empty.
400    * @deprecated ICU 2.8 likely to change in ICU 3.0, based on feedback
401    */
402    virtual Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
403
404    /**
405     * Gets the tailoring rules for this collator.
406     * @return the collation tailoring from which this collator was created
407     * @stable ICU 2.0
408     */
409    const UnicodeString& getRules() const;
410
411    /**
412     * Gets the version information for a Collator.
413     * @param info the version # information, the result will be filled in
414     * @stable ICU 2.0
415     */
416    virtual void getVersion(UVersionInfo info) const;
417
418#ifndef U_HIDE_DEPRECATED_API
419    /**
420     * Returns the maximum length of any expansion sequences that end with the
421     * specified comparison order.
422     *
423     * This is specific to the kind of collation element values and sequences
424     * returned by the CollationElementIterator.
425     * Call CollationElementIterator::getMaxExpansion() instead.
426     *
427     * @param order a collation order returned by CollationElementIterator::previous
428     *              or CollationElementIterator::next.
429     * @return maximum size of the expansion sequences ending with the collation
430     *         element, or 1 if the collation element does not occur at the end of
431     *         any expansion sequence
432     * @see CollationElementIterator#getMaxExpansion
433     * @deprecated ICU 51 Use CollationElementIterator::getMaxExpansion() instead.
434     */
435    int32_t getMaxExpansion(int32_t order) const;
436#endif  /* U_HIDE_DEPRECATED_API */
437
438    /**
439     * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. This
440     * method is to implement a simple version of RTTI, since not all C++
441     * compilers support genuine RTTI. Polymorphic operator==() and clone()
442     * methods call this method.
443     * @return The class ID for this object. All objects of a given class have
444     *         the same class ID. Objects of other classes have different class
445     *         IDs.
446     * @stable ICU 2.0
447     */
448    virtual UClassID getDynamicClassID(void) const;
449
450    /**
451     * Returns the class ID for this class. This is useful only for comparing to
452     * a return value from getDynamicClassID(). For example:
453     * <pre>
454     * Base* polymorphic_pointer = createPolymorphicObject();
455     * if (polymorphic_pointer->getDynamicClassID() ==
456     *                                          Derived::getStaticClassID()) ...
457     * </pre>
458     * @return The class ID for all objects of this class.
459     * @stable ICU 2.0
460     */
461    static UClassID U_EXPORT2 getStaticClassID(void);
462
463#ifndef U_HIDE_DEPRECATED_API
464    /**
465     * Do not use this method: The caller and the ICU library might use different heaps.
466     * Use cloneBinary() instead which writes to caller-provided memory.
467     *
468     * Returns a binary format of this collator.
469     * @param length Returns the length of the data, in bytes
470     * @param status the error code status.
471     * @return memory, owned by the caller, of size 'length' bytes.
472     * @deprecated ICU 52. Use cloneBinary() instead.
473     */
474    uint8_t *cloneRuleData(int32_t &length, UErrorCode &status) const;
475#endif  /* U_HIDE_DEPRECATED_API */
476
477    /** Creates a binary image of a collator. This binary image can be stored and
478    *  later used to instantiate a collator using ucol_openBinary.
479    *  This API supports preflighting.
480    *  @param buffer a fill-in buffer to receive the binary image
481    *  @param capacity capacity of the destination buffer
482    *  @param status for catching errors
483    *  @return size of the image
484    *  @see ucol_openBinary
485    *  @stable ICU 3.4
486    */
487    int32_t cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) const;
488
489    /**
490     * Returns current rules. Delta defines whether full rules are returned or
491     * just the tailoring.
492     *
493     * getRules(void) should normally be used instead.
494     * See http://userguide.icu-project.org/collation/customization#TOC-Building-on-Existing-Locales
495     * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES.
496     * @param buffer UnicodeString to store the result rules
497     * @stable ICU 2.2
498     * @see UCOL_FULL_RULES
499     */
500    void getRules(UColRuleOption delta, UnicodeString &buffer) const;
501
502    /**
503     * Universal attribute setter
504     * @param attr attribute type
505     * @param value attribute value
506     * @param status to indicate whether the operation went on smoothly or there were errors
507     * @stable ICU 2.2
508     */
509    virtual void setAttribute(UColAttribute attr, UColAttributeValue value,
510                              UErrorCode &status);
511
512    /**
513     * Universal attribute getter.
514     * @param attr attribute type
515     * @param status to indicate whether the operation went on smoothly or there were errors
516     * @return attribute value
517     * @stable ICU 2.2
518     */
519    virtual UColAttributeValue getAttribute(UColAttribute attr,
520                                            UErrorCode &status) const;
521
522    /**
523     * Sets the variable top to the top of the specified reordering group.
524     * The variable top determines the highest-sorting character
525     * which is affected by UCOL_ALTERNATE_HANDLING.
526     * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect.
527     * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION,
528     *              UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY;
529     *              or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group
530     * @param errorCode Standard ICU error code. Its input value must
531     *                  pass the U_SUCCESS() test, or else the function returns
532     *                  immediately. Check for U_FAILURE() on output or use with
533     *                  function chaining. (See User Guide for details.)
534     * @return *this
535     * @see getMaxVariable
536     * @stable ICU 53
537     */
538    virtual Collator &setMaxVariable(UColReorderCode group, UErrorCode &errorCode);
539
540    /**
541     * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING.
542     * @return the maximum variable reordering group.
543     * @see setMaxVariable
544     * @stable ICU 53
545     */
546    virtual UColReorderCode getMaxVariable() const;
547
548    /**
549     * Sets the variable top to the primary weight of the specified string.
550     *
551     * Beginning with ICU 53, the variable top is pinned to
552     * the top of one of the supported reordering groups,
553     * and it must not be beyond the last of those groups.
554     * See setMaxVariable().
555     * @param varTop one or more (if contraction) char16_ts to which the variable top should be set
556     * @param len length of variable top string. If -1 it is considered to be zero terminated.
557     * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br>
558     *    U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br>
559     *    U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond
560     *    the last reordering group supported by setMaxVariable()
561     * @return variable top primary weight
562     * @deprecated ICU 53 Call setMaxVariable() instead.
563     */
564    virtual uint32_t setVariableTop(const char16_t *varTop, int32_t len, UErrorCode &status);
565
566    /**
567     * Sets the variable top to the primary weight of the specified string.
568     *
569     * Beginning with ICU 53, the variable top is pinned to
570     * the top of one of the supported reordering groups,
571     * and it must not be beyond the last of those groups.
572     * See setMaxVariable().
573     * @param varTop a UnicodeString of one or more (if contraction) char16_ts to which the variable top should be set
574     * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br>
575     *    U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br>
576     *    U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond
577     *    the last reordering group supported by setMaxVariable()
578     * @return variable top primary weight
579     * @deprecated ICU 53 Call setMaxVariable() instead.
580     */
581    virtual uint32_t setVariableTop(const UnicodeString &varTop, UErrorCode &status);
582
583    /**
584     * Sets the variable top to the specified primary weight.
585     *
586     * Beginning with ICU 53, the variable top is pinned to
587     * the top of one of the supported reordering groups,
588     * and it must not be beyond the last of those groups.
589     * See setMaxVariable().
590     * @param varTop primary weight, as returned by setVariableTop or ucol_getVariableTop
591     * @param status error code
592     * @deprecated ICU 53 Call setMaxVariable() instead.
593     */
594    virtual void setVariableTop(uint32_t varTop, UErrorCode &status);
595
596    /**
597     * Gets the variable top value of a Collator.
598     * @param status error code (not changed by function). If error code is set, the return value is undefined.
599     * @return the variable top primary weight
600     * @see getMaxVariable
601     * @stable ICU 2.0
602     */
603    virtual uint32_t getVariableTop(UErrorCode &status) const;
604
605    /**
606     * Get a UnicodeSet that contains all the characters and sequences tailored in
607     * this collator.
608     * @param status      error code of the operation
609     * @return a pointer to a UnicodeSet object containing all the
610     *         code points and sequences that may sort differently than
611     *         in the root collator. The object must be disposed of by using delete
612     * @stable ICU 2.4
613     */
614    virtual UnicodeSet *getTailoredSet(UErrorCode &status) const;
615
616    /**
617     * Get the sort key as an array of bytes from a UnicodeString.
618     *
619     * Note that sort keys are often less efficient than simply doing comparison.
620     * For more details, see the ICU User Guide.
621     *
622     * @param source string to be processed.
623     * @param result buffer to store result in. If NULL, number of bytes needed
624     *        will be returned.
625     * @param resultLength length of the result buffer. If it is not large enough, the
626     *        buffer will be filled to capacity.
627     * @return Number of bytes needed for storing the sort key
628     * @stable ICU 2.0
629     */
630    virtual int32_t getSortKey(const UnicodeString& source, uint8_t *result,
631                               int32_t resultLength) const;
632
633    /**
634     * Get the sort key as an array of bytes from a char16_t buffer.
635     *
636     * Note that sort keys are often less efficient than simply doing comparison.
637     * For more details, see the ICU User Guide.
638     *
639     * @param source string to be processed.
640     * @param sourceLength length of string to be processed. If -1, the string
641     *        is 0 terminated and length will be decided by the function.
642     * @param result buffer to store result in. If NULL, number of bytes needed
643     *        will be returned.
644     * @param resultLength length of the result buffer. If it is not large enough, the
645     *        buffer will be filled to capacity.
646     * @return Number of bytes needed for storing the sort key
647     * @stable ICU 2.2
648     */
649    virtual int32_t getSortKey(const char16_t *source, int32_t sourceLength,
650                               uint8_t *result, int32_t resultLength) const;
651
652    /**
653     * Retrieves the reordering codes for this collator.
654     * @param dest The array to fill with the script ordering.
655     * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
656     *  will only return the length of the result without writing any codes (pre-flighting).
657     * @param status A reference to an error code value, which must not indicate
658     * a failure before the function call.
659     * @return The length of the script ordering array.
660     * @see ucol_setReorderCodes
661     * @see Collator#getEquivalentReorderCodes
662     * @see Collator#setReorderCodes
663     * @stable ICU 4.8
664     */
665     virtual int32_t getReorderCodes(int32_t *dest,
666                                     int32_t destCapacity,
667                                     UErrorCode& status) const;
668
669    /**
670     * Sets the ordering of scripts for this collator.
671     * @param reorderCodes An array of script codes in the new order. This can be NULL if the
672     * length is also set to 0. An empty array will clear any reordering codes on the collator.
673     * @param reorderCodesLength The length of reorderCodes.
674     * @param status error code
675     * @see ucol_setReorderCodes
676     * @see Collator#getReorderCodes
677     * @see Collator#getEquivalentReorderCodes
678     * @stable ICU 4.8
679     */
680     virtual void setReorderCodes(const int32_t* reorderCodes,
681                                  int32_t reorderCodesLength,
682                                  UErrorCode& status) ;
683
684    /**
685     * Implements ucol_strcollUTF8().
686     * @internal
687     */
688    virtual UCollationResult internalCompareUTF8(
689            const char *left, int32_t leftLength,
690            const char *right, int32_t rightLength,
691            UErrorCode &errorCode) const;
692
    /** Get the short definition string for a collator. This internal API harvests the collator's
     *  locale and the attribute set and produces a string that can be used for opening
     *  a collator with the same attributes using the ucol_openFromShortString API.
     *  This string will be normalized.
     *  The structure and the syntax of the string are defined in the "Naming collators"
     *  section of the User Guide:
     *  https://unicode-org.github.io/icu/userguide/collation/concepts#collator-naming-scheme
     *  This function supports preflighting.
     *
     *  This is internal, and intended to be used with delegate converters.
     *
     *  @param locale a locale that will appear as the collator's locale in the resulting
     *                short string definition. If NULL, the locale will be harvested
     *                from the collator.
     *  @param buffer space to hold the resulting string
     *  @param capacity capacity of the buffer
     *  @param status for returning errors, including all errors reported while preflighting
     *  @return length of the resulting string
     *  @see ucol_openFromShortString
     *  @see ucol_normalizeShortDefinitionString
     *  @see ucol_getShortDefinitionString
     *  @internal
     */
    virtual int32_t internalGetShortDefinitionString(const char *locale,
                                                     char *buffer,
                                                     int32_t capacity,
                                                     UErrorCode &status) const;
720
    /**
     * Implements ucol_nextSortKeyPart().
     * NOTE(review): the parameter descriptions below are inferred from the names and
     * from the C API this implements - confirm against ucol_nextSortKeyPart().
     * @param iter the UCharIterator supplying the text
     * @param state two uint32_t values of iteration state (presumably carried between calls)
     * @param dest output buffer for the sort key bytes
     * @param count presumably the number of bytes that may be written to dest
     * @param errorCode ICU error code
     * @return an int32_t, presumably the number of bytes written to dest
     * @internal
     */
    virtual int32_t internalNextSortKeyPart(
            UCharIterator *iter, uint32_t state[2],
            uint8_t *dest, int32_t count, UErrorCode &errorCode) const;
728
    // Do not enclose the default constructor with #ifndef U_HIDE_INTERNAL_API;
    // it is deliberately declared outside of that guarded section.
    /**
     * Default constructor.
     * Only for use in ucol_openRules().
     * @internal
     */
    RuleBasedCollator();
735
736#ifndef U_HIDE_INTERNAL_API
    /**
     * Implements ucol_getLocaleByType().
     * Needed because the lifetime of the locale ID string must match that of the collator.
     * getLocale() returns a copy of a Locale, with minimal lifetime in a C wrapper.
     * @param type which locale to return, as a ULocDataLocaleType
     * @param errorCode ICU error code
     * @return the locale ID as a C string
     * @internal
     */
    const char *internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const;
744
    /**
     * Implements ucol_getContractionsAndExpansions().
     * Gets this collator's sets of contraction strings and/or
     * characters and strings that map to multiple collation elements (expansions).
     * If addPrefixes is TRUE, then contractions that are expressed as
     * prefix/pre-context rules are included.
     * Either set pointer may be NULL to skip collecting that kind of mapping.
     * @param contractions if not NULL, the set to hold the contractions
     * @param expansions if not NULL, the set to hold the expansions
     * @param addPrefixes include prefix contextual mappings
     * @param errorCode in/out ICU error code
     * @internal
     */
    void internalGetContractionsAndExpansions(
            UnicodeSet *contractions, UnicodeSet *expansions,
            UBool addPrefixes, UErrorCode &errorCode) const;
760
    /**
     * Adds the contractions that start with character c to the set.
     * Ignores prefixes. Used by AlphabeticIndex.
     * @param c the starting character (code point) of the contractions to add
     * @param set the set that the contractions are added to
     * @param errorCode ICU error code
     * @internal
     */
    void internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const;
767
    /**
     * Implements from-rule constructors, and ucol_openRules().
     * NOTE(review): the parameter descriptions below are inferred from the names and
     * types only - confirm against the implementation.
     * @param rules the collation rule string to build the tailoring from
     * @param strength the collation strength to apply (passed as int32_t)
     * @param decompositionMode the decomposition (normalization) mode attribute value
     * @param outParseError presumably receives parse error details; pointer, so possibly optional
     * @param outReason presumably receives a textual reason for a failure; pointer, so possibly optional
     * @param errorCode ICU error code
     * @internal
     */
    void internalBuildTailoring(
            const UnicodeString &rules,
            int32_t strength,
            UColAttributeValue decompositionMode,
            UParseError *outParseError, UnicodeString *outReason,
            UErrorCode &errorCode);
778
779    /** @internal */
780    static inline RuleBasedCollator *rbcFromUCollator(UCollator *uc) {
781        return dynamic_cast<RuleBasedCollator *>(fromUCollator(uc));
782    }
783    /** @internal */
784    static inline const RuleBasedCollator *rbcFromUCollator(const UCollator *uc) {
785        return dynamic_cast<const RuleBasedCollator *>(fromUCollator(uc));
786    }
787
    /**
     * Appends the CEs (collation elements) for the string to the vector.
     * @param str the string whose CEs are computed
     * @param ces the vector that the CEs are appended to
     * @param errorCode ICU error code
     * @internal for tests & tools
     */
    void internalGetCEs(const UnicodeString &str, UVector64 &ces, UErrorCode &errorCode) const;
793#endif  // U_HIDE_INTERNAL_API
794
795protected:
   /**
    * Used internally by registration to define the requested, valid and actual locales.
    * @param requestedLocale the requested locale
    * @param validLocale the valid locale
    * @param actualLocale the actual locale
    * @internal
    */
    virtual void setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale);
804
805private:
    // These classes are granted access to this class's private and protected members.
    friend class CollationElementIterator;
    friend class Collator;
808
    /**
     * Constructs a collator from a collation cache entry.
     * NOTE(review): presumably retains the entry in the reference-counted
     * cacheEntry member - confirm in the implementation.
     * @param entry the cache entry to build the collator from
     */
    RuleBasedCollator(const CollationCacheEntry *entry);
810
    /**
     * Enumeration of attributes that are relevant for short definition strings
     * (e.g., ucol_getShortDefinitionString()).
     * Effectively extends UColAttribute.
     */
    enum Attributes {
        // Starts at UCOL_ATTRIBUTE_COUNT, presumably the first value after the UColAttribute range.
        ATTR_VARIABLE_TOP = UCOL_ATTRIBUTE_COUNT,
        // One past the last attribute value (exclusive upper bound).
        ATTR_LIMIT
    };
820
    /**
     * Installs the given tailoring in this collator.
     * NOTE(review): the name suggests that ownership of t passes to the
     * collator - confirm in the implementation.
     * @param t the tailoring
     * @param errorCode ICU error code
     */
    void adoptTailoring(CollationTailoring *t, UErrorCode &errorCode);
822
    // Compares two strings given as char16_t code units with explicit lengths.
    // Both lengths must be <0 or else both must be >=0.
    // NOTE(review): a negative length presumably marks a NUL-terminated string - confirm.
    UCollationResult doCompare(const char16_t *left, int32_t leftLength,
                               const char16_t *right, int32_t rightLength,
                               UErrorCode &errorCode) const;
    // Overload for strings given as bytes (uint8_t) with explicit lengths.
    // NOTE(review): presumably UTF-8 input under the same length convention - confirm.
    UCollationResult doCompare(const uint8_t *left, int32_t leftLength,
                               const uint8_t *right, int32_t rightLength,
                               UErrorCode &errorCode) const;
830
    // Writes the sort key for the string s of the given length into the sink.
    // NOTE(review): whether a negative length (NUL-terminated input) is accepted
    // is not visible here - confirm in the implementation.
    void writeSortKey(const char16_t *s, int32_t length,
                      SortKeyByteSink &sink, UErrorCode &errorCode) const;
833
    // Writes the identical-level portion of a sort key for the text between
    // s and limit into the sink.
    // NOTE(review): limit is presumably exclusive (one past the last code unit) - confirm.
    void writeIdenticalLevel(const char16_t *s, const char16_t *limit,
                             SortKeyByteSink &sink, UErrorCode &errorCode) const;
836
    // Returns a reference to the default CollationSettings.
    // NOTE(review): presumably the tailoring's default settings - confirm in the implementation.
    const CollationSettings &getDefaultSettings() const;
838
839    void setAttributeDefault(int32_t attribute) {
840        explicitlySetAttributes &= ~((uint32_t)1 << attribute);
841    }
842    void setAttributeExplicitly(int32_t attribute) {
843        explicitlySetAttributes |= (uint32_t)1 << attribute;
844    }
845    UBool attributeHasBeenSetExplicitly(int32_t attribute) const {
846        // assert(0 <= attribute < ATTR_LIMIT);
847        return (UBool)((explicitlySetAttributes & ((uint32_t)1 << attribute)) != 0);
848    }
849
    /**
     * Tests whether a character is "unsafe" for use as a collation starting point.
     * The argument may be either a full code point or a single code unit.
     *
     * @param c code point or code unit
     * @return TRUE if c is unsafe
     * @see CollationElementIterator#setOffset(int)
     */
    UBool isUnsafe(UChar32 c) const;
858
    // Static helper that computes the maximum-expansions data for tailoring t.
    // NOTE(review): the U_CALLCONV signature suggests it is passed as a callback
    // (e.g. a one-time initializer) - confirm at the call site.
    static void U_CALLCONV computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode);
    // Initializes the maximum-expansions data.
    // NOTE(review): presumably returns TRUE on success and reports failures
    // through errorCode - confirm in the implementation.
    UBool initMaxExpansions(UErrorCode &errorCode) const;
861
    // Sets the fast-Latin options on the given settings object, which is taken
    // by non-const reference and is therefore modifiable by this call.
    // NOTE(review): the parameter name suggests the caller must own (have a
    // writable copy of) these settings - confirm.
    void setFastLatinOptions(CollationSettings &ownedSettings) const;
863
    // NOTE(review): roles not stated by an existing trailing comment are inferred
    // from the member names - confirm in the implementation.
    const CollationData *data;  // presumably the collation data used by this collator
    const CollationSettings *settings;  // reference-counted
    const CollationTailoring *tailoring;  // alias of cacheEntry->tailoring
    const CollationCacheEntry *cacheEntry;  // reference-counted
    Locale validLocale;  // presumably the valid locale (cf. setLocales())
    // Bit set indexed by attribute number: written by setAttributeDefault() and
    // setAttributeExplicitly(), read by attributeHasBeenSetExplicitly().
    uint32_t explicitlySetAttributes;

    // presumably TRUE when the actual locale is the same as validLocale
    UBool actualLocaleIsSameAsValid;
872};
873
874U_NAMESPACE_END
875
876#endif  // !UCONFIG_NO_COLLATION
877#endif  // TBLCOLL_H
878