1/*
2*******************************************************************************
3* Copyright (C) 2012-2014, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* collationdatabuilder.h
7*
8* created on: 2012apr01
9* created by: Markus W. Scherer
10*/
11
12#ifndef __COLLATIONDATABUILDER_H__
13#define __COLLATIONDATABUILDER_H__
14
15#include "unicode/utypes.h"
16
17#if !UCONFIG_NO_COLLATION
18
19#include "unicode/uniset.h"
20#include "unicode/unistr.h"
21#include "unicode/uversion.h"
22#include "collation.h"
23#include "collationdata.h"
24#include "collationsettings.h"
25#include "normalizer2impl.h"
26#include "utrie2.h"
27#include "uvectr32.h"
28#include "uvectr64.h"
29#include "uvector.h"
30
31U_NAMESPACE_BEGIN
32
33struct ConditionalCE32;
34
35class CollationFastLatinBuilder;
36class CopyHelper;
37class DataBuilderCollationIterator;
38class UCharsTrieBuilder;
39
40/**
41 * Low-level CollationData builder.
42 * Takes (character, CE) pairs and builds them into runtime data structures.
43 * Supports characters with context prefixes and contraction suffixes.
44 */
45class U_I18N_API CollationDataBuilder : public UObject {
46public:
47    /**
48     * Collation element modifier. Interface class for a modifier
49     * that changes a tailoring builder's temporary CEs to final CEs.
50     * Called for every non-special CE32 and every expansion CE.
51     */
52    class CEModifier : public UObject {
53    public:
54        virtual ~CEModifier();
55        /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */
56        virtual int64_t modifyCE32(uint32_t ce32) const = 0;
57        /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */
58        virtual int64_t modifyCE(int64_t ce) const = 0;
59    };
60
61    CollationDataBuilder(UErrorCode &errorCode);
62
63    virtual ~CollationDataBuilder();
64
65    void initForTailoring(const CollationData *b, UErrorCode &errorCode);
66
67    virtual UBool isCompressibleLeadByte(uint32_t b) const;
68
69    inline UBool isCompressiblePrimary(uint32_t p) const {
70        return isCompressibleLeadByte(p >> 24);
71    }
72
73    /**
74     * @return TRUE if this builder has mappings (e.g., add() has been called)
75     */
76    UBool hasMappings() const { return modified; }
77
78    /**
79     * @return TRUE if c has CEs in this builder
80     */
81    UBool isAssigned(UChar32 c) const;
82
83    /**
84     * @return the three-byte primary if c maps to a single such CE and has no context data,
85     * otherwise returns 0.
86     */
87    uint32_t getLongPrimaryIfSingleCE(UChar32 c) const;
88
89    /**
90     * @return the single CE for c.
91     * Sets an error code if c does not have a single CE.
92     */
93    int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
94
95    void add(const UnicodeString &prefix, const UnicodeString &s,
96             const int64_t ces[], int32_t cesLength,
97             UErrorCode &errorCode);
98
99    /**
100     * Encodes the ces as either the returned ce32 by itself,
101     * or by storing an expansion, with the returned ce32 referring to that.
102     *
103     * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
104     */
105    virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
106    void addCE32(const UnicodeString &prefix, const UnicodeString &s,
107                 uint32_t ce32, UErrorCode &errorCode);
108
109    /**
110     * Sets three-byte-primary CEs for a range of code points in code point order,
111     * if it is worth doing; otherwise no change is made.
112     * None of the code points in the range should have complex mappings so far
113     * (expansions/contractions/prefixes).
114     * @param start first code point
115     * @param end last code point (inclusive)
116     * @param primary primary weight for 'start'
117     * @param step per-code point primary-weight increment
118     * @param errorCode ICU in/out error code
119     * @return TRUE if an OFFSET_TAG range was used for start..end
120     */
121    UBool maybeSetPrimaryRange(UChar32 start, UChar32 end,
122                               uint32_t primary, int32_t step,
123                               UErrorCode &errorCode);
124
125    /**
126     * Sets three-byte-primary CEs for a range of code points in code point order.
127     * Sets range values if that is worth doing, or else individual values.
128     * None of the code points in the range should have complex mappings so far
129     * (expansions/contractions/prefixes).
130     * @param start first code point
131     * @param end last code point (inclusive)
132     * @param primary primary weight for 'start'
133     * @param step per-code point primary-weight increment
134     * @param errorCode ICU in/out error code
135     * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step
136     */
137    uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
138                                          uint32_t primary, int32_t step,
139                                          UErrorCode &errorCode);
140
141    /**
142     * Copies all mappings from the src builder, with modifications.
143     * This builder here must not be built yet, and should be empty.
144     */
145    void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
146                  UErrorCode &errorCode);
147
148    void optimize(const UnicodeSet &set, UErrorCode &errorCode);
149    void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode);
150
151    void enableFastLatin() { fastLatinEnabled = TRUE; }
152    virtual void build(CollationData &data, UErrorCode &errorCode);
153
154    /**
155     * Looks up CEs for s and appends them to the ces array.
156     * Does not handle normalization: s should be in FCD form.
157     *
158     * Does not write completely ignorable CEs.
159     * Does not write beyond Collation::MAX_EXPANSION_LENGTH.
160     *
161     * @return incremented cesLength
162     */
163    int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength);
164    int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s,
165                   int64_t ces[], int32_t cesLength);
166
167protected:
168    friend class CopyHelper;
169    friend class DataBuilderCollationIterator;
170
171    uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const;
172
173    int32_t addCE(int64_t ce, UErrorCode &errorCode);
174    int32_t addCE32(uint32_t ce32, UErrorCode &errorCode);
175    int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode);
176
177    inline ConditionalCE32 *getConditionalCE32(int32_t index) const {
178        return static_cast<ConditionalCE32 *>(conditionalCE32s[index]);
179    }
180    inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const {
181        return getConditionalCE32(Collation::indexFromCE32(ce32));
182    }
183
184    static uint32_t makeBuilderContextCE32(int32_t index) {
185        return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index);
186    }
187    static inline UBool isBuilderContextCE32(uint32_t ce32) {
188        return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG);
189    }
190
191    static uint32_t encodeOneCEAsCE32(int64_t ce);
192    uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode);
193    uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode);
194    uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode);
195
196    uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode);
197    /**
198     * Copies base contractions to a list of ConditionalCE32.
199     * Sets cond->next to the index of the first new item
200     * and returns the index of the last new item.
201     */
202    int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
203                                         ConditionalCE32 *cond, UErrorCode &errorCode);
204
205    UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode);
206    void setDigitTags(UErrorCode &errorCode);
207    void setLeadSurrogates(UErrorCode &errorCode);
208
209    void buildMappings(CollationData &data, UErrorCode &errorCode);
210
211    void clearContexts();
212    void buildContexts(UErrorCode &errorCode);
213    uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode);
214    int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
215                           UErrorCode &errorCode);
216
217    void buildFastLatinTable(CollationData &data, UErrorCode &errorCode);
218
219    int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength);
220
221    static UChar32 jamoCpFromIndex(int32_t i) {
222        // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27
223        if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; }
224        i -= Hangul::JAMO_L_COUNT;
225        if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; }
226        i -= Hangul::JAMO_V_COUNT;
227        // i < 27
228        return Hangul::JAMO_T_BASE + 1 + i;
229    }
230
231    /** @see Collation::BUILDER_DATA_TAG */
232    static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100;
233
234    const Normalizer2Impl &nfcImpl;
235    const CollationData *base;
236    const CollationSettings *baseSettings;
237    UTrie2 *trie;
238    UVector32 ce32s;
239    UVector64 ce64s;
240    UVector conditionalCE32s;  // vector of ConditionalCE32
241    // Characters that have context (prefixes or contraction suffixes).
242    UnicodeSet contextChars;
243    // Serialized UCharsTrie structures for finalized contexts.
244    UnicodeString contexts;
245    UnicodeSet unsafeBackwardSet;
246    UBool modified;
247
248    UBool fastLatinEnabled;
249    CollationFastLatinBuilder *fastLatinBuilder;
250
251    DataBuilderCollationIterator *collIter;
252};
253
254U_NAMESPACE_END
255
256#endif  // !UCONFIG_NO_COLLATION
257#endif  // __COLLATIONDATABUILDER_H__
258