1/*
2 ******************************************************************************
3 *   Copyright (C) 1996-2012, International Business Machines                 *
4 *   Corporation and others.  All Rights Reserved.                            *
5 ******************************************************************************
6 */
7
8/**
9 * \file
10 * \brief Originally, added as C++ API for Collation data used to compute minLengthInChars
11 * \internal
12 */
13
14/*
15 * Note: This module was incldued in ICU 4.0.1 as @internal technology preview for supporting
16 * Boyer-Moore string search API. For now, only SSearchTest depends on this module. I temporaly
17 * moved the module from i18n directory to intltest, because we have no plan to publish this
18 * as public API. (2012-12-18 yoshito)
19 */
20
21#ifndef COLL_DATA_H
22#define COLL_DATA_H
23
24#include "unicode/utypes.h"
25
26#if !UCONFIG_NO_COLLATION
27
28#include "unicode/ucol.h"
29#include "unicode/unistr.h"
30
31 /**
32  * The size of the internal CE buffer in a <code>CEList</code> object
33  */
34#define CELIST_BUFFER_SIZE 4
35
36/**
37 * \def INSTRUMENT_CELIST
38 * Define this to enable the <code>CEList</code> objects to collect
39 * statistics.
40 */
41
42 /**
43  * The size of the initial list in a <code>StringList</code> object.
44  */
45#define STRING_LIST_BUFFER_SIZE 16
46
47U_NAMESPACE_USE
48
49 /**
50  * This object holds a list of CEs generated from a particular
51  * <code>UnicodeString</code>
52  *
53  */
54class CEList
55{
56public:
57    /**
58     * Construct a <code>CEList</code> object.
59     *
60     * @param coll - the Collator used to collect the CEs.
61     * @param string - the string for which to collect the CEs.
62     * @param status - will be set if any errors occur.
63     *
64     * Note: if on return, status is set to an error code,
65     * the only safe thing to do with this object is to call
66     * the destructor.
67     */
68    CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status);
69
70    /**
71     * The destructor.
72     */
73    ~CEList();
74
75    /**
76     * Return the number of CEs in the list.
77     *
78     * @return the number of CEs in the list.
79     */
80    int32_t size() const;
81
82    /**
83     * Get a particular CE from the list.
84     *
85     * @param index - the index of the CE to return
86     *
87     * @return the CE, or <code>0</code> if <code>index</code> is out of range
88     */
89    uint32_t get(int32_t index) const;
90
91    /**
92     * Check if the CEs in another <code>CEList</code> match the
93     * suffix of this list starting at a give offset.
94     *
95     * @param offset - the offset of the suffix
96     * @param other - the other <code>CEList</code>
97     *
98     * @return <code>TRUE</code> if the CEs match, <code>FALSE</code> otherwise.
99     */
100    UBool matchesAt(int32_t offset, const CEList *other) const;
101
102    /**
103     * The index operator.
104     *
105     * @param index - the index
106     *
107     * @return a reference to the given CE in the list
108     */
109    uint32_t &operator[](int32_t index) const;
110
111private:
112    void add(uint32_t ce, UErrorCode &status);
113
114    uint32_t ceBuffer[CELIST_BUFFER_SIZE];
115    uint32_t *ces;
116    int32_t listMax;
117    int32_t listSize;
118};
119
120/**
121 * StringList
122 *
123 * This object holds a list of <code>UnicodeString</code> objects.
124 */
125class StringList
126{
127public:
128    /**
129     * Construct an empty <code>StringList</code>
130     *
131     * @param status - will be set if any errors occur.
132     *
133     * Note: if on return, status is set to an error code,
134     * the only safe thing to do with this object is to call
135     * the destructor.
136     */
137    StringList(UErrorCode &status);
138
139    /**
140     * The destructor.
141     */
142    ~StringList();
143
144    /**
145     * Add a string to the list.
146     *
147     * @param string - the string to add
148     * @param status - will be set if any errors occur.
149     */
150    void add(const UnicodeString *string, UErrorCode &status);
151
152    /**
153     * Add an array of Unicode code points to the list.
154     *
155     * @param chars - the address of the array of code points
156     * @param count - the number of code points in the array
157     * @param status - will be set if any errors occur.
158     */
159    void add(const UChar *chars, int32_t count, UErrorCode &status);
160
161    /**
162     * Get a particular string from the list.
163     *
164     * @param index - the index of the string
165     *
166     * @return a pointer to the <code>UnicodeString</code> or <code>NULL</code>
167     *         if <code>index</code> is out of bounds.
168     */
169    const UnicodeString *get(int32_t index) const;
170
171    /**
172     * Get the number of stings in the list.
173     *
174     * @return the number of strings in the list.
175     */
176    int32_t size() const;
177
178private:
179    UnicodeString *strings;
180    int32_t listMax;
181    int32_t listSize;
182};
183
184
185/*
186 * Forward references to internal classes.
187 */
188class StringToCEsMap;
189class CEToStringsMap;
190
191/**
192 * CollData
193 *
194 * This class holds the Collator-specific data needed to
195 * compute the length of the shortest string that can
196 * generate a partcular list of CEs.
197 *
198 * <code>CollData</code> objects are quite expensive to compute. Because
199 * of this, they are cached. When you call <code>CollData::open</code> it
200 * returns a reference counted cached object. When you call <code>CollData::close</code>
201 * the reference count on the object is decremented but the object is not deleted.
202 *
203 * If you do not need to reuse any unreferenced objects in the cache, you can call
204 * <code>CollData::flushCollDataCache</code>. If you no longer need any <code>CollData</code>
205 * objects, you can call <code>CollData::freeCollDataCache</code>
206 */
207class CollData
208{
209public:
210    /**
211     * Construct a <code>CollData</code> object.
212     *
213     * @param collator - the collator
214     * @param status - will be set if any errors occur.
215     */
216    CollData(UCollator *collator, UErrorCode &status);
217
218    /**
219     * The destructor.
220     */
221    ~CollData();
222
223    /**
224     * Get the <code>UCollator</code> object used to create this object.
225     * The object returned may not be the exact object that was used to
226     * create this object, but it will have the same behavior.
227     */
228    UCollator *getCollator() const;
229
230    /**
231     * Get a list of all the strings which generate a list
232     * of CEs starting with a given CE.
233     *
234     * @param ce - the CE
235     *
236     * return a <code>StringList</code> object containing all
237     *        the stirngs, or <code>NULL</code> if there are
238     *        no such strings.
239     */
240    const StringList *getStringList(int32_t ce) const;
241
242    /**
243     * Get a list of the CEs generated by a partcular stirng.
244     *
245     * @param string - the string
246     *
247     * @return a <code>CEList</code> object containt the CEs. You
248     *         must call <code>freeCEList</code> when you are finished
249     *         using the <code>CEList</code>/
250     */
251    const CEList *getCEList(const UnicodeString *string) const;
252
253    /**
254     * Release a <code>CEList</code> returned by <code>getCEList</code>.
255     *
256     * @param list - the <code>CEList</code> to free.
257     */
258    void freeCEList(const CEList *list);
259
260    /**
261     * Return the length of the shortest string that will generate
262     * the given list of CEs.
263     *
264     * @param ces - the CEs
265     * @param offset - the offset of the first CE in the list to use.
266     *
267     * @return the length of the shortest string.
268     */
269    int32_t minLengthInChars(const CEList *ces, int32_t offset) const;
270
271
272    /**
273     * Return the length of the shortest string that will generate
274     * the given list of CEs.
275     *
276     * Note: the algorithm used to do this computation is recursive. To
277     * limit the amount of recursion, a "history" list is used to record
278     * the best answer starting at a particular offset in the list of CEs.
279     * If the same offset is visited again during the recursion, the answer
280     * in the history list is used.
281     *
282     * @param ces - the CEs
283     * @param offset - the offset of the first CE in the list to use.
284     * @param history - the history list. Must be at least as long as
285     *                 the number of cEs in the <code>CEList</code>
286     *
287     * @return the length of the shortest string.
288     */
289   int32_t minLengthInChars(const CEList *ces, int32_t offset, int32_t *history) const;
290
291private:
292    UCollator      *coll;
293    CEToStringsMap *ceToCharsStartingWith;
294
295    uint32_t minHan;
296    uint32_t maxHan;
297
298    uint32_t jamoLimits[4];
299};
300
301#endif // #if !UCONFIG_NO_COLLATION
302#endif // #ifndef COLL_DATA_H
303