/**
 *******************************************************************************
 * Copyright (C) 2006,2011, International Business Machines Corporation        *
 * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */

8#ifndef DICTBE_H
9#define DICTBE_H
10
11#include "unicode/utypes.h"
12#include "unicode/uniset.h"
13#include "unicode/utext.h"
14
15#include "brkeng.h"
16
17U_NAMESPACE_BEGIN
18
19class TrieWordDictionary;
20
21/*******************************************************************
22 * DictionaryBreakEngine
23 */
24
25/**
26 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
27 * dictionary to determine language-specific breaks.</p>
28 *
29 * <p>After it is constructed a DictionaryBreakEngine may be shared between
30 * threads without synchronization.</p>
31 */
32class DictionaryBreakEngine : public LanguageBreakEngine {
33 private:
34    /**
35     * The set of characters handled by this engine
36     * @internal
37     */
38
39  UnicodeSet    fSet;
40
41    /**
42     * The set of break types handled by this engine
43     * @internal
44     */
45
46  uint32_t      fTypes;
47
48  /**
49   * <p>Default constructor.</p>
50   *
51   */
52  DictionaryBreakEngine();
53
54 public:
55
56  /**
57   * <p>Constructor setting the break types handled.</p>
58   *
59   * @param breakTypes A bitmap of types handled by the engine.
60   */
61  DictionaryBreakEngine( uint32_t breakTypes );
62
63  /**
64   * <p>Virtual destructor.</p>
65   */
66  virtual ~DictionaryBreakEngine();
67
68 /**
69  * <p>Indicate whether this engine handles a particular character for
70  * a particular kind of break.</p>
71  *
72  * @param c A character which begins a run that the engine might handle
73  * @param breakType The type of text break which the caller wants to determine
74  * @return TRUE if this engine handles the particular character and break
75  * type.
76  */
77  virtual UBool handles( UChar32 c, int32_t breakType ) const;
78
79 /**
80  * <p>Find any breaks within a run in the supplied text.</p>
81  *
82  * @param text A UText representing the text. The
83  * iterator is left at the end of the run of characters which the engine
84  * is capable of handling.
85  * @param startPos The start of the run within the supplied text.
86  * @param endPos The end of the run within the supplied text.
87  * @param reverse Whether the caller is looking for breaks in a reverse
88  * direction.
89  * @param breakType The type of break desired, or -1.
90  * @param foundBreaks An allocated C array of the breaks found, if any
91  * @return The number of breaks found.
92  */
93  virtual int32_t findBreaks( UText *text,
94                              int32_t startPos,
95                              int32_t endPos,
96                              UBool reverse,
97                              int32_t breakType,
98                              UStack &foundBreaks ) const;
99
100 protected:
101
102 /**
103  * <p>Set the character set handled by this engine.</p>
104  *
105  * @param set A UnicodeSet of the set of characters handled by the engine
106  */
107  virtual void setCharacters( const UnicodeSet &set );
108
109 /**
110  * <p>Set the break types handled by this engine.</p>
111  *
112  * @param breakTypes A bitmap of types handled by the engine.
113  */
114//  virtual void setBreakTypes( uint32_t breakTypes );
115
116 /**
117  * <p>Divide up a range of known dictionary characters.</p>
118  *
119  * @param text A UText representing the text
120  * @param rangeStart The start of the range of dictionary characters
121  * @param rangeEnd The end of the range of dictionary characters
122  * @param foundBreaks Output of C array of int32_t break positions, or 0
123  * @return The number of breaks found
124  */
125  virtual int32_t divideUpDictionaryRange( UText *text,
126                                           int32_t rangeStart,
127                                           int32_t rangeEnd,
128                                           UStack &foundBreaks ) const = 0;
129
130};
131
132/*******************************************************************
133 * ThaiBreakEngine
134 */
135
136/**
137 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
138 * TrieWordDictionary and heuristics to determine Thai-specific breaks.</p>
139 *
140 * <p>After it is constructed a ThaiBreakEngine may be shared between
141 * threads without synchronization.</p>
142 */
143class ThaiBreakEngine : public DictionaryBreakEngine {
144 private:
145    /**
146     * The set of characters handled by this engine
147     * @internal
148     */
149
150  UnicodeSet                fThaiWordSet;
151  UnicodeSet                fEndWordSet;
152  UnicodeSet                fBeginWordSet;
153  UnicodeSet                fSuffixSet;
154  UnicodeSet                fMarkSet;
155  const TrieWordDictionary  *fDictionary;
156
157 public:
158
159  /**
160   * <p>Default constructor.</p>
161   *
162   * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
163   * engine is deleted.
164   */
165  ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status);
166
167  /**
168   * <p>Virtual destructor.</p>
169   */
170  virtual ~ThaiBreakEngine();
171
172 protected:
173 /**
174  * <p>Divide up a range of known dictionary characters.</p>
175  *
176  * @param text A UText representing the text
177  * @param rangeStart The start of the range of dictionary characters
178  * @param rangeEnd The end of the range of dictionary characters
179  * @param foundBreaks Output of C array of int32_t break positions, or 0
180  * @return The number of breaks found
181  */
182  virtual int32_t divideUpDictionaryRange( UText *text,
183                                           int32_t rangeStart,
184                                           int32_t rangeEnd,
185                                           UStack &foundBreaks ) const;
186
187};
188
189
190/*******************************************************************
191 * KhmerBreakEngine
192 */
193
194/**
195 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
196 * TrieWordDictionary and heuristics to determine Khmer-specific breaks.</p>
197 *
198 * <p>After it is constructed a KhmerBreakEngine may be shared between
199 * threads without synchronization.</p>
200 */
201class KhmerBreakEngine : public DictionaryBreakEngine {
202 private:
203    /**
204     * The set of characters handled by this engine
205     * @internal
206     */
207
208  UnicodeSet                fKhmerWordSet;
209  UnicodeSet                fEndWordSet;
210  UnicodeSet                fBeginWordSet;
211  UnicodeSet                fMarkSet;
212  const TrieWordDictionary  *fDictionary;
213
214 public:
215
216  /**
217   * <p>Default constructor.</p>
218   *
219   * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
220   * engine is deleted.
221   */
222  KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status);
223
224  /**
225   * <p>Virtual destructor.</p>
226   */
227  virtual ~KhmerBreakEngine();
228
229 protected:
230 /**
231  * <p>Divide up a range of known dictionary characters.</p>
232  *
233  * @param text A UText representing the text
234  * @param rangeStart The start of the range of dictionary characters
235  * @param rangeEnd The end of the range of dictionary characters
236  * @param foundBreaks Output of C array of int32_t break positions, or 0
237  * @return The number of breaks found
238  */
239  virtual int32_t divideUpDictionaryRange( UText *text,
240                                           int32_t rangeStart,
241                                           int32_t rangeEnd,
242                                           UStack &foundBreaks ) const;
243
244};
245
246
247U_NAMESPACE_END
248
249    /* DICTBE_H */
250#endif
251