/**
 *******************************************************************************
 * Copyright (C) 2006,2012-2013, International Business Machines Corporation   *
 * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */
7
8#ifndef DICTBE_H
9#define DICTBE_H
10
11#include "unicode/utypes.h"
12#include "unicode/uniset.h"
13#include "unicode/utext.h"
14
15#include "brkeng.h"
16
17U_NAMESPACE_BEGIN
18
19class DictionaryMatcher;
20
21/*******************************************************************
22 * DictionaryBreakEngine
23 */
24
25/**
26 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
27 * dictionary to determine language-specific breaks.</p>
28 *
29 * <p>After it is constructed a DictionaryBreakEngine may be shared between
30 * threads without synchronization.</p>
31 */
32class DictionaryBreakEngine : public LanguageBreakEngine {
33 private:
34    /**
35     * The set of characters handled by this engine
36     * @internal
37     */
38
39  UnicodeSet    fSet;
40
41    /**
42     * The set of break types handled by this engine
43     * @internal
44     */
45
46  uint32_t      fTypes;
47
48  /**
49   * <p>Default constructor.</p>
50   *
51   */
52  DictionaryBreakEngine();
53
54 public:
55
56  /**
57   * <p>Constructor setting the break types handled.</p>
58   *
59   * @param breakTypes A bitmap of types handled by the engine.
60   */
61  DictionaryBreakEngine( uint32_t breakTypes );
62
63  /**
64   * <p>Virtual destructor.</p>
65   */
66  virtual ~DictionaryBreakEngine();
67
68  /**
69   * <p>Indicate whether this engine handles a particular character for
70   * a particular kind of break.</p>
71   *
72   * @param c A character which begins a run that the engine might handle
73   * @param breakType The type of text break which the caller wants to determine
74   * @return TRUE if this engine handles the particular character and break
75   * type.
76   */
77  virtual UBool handles( UChar32 c, int32_t breakType ) const;
78
79  /**
80   * <p>Find any breaks within a run in the supplied text.</p>
81   *
82   * @param text A UText representing the text. The iterator is left at
83   * the end of the run of characters which the engine is capable of handling
84   * that starts from the first (or last) character in the range.
85   * @param startPos The start of the run within the supplied text.
86   * @param endPos The end of the run within the supplied text.
87   * @param reverse Whether the caller is looking for breaks in a reverse
88   * direction.
89   * @param breakType The type of break desired, or -1.
90   * @param foundBreaks An allocated C array of the breaks found, if any
91   * @return The number of breaks found.
92   */
93  virtual int32_t findBreaks( UText *text,
94                              int32_t startPos,
95                              int32_t endPos,
96                              UBool reverse,
97                              int32_t breakType,
98                              UStack &foundBreaks ) const;
99
100 protected:
101
102 /**
103  * <p>Set the character set handled by this engine.</p>
104  *
105  * @param set A UnicodeSet of the set of characters handled by the engine
106  */
107  virtual void setCharacters( const UnicodeSet &set );
108
109 /**
110  * <p>Set the break types handled by this engine.</p>
111  *
112  * @param breakTypes A bitmap of types handled by the engine.
113  */
114//  virtual void setBreakTypes( uint32_t breakTypes );
115
116 /**
117  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
118  *
119  * @param text A UText representing the text
120  * @param rangeStart The start of the range of dictionary characters
121  * @param rangeEnd The end of the range of dictionary characters
122  * @param foundBreaks Output of C array of int32_t break positions, or 0
123  * @return The number of breaks found
124  */
125  virtual int32_t divideUpDictionaryRange( UText *text,
126                                           int32_t rangeStart,
127                                           int32_t rangeEnd,
128                                           UStack &foundBreaks ) const = 0;
129
130};
131
132/*******************************************************************
133 * ThaiBreakEngine
134 */
135
136/**
137 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
138 * dictionary and heuristics to determine Thai-specific breaks.</p>
139 *
140 * <p>After it is constructed a ThaiBreakEngine may be shared between
141 * threads without synchronization.</p>
142 */
143class ThaiBreakEngine : public DictionaryBreakEngine {
144 private:
145    /**
146     * The set of characters handled by this engine
147     * @internal
148     */
149
150  UnicodeSet                fThaiWordSet;
151  UnicodeSet                fEndWordSet;
152  UnicodeSet                fBeginWordSet;
153  UnicodeSet                fSuffixSet;
154  UnicodeSet                fMarkSet;
155  DictionaryMatcher  *fDictionary;
156
157 public:
158
159  /**
160   * <p>Default constructor.</p>
161   *
162   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
163   * engine is deleted.
164   */
165  ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
166
167  /**
168   * <p>Virtual destructor.</p>
169   */
170  virtual ~ThaiBreakEngine();
171
172 protected:
173 /**
174  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
175  *
176  * @param text A UText representing the text
177  * @param rangeStart The start of the range of dictionary characters
178  * @param rangeEnd The end of the range of dictionary characters
179  * @param foundBreaks Output of C array of int32_t break positions, or 0
180  * @return The number of breaks found
181  */
182  virtual int32_t divideUpDictionaryRange( UText *text,
183                                           int32_t rangeStart,
184                                           int32_t rangeEnd,
185                                           UStack &foundBreaks ) const;
186
187};
188
189/*******************************************************************
190 * LaoBreakEngine
191 */
192
193/**
194 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
195 * dictionary and heuristics to determine Lao-specific breaks.</p>
196 *
197 * <p>After it is constructed a LaoBreakEngine may be shared between
198 * threads without synchronization.</p>
199 */
200class LaoBreakEngine : public DictionaryBreakEngine {
201 private:
202    /**
203     * The set of characters handled by this engine
204     * @internal
205     */
206
207  UnicodeSet                fLaoWordSet;
208  UnicodeSet                fEndWordSet;
209  UnicodeSet                fBeginWordSet;
210  UnicodeSet                fMarkSet;
211  DictionaryMatcher  *fDictionary;
212
213 public:
214
215  /**
216   * <p>Default constructor.</p>
217   *
218   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
219   * engine is deleted.
220   */
221  LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
222
223  /**
224   * <p>Virtual destructor.</p>
225   */
226  virtual ~LaoBreakEngine();
227
228 protected:
229 /**
230  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
231  *
232  * @param text A UText representing the text
233  * @param rangeStart The start of the range of dictionary characters
234  * @param rangeEnd The end of the range of dictionary characters
235  * @param foundBreaks Output of C array of int32_t break positions, or 0
236  * @return The number of breaks found
237  */
238  virtual int32_t divideUpDictionaryRange( UText *text,
239                                           int32_t rangeStart,
240                                           int32_t rangeEnd,
241                                           UStack &foundBreaks ) const;
242
243};
244
245/*******************************************************************
246 * KhmerBreakEngine
247 */
248
249/**
250 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
251 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
252 *
253 * <p>After it is constructed a KhmerBreakEngine may be shared between
254 * threads without synchronization.</p>
255 */
256class KhmerBreakEngine : public DictionaryBreakEngine {
257 private:
258    /**
259     * The set of characters handled by this engine
260     * @internal
261     */
262
263  UnicodeSet                fKhmerWordSet;
264  UnicodeSet                fEndWordSet;
265  UnicodeSet                fBeginWordSet;
266  UnicodeSet                fMarkSet;
267  DictionaryMatcher  *fDictionary;
268
269 public:
270
271  /**
272   * <p>Default constructor.</p>
273   *
274   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
275   * engine is deleted.
276   */
277  KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
278
279  /**
280   * <p>Virtual destructor.</p>
281   */
282  virtual ~KhmerBreakEngine();
283
284 protected:
285 /**
286  * <p>Divide up a range of known dictionary characters.</p>
287  *
288  * @param text A UText representing the text
289  * @param rangeStart The start of the range of dictionary characters
290  * @param rangeEnd The end of the range of dictionary characters
291  * @param foundBreaks Output of C array of int32_t break positions, or 0
292  * @return The number of breaks found
293  */
294  virtual int32_t divideUpDictionaryRange( UText *text,
295                                           int32_t rangeStart,
296                                           int32_t rangeEnd,
297                                           UStack &foundBreaks ) const;
298
299};
300
301#if !UCONFIG_NO_NORMALIZATION
302
/*******************************************************************
 * CjkBreakEngine
 */

/**
 * Indicates the language/script that a CjkBreakEngine will handle.
 * The value is passed to the CjkBreakEngine constructor.
 */
enum LanguageType {
    kKorean,            // first enumerator, value 0
    kChineseJapanese    // value 1
};
312
313/**
314 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
315 * dictionary with costs associated with each word and
316 * Viterbi decoding to determine CJK-specific breaks.</p>
317 */
318class CjkBreakEngine : public DictionaryBreakEngine {
319 protected:
320    /**
321     * The set of characters handled by this engine
322     * @internal
323     */
324  UnicodeSet                fHangulWordSet;
325  UnicodeSet                fHanWordSet;
326  UnicodeSet                fKatakanaWordSet;
327  UnicodeSet                fHiraganaWordSet;
328
329  DictionaryMatcher  *fDictionary;
330
331 public:
332
333    /**
334     * <p>Default constructor.</p>
335     *
336     * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
337     * engine is deleted. The DictionaryMatcher must contain costs for each word
338     * in order for the dictionary to work properly.
339     */
340  CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
341
342    /**
343     * <p>Virtual destructor.</p>
344     */
345  virtual ~CjkBreakEngine();
346
347 protected:
348    /**
349     * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
350     *
351     * @param text A UText representing the text
352     * @param rangeStart The start of the range of dictionary characters
353     * @param rangeEnd The end of the range of dictionary characters
354     * @param foundBreaks Output of C array of int32_t break positions, or 0
355     * @return The number of breaks found
356     */
357  virtual int32_t divideUpDictionaryRange( UText *text,
358          int32_t rangeStart,
359          int32_t rangeEnd,
360          UStack &foundBreaks ) const;
361
362};
363
364#endif
365
366U_NAMESPACE_END
367
368    /* DICTBE_H */
369#endif
370