/**
 *******************************************************************************
 * Copyright (C) 2006-2014, International Business Machines Corporation   *
 * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */
7
8#ifndef DICTBE_H
9#define DICTBE_H
10
11#include "unicode/utypes.h"
12#include "unicode/uniset.h"
13#include "unicode/utext.h"
14
15#include "brkeng.h"
16
17U_NAMESPACE_BEGIN
18
// Forward declarations. This header only holds pointers to these types
// (see the fDictionary and nfkcNorm2 members below), so their full
// definitions are not required here.
class DictionaryMatcher;
class Normalizer2;
21
22/*******************************************************************
23 * DictionaryBreakEngine
24 */
25
26/**
27 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
28 * dictionary to determine language-specific breaks.</p>
29 *
30 * <p>After it is constructed a DictionaryBreakEngine may be shared between
31 * threads without synchronization.</p>
32 */
33class DictionaryBreakEngine : public LanguageBreakEngine {
34 private:
35    /**
36     * The set of characters handled by this engine
37     * @internal
38     */
39
40  UnicodeSet    fSet;
41
42    /**
43     * The set of break types handled by this engine
44     * @internal
45     */
46
47  uint32_t      fTypes;
48
49  /**
50   * <p>Default constructor.</p>
51   *
52   */
53  DictionaryBreakEngine();
54
55 public:
56
57  /**
58   * <p>Constructor setting the break types handled.</p>
59   *
60   * @param breakTypes A bitmap of types handled by the engine.
61   */
62  DictionaryBreakEngine( uint32_t breakTypes );
63
64  /**
65   * <p>Virtual destructor.</p>
66   */
67  virtual ~DictionaryBreakEngine();
68
69  /**
70   * <p>Indicate whether this engine handles a particular character for
71   * a particular kind of break.</p>
72   *
73   * @param c A character which begins a run that the engine might handle
74   * @param breakType The type of text break which the caller wants to determine
75   * @return TRUE if this engine handles the particular character and break
76   * type.
77   */
78  virtual UBool handles( UChar32 c, int32_t breakType ) const;
79
80  /**
81   * <p>Find any breaks within a run in the supplied text.</p>
82   *
83   * @param text A UText representing the text. The iterator is left at
84   * the end of the run of characters which the engine is capable of handling
85   * that starts from the first (or last) character in the range.
86   * @param startPos The start of the run within the supplied text.
87   * @param endPos The end of the run within the supplied text.
88   * @param reverse Whether the caller is looking for breaks in a reverse
89   * direction.
90   * @param breakType The type of break desired, or -1.
91   * @param foundBreaks An allocated C array of the breaks found, if any
92   * @return The number of breaks found.
93   */
94  virtual int32_t findBreaks( UText *text,
95                              int32_t startPos,
96                              int32_t endPos,
97                              UBool reverse,
98                              int32_t breakType,
99                              UStack &foundBreaks ) const;
100
101 protected:
102
103 /**
104  * <p>Set the character set handled by this engine.</p>
105  *
106  * @param set A UnicodeSet of the set of characters handled by the engine
107  */
108  virtual void setCharacters( const UnicodeSet &set );
109
110 /**
111  * <p>Set the break types handled by this engine.</p>
112  *
113  * @param breakTypes A bitmap of types handled by the engine.
114  */
115//  virtual void setBreakTypes( uint32_t breakTypes );
116
117 /**
118  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
119  *
120  * @param text A UText representing the text
121  * @param rangeStart The start of the range of dictionary characters
122  * @param rangeEnd The end of the range of dictionary characters
123  * @param foundBreaks Output of C array of int32_t break positions, or 0
124  * @return The number of breaks found
125  */
126  virtual int32_t divideUpDictionaryRange( UText *text,
127                                           int32_t rangeStart,
128                                           int32_t rangeEnd,
129                                           UStack &foundBreaks ) const = 0;
130
131};
132
133/*******************************************************************
134 * ThaiBreakEngine
135 */
136
137/**
138 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
139 * dictionary and heuristics to determine Thai-specific breaks.</p>
140 *
141 * <p>After it is constructed a ThaiBreakEngine may be shared between
142 * threads without synchronization.</p>
143 */
144class ThaiBreakEngine : public DictionaryBreakEngine {
145 private:
146    /**
147     * The set of characters handled by this engine
148     * @internal
149     */
150
151  UnicodeSet                fThaiWordSet;
152  UnicodeSet                fEndWordSet;
153  UnicodeSet                fBeginWordSet;
154  UnicodeSet                fSuffixSet;
155  UnicodeSet                fMarkSet;
156  DictionaryMatcher  *fDictionary;
157
158 public:
159
160  /**
161   * <p>Default constructor.</p>
162   *
163   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
164   * engine is deleted.
165   */
166  ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
167
168  /**
169   * <p>Virtual destructor.</p>
170   */
171  virtual ~ThaiBreakEngine();
172
173 protected:
174 /**
175  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
176  *
177  * @param text A UText representing the text
178  * @param rangeStart The start of the range of dictionary characters
179  * @param rangeEnd The end of the range of dictionary characters
180  * @param foundBreaks Output of C array of int32_t break positions, or 0
181  * @return The number of breaks found
182  */
183  virtual int32_t divideUpDictionaryRange( UText *text,
184                                           int32_t rangeStart,
185                                           int32_t rangeEnd,
186                                           UStack &foundBreaks ) const;
187
188};
189
190/*******************************************************************
191 * LaoBreakEngine
192 */
193
194/**
195 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
196 * dictionary and heuristics to determine Lao-specific breaks.</p>
197 *
198 * <p>After it is constructed a LaoBreakEngine may be shared between
199 * threads without synchronization.</p>
200 */
201class LaoBreakEngine : public DictionaryBreakEngine {
202 private:
203    /**
204     * The set of characters handled by this engine
205     * @internal
206     */
207
208  UnicodeSet                fLaoWordSet;
209  UnicodeSet                fEndWordSet;
210  UnicodeSet                fBeginWordSet;
211  UnicodeSet                fMarkSet;
212  DictionaryMatcher  *fDictionary;
213
214 public:
215
216  /**
217   * <p>Default constructor.</p>
218   *
219   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
220   * engine is deleted.
221   */
222  LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
223
224  /**
225   * <p>Virtual destructor.</p>
226   */
227  virtual ~LaoBreakEngine();
228
229 protected:
230 /**
231  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
232  *
233  * @param text A UText representing the text
234  * @param rangeStart The start of the range of dictionary characters
235  * @param rangeEnd The end of the range of dictionary characters
236  * @param foundBreaks Output of C array of int32_t break positions, or 0
237  * @return The number of breaks found
238  */
239  virtual int32_t divideUpDictionaryRange( UText *text,
240                                           int32_t rangeStart,
241                                           int32_t rangeEnd,
242                                           UStack &foundBreaks ) const;
243
244};
245
246/*******************************************************************
247 * BurmeseBreakEngine
248 */
249
250/**
251 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
252 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
253 *
254 * <p>After it is constructed a BurmeseBreakEngine may be shared between
255 * threads without synchronization.</p>
256 */
257class BurmeseBreakEngine : public DictionaryBreakEngine {
258 private:
259    /**
260     * The set of characters handled by this engine
261     * @internal
262     */
263
264  UnicodeSet                fBurmeseWordSet;
265  UnicodeSet                fEndWordSet;
266  UnicodeSet                fBeginWordSet;
267  UnicodeSet                fMarkSet;
268  DictionaryMatcher  *fDictionary;
269
270 public:
271
272  /**
273   * <p>Default constructor.</p>
274   *
275   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
276   * engine is deleted.
277   */
278  BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
279
280  /**
281   * <p>Virtual destructor.</p>
282   */
283  virtual ~BurmeseBreakEngine();
284
285 protected:
286 /**
287  * <p>Divide up a range of known dictionary characters.</p>
288  *
289  * @param text A UText representing the text
290  * @param rangeStart The start of the range of dictionary characters
291  * @param rangeEnd The end of the range of dictionary characters
292  * @param foundBreaks Output of C array of int32_t break positions, or 0
293  * @return The number of breaks found
294  */
295  virtual int32_t divideUpDictionaryRange( UText *text,
296                                           int32_t rangeStart,
297                                           int32_t rangeEnd,
298                                           UStack &foundBreaks ) const;
299
300};
301
302/*******************************************************************
303 * KhmerBreakEngine
304 */
305
306/**
307 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
308 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
309 *
310 * <p>After it is constructed a KhmerBreakEngine may be shared between
311 * threads without synchronization.</p>
312 */
313class KhmerBreakEngine : public DictionaryBreakEngine {
314 private:
315    /**
316     * The set of characters handled by this engine
317     * @internal
318     */
319
320  UnicodeSet                fKhmerWordSet;
321  UnicodeSet                fEndWordSet;
322  UnicodeSet                fBeginWordSet;
323  UnicodeSet                fMarkSet;
324  DictionaryMatcher  *fDictionary;
325
326 public:
327
328  /**
329   * <p>Default constructor.</p>
330   *
331   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
332   * engine is deleted.
333   */
334  KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
335
336  /**
337   * <p>Virtual destructor.</p>
338   */
339  virtual ~KhmerBreakEngine();
340
341 protected:
342 /**
343  * <p>Divide up a range of known dictionary characters.</p>
344  *
345  * @param text A UText representing the text
346  * @param rangeStart The start of the range of dictionary characters
347  * @param rangeEnd The end of the range of dictionary characters
348  * @param foundBreaks Output of C array of int32_t break positions, or 0
349  * @return The number of breaks found
350  */
351  virtual int32_t divideUpDictionaryRange( UText *text,
352                                           int32_t rangeStart,
353                                           int32_t rangeEnd,
354                                           UStack &foundBreaks ) const;
355
356};
357
358#if !UCONFIG_NO_NORMALIZATION
359
/*******************************************************************
 * CjkBreakEngine
 */

/**
 * Indicates the language/script that a CjkBreakEngine will handle.
 * A value of this type is passed to the CjkBreakEngine constructor.
 */
enum LanguageType {
    kKorean,
    kChineseJapanese
};
369
370/**
371 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
372 * dictionary with costs associated with each word and
373 * Viterbi decoding to determine CJK-specific breaks.</p>
374 */
375class CjkBreakEngine : public DictionaryBreakEngine {
376 protected:
377    /**
378     * The set of characters handled by this engine
379     * @internal
380     */
381  UnicodeSet                fHangulWordSet;
382  UnicodeSet                fHanWordSet;
383  UnicodeSet                fKatakanaWordSet;
384  UnicodeSet                fHiraganaWordSet;
385
386  DictionaryMatcher        *fDictionary;
387  const Normalizer2        *nfkcNorm2;
388
389 public:
390
391    /**
392     * <p>Default constructor.</p>
393     *
394     * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
395     * engine is deleted. The DictionaryMatcher must contain costs for each word
396     * in order for the dictionary to work properly.
397     */
398  CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
399
400    /**
401     * <p>Virtual destructor.</p>
402     */
403  virtual ~CjkBreakEngine();
404
405 protected:
406    /**
407     * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
408     *
409     * @param text A UText representing the text
410     * @param rangeStart The start of the range of dictionary characters
411     * @param rangeEnd The end of the range of dictionary characters
412     * @param foundBreaks Output of C array of int32_t break positions, or 0
413     * @return The number of breaks found
414     */
415  virtual int32_t divideUpDictionaryRange( UText *text,
416          int32_t rangeStart,
417          int32_t rangeEnd,
418          UStack &foundBreaks ) const;
419
420};
421
422#endif
423
424U_NAMESPACE_END
425
426    /* DICTBE_H */
427#endif
428