1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/**
4 *******************************************************************************
5 * Copyright (C) 2006-2014, International Business Machines Corporation   *
6 * and others. All Rights Reserved.                                            *
7 *******************************************************************************
8 */
9
10#ifndef DICTBE_H
11#define DICTBE_H
12
13#include "unicode/utypes.h"
14#include "unicode/uniset.h"
15#include "unicode/utext.h"
16
17#include "brkeng.h"
18
19U_NAMESPACE_BEGIN
20
21class DictionaryMatcher;
22class Normalizer2;
23
24/*******************************************************************
25 * DictionaryBreakEngine
26 */
27
28/**
29 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
30 * dictionary to determine language-specific breaks.</p>
31 *
32 * <p>After it is constructed a DictionaryBreakEngine may be shared between
33 * threads without synchronization.</p>
34 */
35class DictionaryBreakEngine : public LanguageBreakEngine {
36 private:
37    /**
38     * The set of characters handled by this engine
39     * @internal
40     */
41
42  UnicodeSet    fSet;
43
44    /**
45     * The set of break types handled by this engine
46     * @internal
47     */
48
49  uint32_t      fTypes;
50
51  /**
52   * <p>Default constructor.</p>
53   *
54   */
55  DictionaryBreakEngine();
56
57 public:
58
59  /**
60   * <p>Constructor setting the break types handled.</p>
61   *
62   * @param breakTypes A bitmap of types handled by the engine.
63   */
64  DictionaryBreakEngine( uint32_t breakTypes );
65
66  /**
67   * <p>Virtual destructor.</p>
68   */
69  virtual ~DictionaryBreakEngine();
70
71  /**
72   * <p>Indicate whether this engine handles a particular character for
73   * a particular kind of break.</p>
74   *
75   * @param c A character which begins a run that the engine might handle
76   * @param breakType The type of text break which the caller wants to determine
77   * @return TRUE if this engine handles the particular character and break
78   * type.
79   */
80  virtual UBool handles( UChar32 c, int32_t breakType ) const;
81
82  /**
83   * <p>Find any breaks within a run in the supplied text.</p>
84   *
85   * @param text A UText representing the text. The iterator is left at
86   * the end of the run of characters which the engine is capable of handling
87   * that starts from the first (or last) character in the range.
88   * @param startPos The start of the run within the supplied text.
89   * @param endPos The end of the run within the supplied text.
90   * @param reverse Whether the caller is looking for breaks in a reverse
91   * direction.
92   * @param breakType The type of break desired, or -1.
93   * @param foundBreaks An allocated C array of the breaks found, if any
94   * @return The number of breaks found.
95   */
96  virtual int32_t findBreaks( UText *text,
97                              int32_t startPos,
98                              int32_t endPos,
99                              UBool reverse,
100                              int32_t breakType,
101                              UStack &foundBreaks ) const;
102
103 protected:
104
105 /**
106  * <p>Set the character set handled by this engine.</p>
107  *
108  * @param set A UnicodeSet of the set of characters handled by the engine
109  */
110  virtual void setCharacters( const UnicodeSet &set );
111
112 /**
113  * <p>Set the break types handled by this engine.</p>
114  *
115  * @param breakTypes A bitmap of types handled by the engine.
116  */
117//  virtual void setBreakTypes( uint32_t breakTypes );
118
119 /**
120  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
121  *
122  * @param text A UText representing the text
123  * @param rangeStart The start of the range of dictionary characters
124  * @param rangeEnd The end of the range of dictionary characters
125  * @param foundBreaks Output of C array of int32_t break positions, or 0
126  * @return The number of breaks found
127  */
128  virtual int32_t divideUpDictionaryRange( UText *text,
129                                           int32_t rangeStart,
130                                           int32_t rangeEnd,
131                                           UStack &foundBreaks ) const = 0;
132
133};
134
135/*******************************************************************
136 * ThaiBreakEngine
137 */
138
139/**
140 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
141 * dictionary and heuristics to determine Thai-specific breaks.</p>
142 *
143 * <p>After it is constructed a ThaiBreakEngine may be shared between
144 * threads without synchronization.</p>
145 */
146class ThaiBreakEngine : public DictionaryBreakEngine {
147 private:
148    /**
149     * The set of characters handled by this engine
150     * @internal
151     */
152
153  UnicodeSet                fThaiWordSet;
154  UnicodeSet                fEndWordSet;
155  UnicodeSet                fBeginWordSet;
156  UnicodeSet                fSuffixSet;
157  UnicodeSet                fMarkSet;
158  DictionaryMatcher  *fDictionary;
159
160 public:
161
162  /**
163   * <p>Default constructor.</p>
164   *
165   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
166   * engine is deleted.
167   */
168  ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
169
170  /**
171   * <p>Virtual destructor.</p>
172   */
173  virtual ~ThaiBreakEngine();
174
175 protected:
176 /**
177  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
178  *
179  * @param text A UText representing the text
180  * @param rangeStart The start of the range of dictionary characters
181  * @param rangeEnd The end of the range of dictionary characters
182  * @param foundBreaks Output of C array of int32_t break positions, or 0
183  * @return The number of breaks found
184  */
185  virtual int32_t divideUpDictionaryRange( UText *text,
186                                           int32_t rangeStart,
187                                           int32_t rangeEnd,
188                                           UStack &foundBreaks ) const;
189
190};
191
192/*******************************************************************
193 * LaoBreakEngine
194 */
195
196/**
197 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
198 * dictionary and heuristics to determine Lao-specific breaks.</p>
199 *
200 * <p>After it is constructed a LaoBreakEngine may be shared between
201 * threads without synchronization.</p>
202 */
203class LaoBreakEngine : public DictionaryBreakEngine {
204 private:
205    /**
206     * The set of characters handled by this engine
207     * @internal
208     */
209
210  UnicodeSet                fLaoWordSet;
211  UnicodeSet                fEndWordSet;
212  UnicodeSet                fBeginWordSet;
213  UnicodeSet                fMarkSet;
214  DictionaryMatcher  *fDictionary;
215
216 public:
217
218  /**
219   * <p>Default constructor.</p>
220   *
221   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
222   * engine is deleted.
223   */
224  LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
225
226  /**
227   * <p>Virtual destructor.</p>
228   */
229  virtual ~LaoBreakEngine();
230
231 protected:
232 /**
233  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
234  *
235  * @param text A UText representing the text
236  * @param rangeStart The start of the range of dictionary characters
237  * @param rangeEnd The end of the range of dictionary characters
238  * @param foundBreaks Output of C array of int32_t break positions, or 0
239  * @return The number of breaks found
240  */
241  virtual int32_t divideUpDictionaryRange( UText *text,
242                                           int32_t rangeStart,
243                                           int32_t rangeEnd,
244                                           UStack &foundBreaks ) const;
245
246};
247
248/*******************************************************************
249 * BurmeseBreakEngine
250 */
251
252/**
253 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
254 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
255 *
256 * <p>After it is constructed a BurmeseBreakEngine may be shared between
257 * threads without synchronization.</p>
258 */
259class BurmeseBreakEngine : public DictionaryBreakEngine {
260 private:
261    /**
262     * The set of characters handled by this engine
263     * @internal
264     */
265
266  UnicodeSet                fBurmeseWordSet;
267  UnicodeSet                fEndWordSet;
268  UnicodeSet                fBeginWordSet;
269  UnicodeSet                fMarkSet;
270  DictionaryMatcher  *fDictionary;
271
272 public:
273
274  /**
275   * <p>Default constructor.</p>
276   *
277   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
278   * engine is deleted.
279   */
280  BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
281
282  /**
283   * <p>Virtual destructor.</p>
284   */
285  virtual ~BurmeseBreakEngine();
286
287 protected:
288 /**
289  * <p>Divide up a range of known dictionary characters.</p>
290  *
291  * @param text A UText representing the text
292  * @param rangeStart The start of the range of dictionary characters
293  * @param rangeEnd The end of the range of dictionary characters
294  * @param foundBreaks Output of C array of int32_t break positions, or 0
295  * @return The number of breaks found
296  */
297  virtual int32_t divideUpDictionaryRange( UText *text,
298                                           int32_t rangeStart,
299                                           int32_t rangeEnd,
300                                           UStack &foundBreaks ) const;
301
302};
303
304/*******************************************************************
305 * KhmerBreakEngine
306 */
307
308/**
309 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
310 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
311 *
312 * <p>After it is constructed a KhmerBreakEngine may be shared between
313 * threads without synchronization.</p>
314 */
315class KhmerBreakEngine : public DictionaryBreakEngine {
316 private:
317    /**
318     * The set of characters handled by this engine
319     * @internal
320     */
321
322  UnicodeSet                fKhmerWordSet;
323  UnicodeSet                fEndWordSet;
324  UnicodeSet                fBeginWordSet;
325  UnicodeSet                fMarkSet;
326  DictionaryMatcher  *fDictionary;
327
328 public:
329
330  /**
331   * <p>Default constructor.</p>
332   *
333   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
334   * engine is deleted.
335   */
336  KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
337
338  /**
339   * <p>Virtual destructor.</p>
340   */
341  virtual ~KhmerBreakEngine();
342
343 protected:
344 /**
345  * <p>Divide up a range of known dictionary characters.</p>
346  *
347  * @param text A UText representing the text
348  * @param rangeStart The start of the range of dictionary characters
349  * @param rangeEnd The end of the range of dictionary characters
350  * @param foundBreaks Output of C array of int32_t break positions, or 0
351  * @return The number of breaks found
352  */
353  virtual int32_t divideUpDictionaryRange( UText *text,
354                                           int32_t rangeStart,
355                                           int32_t rangeEnd,
356                                           UStack &foundBreaks ) const;
357
358};
359
360#if !UCONFIG_NO_NORMALIZATION
361
362/*******************************************************************
363 * CjkBreakEngine
364 */
365
/**
 * Selects the language/script that a CjkBreakEngine will handle.
 * Passed to the CjkBreakEngine constructor.
 */
enum LanguageType {
    /** The engine handles Korean. */
    kKorean,
    /** The engine handles Chinese and Japanese. */
    kChineseJapanese
};
371
372/**
373 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
374 * dictionary with costs associated with each word and
375 * Viterbi decoding to determine CJK-specific breaks.</p>
376 */
377class CjkBreakEngine : public DictionaryBreakEngine {
378 protected:
379    /**
380     * The set of characters handled by this engine
381     * @internal
382     */
383  UnicodeSet                fHangulWordSet;
384  UnicodeSet                fHanWordSet;
385  UnicodeSet                fKatakanaWordSet;
386  UnicodeSet                fHiraganaWordSet;
387
388  DictionaryMatcher        *fDictionary;
389  const Normalizer2        *nfkcNorm2;
390
391 public:
392
393    /**
394     * <p>Default constructor.</p>
395     *
396     * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
397     * engine is deleted. The DictionaryMatcher must contain costs for each word
398     * in order for the dictionary to work properly.
399     */
400  CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
401
402    /**
403     * <p>Virtual destructor.</p>
404     */
405  virtual ~CjkBreakEngine();
406
407 protected:
408    /**
409     * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
410     *
411     * @param text A UText representing the text
412     * @param rangeStart The start of the range of dictionary characters
413     * @param rangeEnd The end of the range of dictionary characters
414     * @param foundBreaks Output of C array of int32_t break positions, or 0
415     * @return The number of breaks found
416     */
417  virtual int32_t divideUpDictionaryRange( UText *text,
418          int32_t rangeStart,
419          int32_t rangeEnd,
420          UStack &foundBreaks ) const;
421
422};
423
424#endif
425
426U_NAMESPACE_END
427
428    /* DICTBE_H */
429#endif
430