/**
 ************************************************************************************
 * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
 * All Rights Reserved.                                                             *
 ************************************************************************************
 */
7
8#ifndef BRKENG_H
9#define BRKENG_H
10
11#include "unicode/utypes.h"
12#include "unicode/uobject.h"
13#include "unicode/utext.h"
14#include "unicode/uscript.h"
15
16U_NAMESPACE_BEGIN
17
// Forward declarations. This header only uses these types through pointers
// and references (UnicodeSet* in UnhandledEngine::fHandled, UStack& / UStack*
// in findBreaks() and ICULanguageBreakFactory::fEngines, DictionaryMatcher*
// as the return type of loadDictionaryMatcherFor()), so the complete
// definitions are not required here.
class UnicodeSet;
class UStack;
class DictionaryMatcher;
21
22/*******************************************************************
23 * LanguageBreakEngine
24 */
25
26/**
27 * <p>LanguageBreakEngines implement language-specific knowledge for
28 * finding text boundaries within a run of characters belonging to a
29 * specific set. The boundaries will be of a specific kind, e.g. word,
30 * line, etc.</p>
31 *
32 * <p>LanguageBreakEngines should normally be implemented so as to
33 * be shared between threads without locking.</p>
34 */
35class LanguageBreakEngine : public UMemory {
36 public:
37
38  /**
39   * <p>Default constructor.</p>
40   *
41   */
42  LanguageBreakEngine();
43
44  /**
45   * <p>Virtual destructor.</p>
46   */
47  virtual ~LanguageBreakEngine();
48
49 /**
50  * <p>Indicate whether this engine handles a particular character for
51  * a particular kind of break.</p>
52  *
53  * @param c A character which begins a run that the engine might handle
54  * @param breakType The type of text break which the caller wants to determine
55  * @return TRUE if this engine handles the particular character and break
56  * type.
57  */
58  virtual UBool handles(UChar32 c, int32_t breakType) const = 0;
59
60 /**
61  * <p>Find any breaks within a run in the supplied text.</p>
62  *
63  * @param text A UText representing the text. The
64  * iterator is left at the end of the run of characters which the engine
65  * is capable of handling.
66  * @param startPos The start of the run within the supplied text.
67  * @param endPos The end of the run within the supplied text.
68  * @param reverse Whether the caller is looking for breaks in a reverse
69  * direction.
70  * @param breakType The type of break desired, or -1.
71  * @param foundBreaks An allocated C array of the breaks found, if any
72  * @return The number of breaks found.
73  */
74  virtual int32_t findBreaks( UText *text,
75                              int32_t startPos,
76                              int32_t endPos,
77                              UBool reverse,
78                              int32_t breakType,
79                              UStack &foundBreaks ) const = 0;
80
81};
82
83/*******************************************************************
84 * LanguageBreakFactory
85 */
86
87/**
88 * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
89 * that can determine breaks for characters in a specific set, if
90 * such an object can be found.</p>
91 *
92 * <p>If a LanguageBreakFactory is to be shared between threads,
93 * appropriate synchronization must be used; there is none internal
94 * to the factory.</p>
95 *
96 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
97 * normally be shared between threads without synchronization, unless
98 * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
99 *
100 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
101 * it returns when it itself is deleted, unless the specific subclass of
102 * LanguageBreakFactory indicates otherwise. Naturally, the factory should
103 * not be deleted until the LanguageBreakEngines it has returned are no
104 * longer needed.</p>
105 */
106class LanguageBreakFactory : public UMemory {
107 public:
108
109  /**
110   * <p>Default constructor.</p>
111   *
112   */
113  LanguageBreakFactory();
114
115  /**
116   * <p>Virtual destructor.</p>
117   */
118  virtual ~LanguageBreakFactory();
119
120 /**
121  * <p>Find and return a LanguageBreakEngine that can find the desired
122  * kind of break for the set of characters to which the supplied
123  * character belongs. It is up to the set of available engines to
124  * determine what the sets of characters are.</p>
125  *
126  * @param c A character that begins a run for which a LanguageBreakEngine is
127  * sought.
128  * @param breakType The kind of text break for which a LanguageBreakEngine is
129  * sought.
130  * @return A LanguageBreakEngine with the desired characteristics, or 0.
131  */
132  virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0;
133
134};
135
136/*******************************************************************
137 * UnhandledEngine
138 */
139
140/**
141 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
142 * handles characters that no other LanguageBreakEngine is available to
143 * handle. It is told the character and the type of break; at its
144 * discretion it may handle more than the specified character (e.g.,
145 * the entire script to which that character belongs.</p>
146 *
147 * <p>UnhandledEngines may not be shared between threads without
148 * external synchronization.</p>
149 */
150
151class UnhandledEngine : public LanguageBreakEngine {
152 private:
153
154    /**
155     * The sets of characters handled, for each break type
156     * @internal
157     */
158
159  UnicodeSet    *fHandled[4];
160
161 public:
162
163  /**
164   * <p>Default constructor.</p>
165   *
166   */
167  UnhandledEngine(UErrorCode &status);
168
169  /**
170   * <p>Virtual destructor.</p>
171   */
172  virtual ~UnhandledEngine();
173
174 /**
175  * <p>Indicate whether this engine handles a particular character for
176  * a particular kind of break.</p>
177  *
178  * @param c A character which begins a run that the engine might handle
179  * @param breakType The type of text break which the caller wants to determine
180  * @return TRUE if this engine handles the particular character and break
181  * type.
182  */
183  virtual UBool handles(UChar32 c, int32_t breakType) const;
184
185 /**
186  * <p>Find any breaks within a run in the supplied text.</p>
187  *
188  * @param text A UText representing the text (TODO: UText). The
189  * iterator is left at the end of the run of characters which the engine
190  * is capable of handling.
191  * @param startPos The start of the run within the supplied text.
192  * @param endPos The end of the run within the supplied text.
193  * @param reverse Whether the caller is looking for breaks in a reverse
194  * direction.
195  * @param breakType The type of break desired, or -1.
196  * @param foundBreaks An allocated C array of the breaks found, if any
197  * @return The number of breaks found.
198  */
199  virtual int32_t findBreaks( UText *text,
200                              int32_t startPos,
201                              int32_t endPos,
202                              UBool reverse,
203                              int32_t breakType,
204                              UStack &foundBreaks ) const;
205
206 /**
207  * <p>Tell the engine to handle a particular character and break type.</p>
208  *
209  * @param c A character which the engine should handle
210  * @param breakType The type of text break for which the engine should handle c
211  */
212  virtual void handleCharacter(UChar32 c, int32_t breakType);
213
214};
215
216/*******************************************************************
217 * ICULanguageBreakFactory
218 */
219
220/**
221 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
222 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
223 * data in the ICU data file.</p>
224 */
225class ICULanguageBreakFactory : public LanguageBreakFactory {
226 private:
227
228    /**
229     * The stack of break engines created by this factory
230     * @internal
231     */
232
233  UStack    *fEngines;
234
235 public:
236
237  /**
238   * <p>Standard constructor.</p>
239   *
240   */
241  ICULanguageBreakFactory(UErrorCode &status);
242
243  /**
244   * <p>Virtual destructor.</p>
245   */
246  virtual ~ICULanguageBreakFactory();
247
248 /**
249  * <p>Find and return a LanguageBreakEngine that can find the desired
250  * kind of break for the set of characters to which the supplied
251  * character belongs. It is up to the set of available engines to
252  * determine what the sets of characters are.</p>
253  *
254  * @param c A character that begins a run for which a LanguageBreakEngine is
255  * sought.
256  * @param breakType The kind of text break for which a LanguageBreakEngine is
257  * sought.
258  * @return A LanguageBreakEngine with the desired characteristics, or 0.
259  */
260  virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType);
261
262protected:
263 /**
264  * <p>Create a LanguageBreakEngine for the set of characters to which
265  * the supplied character belongs, for the specified break type.</p>
266  *
267  * @param c A character that begins a run for which a LanguageBreakEngine is
268  * sought.
269  * @param breakType The kind of text break for which a LanguageBreakEngine is
270  * sought.
271  * @return A LanguageBreakEngine with the desired characteristics, or 0.
272  */
273  virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType);
274
275  /**
276   * <p>Create a DictionaryMatcher for the specified script and break type.</p>
277   * @param script An ISO 15924 script code that identifies the dictionary to be
278   * created.
279   * @param breakType The kind of text break for which a dictionary is
280   * sought.
281   * @return A DictionaryMatcher with the desired characteristics, or NULL.
282   */
283  virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType);
284};
285
286U_NAMESPACE_END
287
288    /* BRKENG_H */
289#endif
290