/*
******************************************************************************
* Copyright (C) 1996-2009, International Business Machines Corporation and others.
* All Rights Reserved.
******************************************************************************
*/
7
8#ifndef UBRK_H
9#define UBRK_H
10
11#include "unicode/utypes.h"
12#include "unicode/uloc.h"
13#include "unicode/utext.h"
14
/**
 * A text-break iterator.
 *  For usage in C programs.
 *
 * The typedef is wrapped in its own guard macro so that it is emitted only
 * when UBRK_TYPEDEF_UBREAK_ITERATOR has not already been defined, and it is
 * declared before the UCONFIG_NO_BREAK_ITERATION conditional so the type name
 * exists even when the rest of this API is configured out.
 */
#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
#   define UBRK_TYPEDEF_UBREAK_ITERATOR
    /**
     *  Opaque type representing an ICU Break iterator object.
     *  @stable ICU 2.0
     */
    typedef void UBreakIterator;
#endif
27
28#if !UCONFIG_NO_BREAK_ITERATION
29
30#include "unicode/parseerr.h"
31
/**
 * \file
 * \brief C API: BreakIterator
 *
 * <h2> BreakIterator C API </h2>
 *
 * The BreakIterator C API defines methods for finding the location
 * of boundaries in text. A pointer to a UBreakIterator maintains a
 * current position and scans over text, returning the index of characters
 * where boundaries occur.
 * <p>
 * Line boundary analysis determines where a text string can be broken
 * when line-wrapping. The mechanism correctly handles punctuation and
 * hyphenated words.
 * <p>
 * Sentence boundary analysis allows selection with correct
 * interpretation of periods within numbers and abbreviations, and
 * trailing punctuation marks such as quotation marks and parentheses.
 * <p>
 * Word boundary analysis is used by search and replace functions, as
 * well as within text editing applications that allow the user to
 * select words with a double click. Word selection provides correct
 * interpretation of punctuation marks within and following
 * words. Characters that are not part of a word, such as symbols or
 * punctuation marks, have word-breaks on both sides.
 * <p>
 * Character boundary analysis identifies the boundaries of
 * "Extended Grapheme Clusters", which are groupings of codepoints
 * that should be treated as character-like units for many text operations.
 * Please see Unicode Standard Annex #29, Unicode Text Segmentation,
 * http://www.unicode.org/reports/tr29/ for additional information
 * on grapheme clusters and guidelines on their use.
 * <p>
 * Title boundary analysis locates all positions,
 * typically starts of words, that should be set to Title Case
 * when title casing the text.
 * <p>
 * The text boundary positions are found according to the rules
 * described in Unicode Standard Annex #29, Unicode Text Segmentation, and
 * Unicode Standard Annex #14, Line Breaking Properties.  These
 * are available at http://www.unicode.org/reports/tr29/ and
 * http://www.unicode.org/reports/tr14/ respectively.
 * <p>
 * In addition to the plain C API defined in this header file, an
 * object oriented C++ API with equivalent functionality is defined in the
 * file brkiter.h.
 * <p>
 * Code snippets illustrating the use of the Break Iterator APIs
 * are available in the ICU User Guide,
 * http://icu-project.org/userguide/boundaryAnalysis.html
 * and in the sample program icu/source/samples/break/break.cpp
 */
84
/** The possible types of text boundaries.  @stable ICU 2.0 */
typedef enum UBreakIteratorType {
  /** Character breaks  @stable ICU 2.0 */
  UBRK_CHARACTER = 0,
  /** Word breaks @stable ICU 2.0 */
  UBRK_WORD = 1,
  /** Line breaks @stable ICU 2.0 */
  UBRK_LINE = 2,
  /** Sentence breaks @stable ICU 2.0 */
  UBRK_SENTENCE = 3,

#ifndef U_HIDE_DEPRECATED_API
  /**
   * Title Case breaks
   * The iterator created using this type locates title boundaries as described for
   * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
   * please use Word Boundary iterator.
   *
   * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
   */
  UBRK_TITLE = 4,
#endif /* U_HIDE_DEPRECATED_API */
  /**
   * Number of break iterator types, i.e. one more than the highest value
   * listed above. The value stays 5 even when U_HIDE_DEPRECATED_API hides
   * UBRK_TITLE.
   */
  UBRK_COUNT = 5
} UBreakIteratorType;
109
/**
 * Value indicating all text boundaries have been returned.
 * The boundary-returning functions in this header document it as their
 * "no more boundaries" result.
 * @stable ICU 2.0
 */
#define UBRK_DONE ((int32_t) -1)
114
115
/**
 *  Enum constants for the word break tags returned by
 *  ubrk_getRuleStatus().  A range of values is defined for each category of
 *  word, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  Each XXX_LIMIT value is the upper bound of its range and equals the
 *  lower bound of the next category.
 *  @stable ICU 2.2
 */
typedef enum UWordBreak {
    /** Tag value for "words" that do not fit into any of other categories.
     *  Includes spaces and most punctuation. */
    UBRK_WORD_NONE           = 0,
    /** Upper bound for tags for uncategorized words. */
    UBRK_WORD_NONE_LIMIT     = 100,
    /** Tag value for words that appear to be numbers, lower limit.    */
    UBRK_WORD_NUMBER         = 100,
    /** Tag value for words that appear to be numbers, upper limit.    */
    UBRK_WORD_NUMBER_LIMIT   = 200,
    /** Tag value for words that contain letters, excluding
     *  hiragana, katakana or ideographic characters, lower limit.    */
    UBRK_WORD_LETTER         = 200,
    /** Tag value for words containing letters, upper limit  */
    UBRK_WORD_LETTER_LIMIT   = 300,
    /** Tag value for words containing kana characters, lower limit */
    UBRK_WORD_KANA           = 300,
    /** Tag value for words containing kana characters, upper limit */
    UBRK_WORD_KANA_LIMIT     = 400,
    /** Tag value for words containing ideographic characters, lower limit */
    UBRK_WORD_IDEO           = 400,
    /** Tag value for words containing ideographic characters, upper limit */
    UBRK_WORD_IDEO_LIMIT     = 500
} UWordBreak;
148
/**
 *  Enum constants for the line break tags returned by ubrk_getRuleStatus().
 *  A range of values is defined for each category of
 *  line break, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  @stable ICU 2.8
 */
typedef enum ULineBreakTag {
    /** Tag value for soft line breaks, positions at which a line break
      *  is acceptable but not required                */
    UBRK_LINE_SOFT            = 0,
    /** Upper bound for soft line breaks.              */
    UBRK_LINE_SOFT_LIMIT      = 100,
    /** Tag value for a hard, or mandatory line break  */
    UBRK_LINE_HARD            = 100,
    /** Upper bound for hard line breaks.              */
    UBRK_LINE_HARD_LIMIT      = 200
} ULineBreakTag;
168
169
170
/**
 *  Enum constants for the sentence break tags returned by ubrk_getRuleStatus().
 *  A range of values is defined for each category of
 *  sentence, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  @stable ICU 2.8
 */
typedef enum USentenceBreakTag {
    /** Tag value for sentences ending with a sentence terminator
      * ('.', '?', '!', etc.) character, possibly followed by a
      * hard separator (CR, LF, PS, etc.)
      */
    UBRK_SENTENCE_TERM       = 0,
    /** Upper bound for tags for sentences ended by sentence terminators.    */
    UBRK_SENTENCE_TERM_LIMIT = 100,
    /** Tag value for sentences that do not contain an ending
      * sentence terminator ('.', '?', '!', etc.) character, but
      * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
      */
    UBRK_SENTENCE_SEP        = 100,
    /** Upper bound for tags for sentences ended by a separator.              */
    UBRK_SENTENCE_SEP_LIMIT  = 200
} USentenceBreakTag;
196
197
198/**
199 * Open a new UBreakIterator for locating text boundaries for a specified locale.
200 * A UBreakIterator may be used for detecting character, line, word,
201 * and sentence breaks in text.
202 * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
203 * UBRK_LINE, UBRK_SENTENCE
204 * @param locale The locale specifying the text-breaking conventions.
205 * @param text The text to be iterated over.
206 * @param textLength The number of characters in text, or -1 if null-terminated.
207 * @param status A UErrorCode to receive any errors.
208 * @return A UBreakIterator for the specified locale.
209 * @see ubrk_openRules
210 * @stable ICU 2.0
211 */
212U_STABLE UBreakIterator* U_EXPORT2
213ubrk_open(UBreakIteratorType type,
214      const char *locale,
215      const UChar *text,
216      int32_t textLength,
217      UErrorCode *status);
218
219/**
220 * Open a new UBreakIterator for locating text boundaries using specified breaking rules.
221 * The rule syntax is ... (TBD)
222 * @param rules A set of rules specifying the text breaking conventions.
223 * @param rulesLength The number of characters in rules, or -1 if null-terminated.
224 * @param text The text to be iterated over.  May be null, in which case ubrk_setText() is
225 *        used to specify the text to be iterated.
226 * @param textLength The number of characters in text, or -1 if null-terminated.
227 * @param parseErr   Receives position and context information for any syntax errors
228 *                   detected while parsing the rules.
229 * @param status A UErrorCode to receive any errors.
230 * @return A UBreakIterator for the specified rules.
231 * @see ubrk_open
232 * @stable ICU 2.2
233 */
234U_STABLE UBreakIterator* U_EXPORT2
235ubrk_openRules(const UChar     *rules,
236               int32_t         rulesLength,
237               const UChar     *text,
238               int32_t          textLength,
239               UParseError     *parseErr,
240               UErrorCode      *status);
241
242/**
243 * Thread safe cloning operation
244 * @param bi iterator to be cloned
245 * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
246 *  If buffer is not large enough, new memory will be allocated.
247 *  Clients can use the U_BRK_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations.
248 * @param pBufferSize pointer to size of allocated space.
249 *  If *pBufferSize == 0, a sufficient size for use in cloning will
250 *  be returned ('pre-flighting')
251 *  If *pBufferSize is not enough for a stack-based safe clone,
252 *  new memory will be allocated.
253 * @param status to indicate whether the operation went on smoothly or there were errors
254 *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
255 * @return pointer to the new clone
256 * @stable ICU 2.0
257 */
258U_STABLE UBreakIterator * U_EXPORT2
259ubrk_safeClone(
260          const UBreakIterator *bi,
261          void *stackBuffer,
262          int32_t *pBufferSize,
263          UErrorCode *status);
264
/**
 * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
 * @stable ICU 2.0
 */
#define U_BRK_SAFECLONE_BUFFERSIZE 512
270
271/**
272* Close a UBreakIterator.
273* Once closed, a UBreakIterator may no longer be used.
274* @param bi The break iterator to close.
275 * @stable ICU 2.0
276*/
277U_STABLE void U_EXPORT2
278ubrk_close(UBreakIterator *bi);
279
280/**
281 * Sets an existing iterator to point to a new piece of text
282 * @param bi The iterator to use
283 * @param text The text to be set
284 * @param textLength The length of the text
285 * @param status The error code
286 * @stable ICU 2.0
287 */
288U_STABLE void U_EXPORT2
289ubrk_setText(UBreakIterator* bi,
290             const UChar*    text,
291             int32_t         textLength,
292             UErrorCode*     status);
293
294
295/**
296 * Sets an existing iterator to point to a new piece of text
297 * @param bi The iterator to use
298 * @param text The text to be set.
299 *             This function makes a shallow clone of the supplied UText.  This means
300 *             that the caller is free to immediately close or otherwise reuse the
301 *             UText that was passed as a parameter, but that the underlying text itself
302 *             must not be altered while being referenced by the break iterator.
303 * @param status The error code
304 * @stable ICU 3.4
305 */
306U_STABLE void U_EXPORT2
307ubrk_setUText(UBreakIterator* bi,
308             UText*          text,
309             UErrorCode*     status);
310
311
312
313/**
314 * Determine the most recently-returned text boundary.
315 *
316 * @param bi The break iterator to use.
317 * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
318 * \ref ubrk_first, or \ref ubrk_last.
319 * @stable ICU 2.0
320 */
321U_STABLE int32_t U_EXPORT2
322ubrk_current(const UBreakIterator *bi);
323
324/**
325 * Determine the text boundary following the current text boundary.
326 *
327 * @param bi The break iterator to use.
328 * @return The character index of the next text boundary, or UBRK_DONE
329 * if all text boundaries have been returned.
330 * @see ubrk_previous
331 * @stable ICU 2.0
332 */
333U_STABLE int32_t U_EXPORT2
334ubrk_next(UBreakIterator *bi);
335
336/**
337 * Determine the text boundary preceding the current text boundary.
338 *
339 * @param bi The break iterator to use.
340 * @return The character index of the preceding text boundary, or UBRK_DONE
341 * if all text boundaries have been returned.
342 * @see ubrk_next
343 * @stable ICU 2.0
344 */
345U_STABLE int32_t U_EXPORT2
346ubrk_previous(UBreakIterator *bi);
347
348/**
349 * Determine the index of the first character in the text being scanned.
350 * This is not always the same as index 0 of the text.
351 * @param bi The break iterator to use.
352 * @return The character index of the first character in the text being scanned.
353 * @see ubrk_last
354 * @stable ICU 2.0
355 */
356U_STABLE int32_t U_EXPORT2
357ubrk_first(UBreakIterator *bi);
358
359/**
360 * Determine the index immediately <EM>beyond</EM> the last character in the text being
361 * scanned.
362 * This is not the same as the last character.
363 * @param bi The break iterator to use.
364 * @return The character offset immediately <EM>beyond</EM> the last character in the
365 * text being scanned.
366 * @see ubrk_first
367 * @stable ICU 2.0
368 */
369U_STABLE int32_t U_EXPORT2
370ubrk_last(UBreakIterator *bi);
371
372/**
373 * Determine the text boundary preceding the specified offset.
374 * The value returned is always smaller than offset, or UBRK_DONE.
375 * @param bi The break iterator to use.
376 * @param offset The offset to begin scanning.
377 * @return The text boundary preceding offset, or UBRK_DONE.
378 * @see ubrk_following
379 * @stable ICU 2.0
380 */
381U_STABLE int32_t U_EXPORT2
382ubrk_preceding(UBreakIterator *bi,
383           int32_t offset);
384
385/**
386 * Determine the text boundary following the specified offset.
387 * The value returned is always greater than offset, or UBRK_DONE.
388 * @param bi The break iterator to use.
389 * @param offset The offset to begin scanning.
390 * @return The text boundary following offset, or UBRK_DONE.
391 * @see ubrk_preceding
392 * @stable ICU 2.0
393 */
394U_STABLE int32_t U_EXPORT2
395ubrk_following(UBreakIterator *bi,
396           int32_t offset);
397
398/**
399* Get a locale for which text breaking information is available.
400* A UBreakIterator in a locale returned by this function will perform the correct
401* text breaking for the locale.
402* @param index The index of the desired locale.
403* @return A locale for which number text breaking information is available, or 0 if none.
404* @see ubrk_countAvailable
405* @stable ICU 2.0
406*/
407U_STABLE const char* U_EXPORT2
408ubrk_getAvailable(int32_t index);
409
410/**
411* Determine how many locales have text breaking information available.
412* This function is most useful as determining the loop ending condition for
413* calls to \ref ubrk_getAvailable.
414* @return The number of locales for which text breaking information is available.
415* @see ubrk_getAvailable
416* @stable ICU 2.0
417*/
418U_STABLE int32_t U_EXPORT2
419ubrk_countAvailable(void);
420
421
422/**
423* Returns true if the specfied position is a boundary position.  As a side
424* effect, leaves the iterator pointing to the first boundary position at
425* or after "offset".
426* @param bi The break iterator to use.
427* @param offset the offset to check.
428* @return True if "offset" is a boundary position.
429* @stable ICU 2.0
430*/
431U_STABLE  UBool U_EXPORT2
432ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
433
434/**
435 * Return the status from the break rule that determined the most recently
436 * returned break position.  The values appear in the rule source
437 * within brackets, {123}, for example.  For rules that do not specify a
438 * status, a default value of 0 is returned.
439 * <p>
440 * For word break iterators, the possible values are defined in enum UWordBreak.
441 * @stable ICU 2.2
442 */
443U_STABLE  int32_t U_EXPORT2
444ubrk_getRuleStatus(UBreakIterator *bi);
445
446/**
447 * Get the statuses from the break rules that determined the most recently
448 * returned break position.  The values appear in the rule source
449 * within brackets, {123}, for example.  The default status value for rules
450 * that do not explicitly provide one is zero.
451 * <p>
452 * For word break iterators, the possible values are defined in enum UWordBreak.
453 * @param bi        The break iterator to use
454 * @param fillInVec an array to be filled in with the status values.
455 * @param capacity  the length of the supplied vector.  A length of zero causes
456 *                  the function to return the number of status values, in the
457 *                  normal way, without attemtping to store any values.
458 * @param status    receives error codes.
459 * @return          The number of rule status values from rules that determined
460 *                  the most recent boundary returned by the break iterator.
461 * @stable ICU 3.0
462 */
463U_STABLE  int32_t U_EXPORT2
464ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);
465
466/**
467 * Return the locale of the break iterator. You can choose between the valid and
468 * the actual locale.
469 * @param bi break iterator
470 * @param type locale type (valid or actual)
471 * @param status error code
472 * @return locale string
473 * @stable ICU 2.8
474 */
475U_STABLE const char* U_EXPORT2
476ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
477
478
479#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
480
481#endif
482