/* ubrk.h revision b0ac937921a2c196d8b9da665135bf6ba01a1ccf */
/*
******************************************************************************
* Copyright (C) 1996-2009, International Business Machines Corporation and others.
* All Rights Reserved.
******************************************************************************
*/

#ifndef UBRK_H
#define UBRK_H

#include "unicode/utypes.h"
#include "unicode/uloc.h"
#include "unicode/utext.h"

/**
 * A text-break iterator.
 *  For usage in C programs.
 */
#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
/* Guard macro: the typedef below is emitted only if it has not already been
 * provided before this point, so it is never declared twice. */
#   define UBRK_TYPEDEF_UBREAK_ITERATOR
    /**
     *  Opaque type representing an ICU Break iterator object.
     *  Always handled through a pointer (UBreakIterator *).
     *  @stable ICU 2.0
     */
    typedef void UBreakIterator;
#endif

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/parseerr.h"

/**
 * \file
 * \brief C API: BreakIterator
 *
 * <h2> BreakIterator C API </h2>
 *
 * The BreakIterator C API defines methods for finding the location
 * of boundaries in text. A UBreakIterator maintains a
 * current position and scans over text, returning the index of characters
 * where boundaries occur.
 * <p>
 * Line boundary analysis determines where a text string can be broken
 * when line-wrapping. The mechanism correctly handles punctuation and
 * hyphenated words.
 * <p>
 * Sentence boundary analysis allows selection with correct
 * interpretation of periods within numbers and abbreviations, and
 * trailing punctuation marks such as quotation marks and parentheses.
 * <p>
 * Word boundary analysis is used by search and replace functions, as
 * well as within text editing applications that allow the user to
 * select words with a double click. Word selection provides correct
 * interpretation of punctuation marks within and following
 * words. Characters that are not part of a word, such as symbols or
 * punctuation marks, have word-breaks on both sides.
 * <p>
 * Character boundary analysis identifies the boundaries of
 * "Extended Grapheme Clusters", which are groupings of codepoints
 * that should be treated as character-like units for many text operations.
 * Please see Unicode Standard Annex #29, Unicode Text Segmentation,
 * http://www.unicode.org/reports/tr29/ for additional information
 * on grapheme clusters and guidelines on their use.
 * <p>
 * Title boundary analysis locates all positions,
 * typically starts of words, that should be set to Title Case
 * when title casing the text.
 * <p>
 * The text boundary positions are found according to the rules
 * described in Unicode Standard Annex #29, Text Boundaries, and
 * Unicode Standard Annex #14, Line Breaking Properties.  These
 * are available at http://www.unicode.org/reports/tr14/ and
 * http://www.unicode.org/reports/tr29/.
 * <p>
 * In addition to the plain C API defined in this header file, an
 * object oriented C++ API with equivalent functionality is defined in the
 * file brkiter.h.
 * <p>
 * Code snippets illustrating the use of the Break Iterator APIs
 * are available in the ICU User Guide,
 * http://icu-project.org/userguide/boundaryAnalysis.html
 * and in the sample program icu/source/samples/break/break.cpp
 */

/** The possible types of text boundaries.  @stable ICU 2.0 */
typedef enum UBreakIteratorType {
  /** Character breaks  @stable ICU 2.0 */
  UBRK_CHARACTER = 0,
  /** Word breaks @stable ICU 2.0 */
  UBRK_WORD = 1,
  /** Line breaks @stable ICU 2.0 */
  UBRK_LINE = 2,
  /** Sentence breaks @stable ICU 2.0 */
  UBRK_SENTENCE = 3,

#ifndef U_HIDE_DEPRECATED_API
  /**
   * Title Case breaks
   * The iterator created using this type locates title boundaries as described for
   * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
   * please use Word Boundary iterator.
   *
   * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
   */
  UBRK_TITLE = 4,
#endif /* U_HIDE_DEPRECATED_API */
  /**
   * One more than the highest UBreakIteratorType value.  The value stays 5
   * even when U_HIDE_DEPRECATED_API hides UBRK_TITLE.
   */
  UBRK_COUNT = 5
} UBreakIteratorType;

/** Value indicating all text boundaries have been returned.
 *  Typed as int32_t so it compares directly against the int32_t
 *  boundary indexes returned by the iteration functions.
 *  @stable ICU 2.0
 */
#define UBRK_DONE ((int32_t) -1)


/**
 *  Enum constants for the word break tags returned by
 *  ubrk_getRuleStatus().  A range of values is defined for each category of
 *  word, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  <p>
 *  Each *_LIMIT constant has the same value as the lower bound of the next
 *  category, so a range test takes the half-open form
 *  (tag >= UBRK_WORD_NUMBER && tag < UBRK_WORD_NUMBER_LIMIT).
 *  @stable ICU 2.2
 */
typedef enum UWordBreak {
    /** Tag value for "words" that do not fit into any of other categories.
     *  Includes spaces and most punctuation. */
    UBRK_WORD_NONE           = 0,
    /** Upper bound for tags for uncategorized words. */
    UBRK_WORD_NONE_LIMIT     = 100,
    /** Tag value for words that appear to be numbers, lower limit.    */
    UBRK_WORD_NUMBER         = 100,
    /** Tag value for words that appear to be numbers, upper limit.    */
    UBRK_WORD_NUMBER_LIMIT   = 200,
    /** Tag value for words that contain letters, excluding
     *  hiragana, katakana or ideographic characters, lower limit.    */
    UBRK_WORD_LETTER         = 200,
    /** Tag value for words containing letters, upper limit  */
    UBRK_WORD_LETTER_LIMIT   = 300,
    /** Tag value for words containing kana characters, lower limit */
    UBRK_WORD_KANA           = 300,
    /** Tag value for words containing kana characters, upper limit */
    UBRK_WORD_KANA_LIMIT     = 400,
    /** Tag value for words containing ideographic characters, lower limit */
    UBRK_WORD_IDEO           = 400,
    /** Tag value for words containing ideographic characters, upper limit */
    UBRK_WORD_IDEO_LIMIT     = 500
} UWordBreak;

/**
 *  Enum constants for the line break tags returned by ubrk_getRuleStatus().
 *  A range of values is defined for each category of
 *  line break, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  <p>
 *  UBRK_LINE_SOFT_LIMIT has the same value as UBRK_LINE_HARD, so a range
 *  test takes the half-open form (tag >= lower && tag < limit).
 *  @stable ICU 2.8
 */
typedef enum ULineBreakTag {
    /** Tag value for soft line breaks, positions at which a line break
      *  is acceptable but not required                */
    UBRK_LINE_SOFT            = 0,
    /** Upper bound for soft line breaks.              */
    UBRK_LINE_SOFT_LIMIT      = 100,
    /** Tag value for a hard, or mandatory line break  */
    UBRK_LINE_HARD            = 100,
    /** Upper bound for hard line breaks.              */
    UBRK_LINE_HARD_LIMIT      = 200
} ULineBreakTag;



/**
 *  Enum constants for the sentence break tags returned by ubrk_getRuleStatus().
 *  A range of values is defined for each category of
 *  sentence, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  <p>
 *  UBRK_SENTENCE_TERM_LIMIT has the same value as UBRK_SENTENCE_SEP, so a
 *  range test takes the half-open form (tag >= lower && tag < limit).
 *  @stable ICU 2.8
 */
typedef enum USentenceBreakTag {
    /** Tag value for sentences ending with a sentence terminator
      * ('.', '?', '!', etc.) character, possibly followed by a
      * hard separator (CR, LF, PS, etc.)
      */
    UBRK_SENTENCE_TERM       = 0,
    /** Upper bound for tags for sentences ended by sentence terminators.    */
    UBRK_SENTENCE_TERM_LIMIT = 100,
    /** Tag value for sentences that do not contain an ending
      * sentence terminator ('.', '?', '!', etc.) character, but
      * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
      */
    UBRK_SENTENCE_SEP        = 100,
    /** Upper bound for tags for sentences ended by a separator.              */
    UBRK_SENTENCE_SEP_LIMIT  = 200
} USentenceBreakTag;


198b113cf2fedaf290242939c8f8c6f7e1438d46024Jim Grosbach/**
199b113cf2fedaf290242939c8f8c6f7e1438d46024Jim Grosbach * Open a new UBreakIterator for locating text boundaries for a specified locale.
200b113cf2fedaf290242939c8f8c6f7e1438d46024Jim Grosbach * A UBreakIterator may be used for detecting character, line, word,
201b113cf2fedaf290242939c8f8c6f7e1438d46024Jim Grosbach * and sentence breaks in text.
202b113cf2fedaf290242939c8f8c6f7e1438d46024Jim Grosbach * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
203b113cf2fedaf290242939c8f8c6f7e1438d46024Jim Grosbach * UBRK_LINE, UBRK_SENTENCE
20496fa612373e258120d351ed14361f964ad22f99dEvan Cheng * @param locale The locale specifying the text-breaking conventions.
205dffb051c21d32209c601ca0ca6baae75b6c6463fJakob Stoklund Olesen * @param text The text to be iterated over.
20696fa612373e258120d351ed14361f964ad22f99dEvan Cheng * @param textLength The number of characters in text, or -1 if null-terminated.
20796fa612373e258120d351ed14361f964ad22f99dEvan Cheng * @param status A UErrorCode to receive any errors.
20896fa612373e258120d351ed14361f964ad22f99dEvan Cheng * @return A UBreakIterator for the specified locale.
209 * @see ubrk_openRules
210 * @stable ICU 2.0
211 */
212U_STABLE UBreakIterator* U_EXPORT2
213ubrk_open(UBreakIteratorType type,
214      const char *locale,
215      const UChar *text,
216      int32_t textLength,
217      UErrorCode *status);
218
219/**
220 * Open a new UBreakIterator for locating text boundaries using specified breaking rules.
221 * The rule syntax is ... (TBD)
222 * @param rules A set of rules specifying the text breaking conventions.
223 * @param rulesLength The number of characters in rules, or -1 if null-terminated.
224 * @param text The text to be iterated over.  May be null, in which case ubrk_setText() is
225 *        used to specify the text to be iterated.
226 * @param textLength The number of characters in text, or -1 if null-terminated.
227 * @param parseErr   Receives position and context information for any syntax errors
228 *                   detected while parsing the rules.
229 * @param status A UErrorCode to receive any errors.
230 * @return A UBreakIterator for the specified rules.
231 * @see ubrk_open
232 * @stable ICU 2.2
233 */
234U_STABLE UBreakIterator* U_EXPORT2
235ubrk_openRules(const UChar     *rules,
236               int32_t         rulesLength,
237               const UChar     *text,
238               int32_t          textLength,
239               UParseError     *parseErr,
240               UErrorCode      *status);
241
242/**
243 * Thread safe cloning operation
244 * @param bi iterator to be cloned
245 * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
246 *  If buffer is not large enough, new memory will be allocated.
247 *  Clients can use the U_BRK_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations.
248 * @param pBufferSize pointer to size of allocated space.
249 *  If *pBufferSize == 0, a sufficient size for use in cloning will
250 *  be returned ('pre-flighting')
251 *  If *pBufferSize is not enough for a stack-based safe clone,
252 *  new memory will be allocated.
253 * @param status to indicate whether the operation went on smoothly or there were errors
254 *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
255 * @return pointer to the new clone
256 * @stable ICU 2.0
257 */
258U_STABLE UBreakIterator * U_EXPORT2
259ubrk_safeClone(
260          const UBreakIterator *bi,
261          void *stackBuffer,
262          int32_t *pBufferSize,
263          UErrorCode *status);
264
/**
  * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
  * @stable ICU 2.0
  */
#define U_BRK_SAFECLONE_BUFFERSIZE 512

271/**
272* Close a UBreakIterator.
273* Once closed, a UBreakIterator may no longer be used.
274* @param bi The break iterator to close.
275 * @stable ICU 2.0
276*/
277U_STABLE void U_EXPORT2
278ubrk_close(UBreakIterator *bi);
279
280/**
281 * Sets an existing iterator to point to a new piece of text
282 * @param bi The iterator to use
283 * @param text The text to be set
284 * @param textLength The length of the text
285 * @param status The error code
286 * @stable ICU 2.0
287 */
288U_STABLE void U_EXPORT2
289ubrk_setText(UBreakIterator* bi,
290             const UChar*    text,
291             int32_t         textLength,
292             UErrorCode*     status);
293
294
295/**
296 * Sets an existing iterator to point to a new piece of text
297 * @param bi The iterator to use
298 * @param text The text to be set.
299 *             This function makes a shallow clone of the supplied UText.  This means
300 *             that the caller is free to immediately close or otherwise reuse the
301 *             UText that was passed as a parameter, but that the underlying text itself
302 *             must not be altered while being referenced by the break iterator.
303 * @param status The error code
304 * @stable ICU 3.4
305 */
306U_STABLE void U_EXPORT2
307ubrk_setUText(UBreakIterator* bi,
308             UText*          text,
309             UErrorCode*     status);
310
311
312
313/**
314 * Determine the most recently-returned text boundary.
315 *
316 * @param bi The break iterator to use.
317 * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
318 * \ref ubrk_first, or \ref ubrk_last.
319 * @stable ICU 2.0
320 */
321U_STABLE int32_t U_EXPORT2
322ubrk_current(const UBreakIterator *bi);
323
324/**
325 * Determine the text boundary following the current text boundary.
326 *
327 * @param bi The break iterator to use.
328 * @return The character index of the next text boundary, or UBRK_DONE
329 * if all text boundaries have been returned.
330 * @see ubrk_previous
331 * @stable ICU 2.0
332 */
333U_STABLE int32_t U_EXPORT2
334ubrk_next(UBreakIterator *bi);
335
336/**
337 * Determine the text boundary preceding the current text boundary.
338 *
339 * @param bi The break iterator to use.
340 * @return The character index of the preceding text boundary, or UBRK_DONE
341 * if all text boundaries have been returned.
342 * @see ubrk_next
343 * @stable ICU 2.0
344 */
345U_STABLE int32_t U_EXPORT2
346ubrk_previous(UBreakIterator *bi);
347
348/**
349 * Determine the index of the first character in the text being scanned.
350 * This is not always the same as index 0 of the text.
351 * @param bi The break iterator to use.
352 * @return The character index of the first character in the text being scanned.
353 * @see ubrk_last
354 * @stable ICU 2.0
355 */
356U_STABLE int32_t U_EXPORT2
357ubrk_first(UBreakIterator *bi);
358
359/**
360 * Determine the index immediately <EM>beyond</EM> the last character in the text being
361 * scanned.
362 * This is not the same as the last character.
363 * @param bi The break iterator to use.
364 * @return The character offset immediately <EM>beyond</EM> the last character in the
365 * text being scanned.
366 * @see ubrk_first
367 * @stable ICU 2.0
368 */
369U_STABLE int32_t U_EXPORT2
370ubrk_last(UBreakIterator *bi);
371
372/**
373 * Determine the text boundary preceding the specified offset.
374 * The value returned is always smaller than offset, or UBRK_DONE.
375 * @param bi The break iterator to use.
376 * @param offset The offset to begin scanning.
377 * @return The text boundary preceding offset, or UBRK_DONE.
378 * @see ubrk_following
379 * @stable ICU 2.0
380 */
381U_STABLE int32_t U_EXPORT2
382ubrk_preceding(UBreakIterator *bi,
383           int32_t offset);
384
385/**
386 * Determine the text boundary following the specified offset.
387 * The value returned is always greater than offset, or UBRK_DONE.
388 * @param bi The break iterator to use.
389 * @param offset The offset to begin scanning.
390 * @return The text boundary following offset, or UBRK_DONE.
391 * @see ubrk_preceding
392 * @stable ICU 2.0
393 */
394U_STABLE int32_t U_EXPORT2
395ubrk_following(UBreakIterator *bi,
396           int32_t offset);
397
398/**
399* Get a locale for which text breaking information is available.
400* A UBreakIterator in a locale returned by this function will perform the correct
401* text breaking for the locale.
402* @param index The index of the desired locale.
403* @return A locale for which number text breaking information is available, or 0 if none.
404* @see ubrk_countAvailable
405* @stable ICU 2.0
406*/
407U_STABLE const char* U_EXPORT2
408ubrk_getAvailable(int32_t index);
409
410/**
411* Determine how many locales have text breaking information available.
412* This function is most useful as determining the loop ending condition for
413* calls to \ref ubrk_getAvailable.
414* @return The number of locales for which text breaking information is available.
415* @see ubrk_getAvailable
416* @stable ICU 2.0
417*/
418U_STABLE int32_t U_EXPORT2
419ubrk_countAvailable(void);
420
421
422/**
423* Returns true if the specfied position is a boundary position.  As a side
424* effect, leaves the iterator pointing to the first boundary position at
425* or after "offset".
426* @param bi The break iterator to use.
427* @param offset the offset to check.
428* @return True if "offset" is a boundary position.
429* @stable ICU 2.0
430*/
431U_STABLE  UBool U_EXPORT2
432ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
433
434/**
435 * Return the status from the break rule that determined the most recently
436 * returned break position.  The values appear in the rule source
437 * within brackets, {123}, for example.  For rules that do not specify a
438 * status, a default value of 0 is returned.
439 * <p>
440 * For word break iterators, the possible values are defined in enum UWordBreak.
441 * @stable ICU 2.2
442 */
443U_STABLE  int32_t U_EXPORT2
444ubrk_getRuleStatus(UBreakIterator *bi);
445
446/**
447 * Get the statuses from the break rules that determined the most recently
448 * returned break position.  The values appear in the rule source
449 * within brackets, {123}, for example.  The default status value for rules
450 * that do not explicitly provide one is zero.
451 * <p>
452 * For word break iterators, the possible values are defined in enum UWordBreak.
453 * @param bi        The break iterator to use
454 * @param fillInVec an array to be filled in with the status values.
455 * @param capacity  the length of the supplied vector.  A length of zero causes
456 *                  the function to return the number of status values, in the
457 *                  normal way, without attemtping to store any values.
458 * @param status    receives error codes.
459 * @return          The number of rule status values from rules that determined
460 *                  the most recent boundary returned by the break iterator.
461 * @stable ICU 3.0
462 */
463U_STABLE  int32_t U_EXPORT2
464ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);
465
466/**
467 * Return the locale of the break iterator. You can choose between the valid and
468 * the actual locale.
469 * @param bi break iterator
470 * @param type locale type (valid or actual)
471 * @param status error code
472 * @return locale string
473 * @stable ICU 2.8
474 */
475U_STABLE const char* U_EXPORT2
476ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
477
478
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

#endif /* UBRK_H */
