1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements.  See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License.  You may obtain a copy of the License at
8 *
9 *     http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18package java.text;
19
20import java.util.Locale;
21import libcore.icu.ICU;
22import libcore.icu.NativeBreakIterator;
23
24/**
25 * Locates boundaries in text. This class defines a protocol for objects that
26 * break up a piece of natural-language text according to a set of criteria.
27 * Instances or subclasses of {@code BreakIterator} can be provided, for
28 * example, to break a piece of text into words, sentences, or logical
29 * characters according to the conventions of some language or group of
30 * languages. We provide four built-in types of {@code BreakIterator}:
31 * <ul>
32 * <li>{@link #getSentenceInstance()} returns a {@code BreakIterator} that
33 * locates boundaries between sentences. This is useful for triple-click
34 * selection, for example.</li>
35 * <li>{@link #getWordInstance()} returns a {@code BreakIterator} that locates
36 * boundaries between words. This is useful for double-click selection or "find
37 * whole words" searches. This type of {@code BreakIterator} makes sure there is
38 * a boundary position at the beginning and end of each legal word (numbers
39 * count as words, too). Whitespace and punctuation are kept separate from real
40 * words.</li>
 * <li>{@link #getLineInstance()} returns a {@code BreakIterator} that locates
42 * positions where it is legal for a text editor to wrap lines. This is similar
43 * to word breaking, but not the same: punctuation and whitespace are generally
44 * kept with words (you don't want a line to start with whitespace, for
45 * example), and some special characters can force a position to be considered a
46 * line break position or prevent a position from being a line break position.</li>
 * <li>{@link #getCharacterInstance()} returns a {@code BreakIterator} that
48 * locates boundaries between logical characters. Because of the structure of
49 * the Unicode encoding, a logical character may be stored internally as more
50 * than one Unicode code point. (A with an umlaut may be stored as an a followed
51 * by a separate combining umlaut character, for example, but the user still
52 * thinks of it as one character.) This iterator allows various processes
53 * (especially text editors) to treat as characters the units of text that a
54 * user would think of as characters, rather than the units of text that the
55 * computer sees as "characters".</li>
56 * </ul> {@code BreakIterator}'s interface follows an "iterator" model (hence
57 * the name), meaning it has a concept of a "current position" and methods like
58 * {@code first()}, {@code last()}, {@code next()}, and {@code previous()} that
59 * update the current position. All {@code BreakIterator}s uphold the following
60 * invariants:
61 * <ul>
62 * <li>The beginning and end of the text are always treated as boundary
63 * positions.</li>
64 * <li>The current position of the iterator is always a boundary position
 * (random-access methods move the iterator to the nearest boundary position
66 * before or after the specified position, not <i>to</i> the specified
67 * position).</li>
68 * <li>{@code DONE} is used as a flag to indicate when iteration has stopped.
69 * {@code DONE} is only returned when the current position is the end of the
70 * text and the user calls {@code next()}, or when the current position is the
71 * beginning of the text and the user calls {@code previous()}.</li>
72 * <li>Break positions are numbered by the positions of the characters that
73 * follow them. Thus, under normal circumstances, the position before the first
74 * character is 0, the position after the first character is 1, and the position
 * after the last character is equal to the length of the string.</li>
76 * <li>The client can change the position of an iterator, or the text it
77 * analyzes, at will, but cannot change the behavior. If the user wants
 * different behavior, a new iterator must be instantiated.</li>
79 * </ul>
80 * <p>
81 * {@code BreakIterator} accesses the text it analyzes through a
82 * {@link CharacterIterator}, which makes it possible to use {@code
83 * BreakIterator} to analyze text in any text-storage vehicle that provides a
84 * {@code CharacterIterator} interface.
85 * <p>
86 * <em>Note:</em> Some types of {@code BreakIterator} can take a long time to
87 * create, and instances of {@code BreakIterator} are not currently cached by
88 * the system. For optimal performance, keep instances of {@code BreakIterator}
89 * around as long as it makes sense. For example, when word-wrapping a document,
90 * don't create and destroy a new {@code BreakIterator} for each line. Create
91 * one break iterator for the whole document (or whatever stretch of text you're
92 * wrapping) and use it to do the whole job of wrapping the text.
93 * <p>
94 * <em>Examples</em>:
95 * <p>
96 * Creating and using text boundaries:
97 * <blockquote>
98 *
99 * <pre>
100 * public static void main(String args[]) {
101 *     if (args.length == 1) {
102 *         String stringToExamine = args[0];
103 *         //print each word in order
104 *         BreakIterator boundary = BreakIterator.getWordInstance();
105 *         boundary.setText(stringToExamine);
106 *         printEachForward(boundary, stringToExamine);
107 *         //print each sentence in reverse order
108 *         boundary = BreakIterator.getSentenceInstance(Locale.US);
109 *         boundary.setText(stringToExamine);
110 *         printEachBackward(boundary, stringToExamine);
111 *         printFirst(boundary, stringToExamine);
112 *         printLast(boundary, stringToExamine);
113 *     }
114 * }
115 * </pre>
116 *
117 * </blockquote>
118 * <p>
119 * Print each element in order:
120 * <blockquote>
121 *
122 * <pre>
123 * public static void printEachForward(BreakIterator boundary, String source) {
124 *     int start = boundary.first();
125 *     for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
126 *         System.out.println(source.substring(start, end));
127 *     }
128 * }
129 * </pre>
130 *
131 * </blockquote>
132 * <p>
133 * Print each element in reverse order:
134 * <blockquote>
135 *
136 * <pre>
137 * public static void printEachBackward(BreakIterator boundary, String source) {
138 *     int end = boundary.last();
139 *     for (int start = boundary.previous(); start != BreakIterator.DONE; end = start, start = boundary
140 *             .previous()) {
141 *         System.out.println(source.substring(start, end));
142 *     }
143 * }
144 * </pre>
145 *
146 * </blockquote>
147 * <p>
148 * Print the first element:
149 * <blockquote>
150 *
151 * <pre>
152 * public static void printFirst(BreakIterator boundary, String source) {
153 *     int start = boundary.first();
154 *     int end = boundary.next();
155 *     System.out.println(source.substring(start, end));
156 * }
157 * </pre>
158 *
159 * </blockquote>
160 * <p>
161 * Print the last element:
162 * <blockquote>
163 *
164 * <pre>
165 * public static void printLast(BreakIterator boundary, String source) {
166 *     int end = boundary.last();
167 *     int start = boundary.previous();
168 *     System.out.println(source.substring(start, end));
169 * }
170 * </pre>
171 *
172 * </blockquote>
173 * <p>
174 * Print the element at a specified position:
175 * <blockquote>
176 *
177 * <pre>
178 * public static void printAt(BreakIterator boundary, int pos, String source) {
179 *     int end = boundary.following(pos);
180 *     int start = boundary.previous();
181 *     System.out.println(source.substring(start, end));
182 * }
183 * </pre>
184 *
185 * </blockquote>
186 * <p>
187 * Find the next word:
188 * <blockquote>
189 *
190 * <pre>
191 * public static int nextWordStartAfter(int pos, String text) {
192 *     BreakIterator wb = BreakIterator.getWordInstance();
193 *     wb.setText(text);
194 *     int last = wb.following(pos);
195 *     int current = wb.next();
196 *     while (current != BreakIterator.DONE) {
197 *         for (int p = last; p &lt; current; p++) {
198 *             if (Character.isLetter(text.charAt(p)))
199 *                 return last;
200 *         }
201 *         last = current;
202 *         current = wb.next();
203 *     }
204 *     return BreakIterator.DONE;
205 * }
206 * </pre>
207 *
208 * </blockquote>
209 * <p>
210 * The iterator returned by {@code BreakIterator.getWordInstance()} is unique in
211 * that the break positions it returns don't represent both the start and end of
212 * the thing being iterated over. That is, a sentence-break iterator returns
213 * breaks that each represent the end of one sentence and the beginning of the
214 * next. With the word-break iterator, the characters between two boundaries
215 * might be a word, or they might be the punctuation or whitespace between two
216 * words. The above code uses a simple heuristic to determine which boundary is
217 * the beginning of a word: If the characters between this boundary and the next
218 * boundary include at least one letter (this can be an alphabetical letter, a
219 * CJK ideograph, a Hangul syllable, a Kana character, etc.), then the text
220 * between this boundary and the next is a word; otherwise, it's the material
 * between words.
222 *
223 * @see CharacterIterator
224 */
225public abstract class BreakIterator implements Cloneable {
226
227    /**
228     * This constant is returned by iterate methods like {@code previous()} or
229     * {@code next()} if they have returned all valid boundaries.
230     */
231    public static final int DONE = -1;
232
233    // the wrapped ICU implementation
234    NativeBreakIterator wrapped;
235
236    /**
237     * Default constructor, for use by subclasses.
238     */
239    protected BreakIterator() {
240    }
241
242    /*
243     * wrapping constructor
244     */
245    BreakIterator(NativeBreakIterator iterator) {
246        wrapped = iterator;
247    }
248
249    /**
250     * Returns an array of locales for which custom {@code BreakIterator} instances
251     * are available.
252     * <p>Note that Android does not support user-supplied locale service providers.
253     */
254    public static Locale[] getAvailableLocales() {
255        return ICU.getAvailableBreakIteratorLocales();
256    }
257
258    /**
259     * Returns a new instance of {@code BreakIterator} to iterate over
260     * characters using the user's default locale.
261     * See "<a href="../util/Locale.html#default_locale">Be wary of the default locale</a>".
262     * @return a new instance of {@code BreakIterator} using the default locale.
263     */
264    public static BreakIterator getCharacterInstance() {
265        return getCharacterInstance(Locale.getDefault());
266    }
267
268    /**
269     * Returns a new instance of {@code BreakIterator} to iterate over
270     * characters using the given locale.
271     *
272     * @param where
273     *            the given locale.
274     * @return a new instance of {@code BreakIterator} using the given locale.
275     */
276    public static BreakIterator getCharacterInstance(Locale where) {
277        return new RuleBasedBreakIterator(NativeBreakIterator.getCharacterInstance(where));
278    }
279
280    /**
281     * Returns a new instance of {{@code BreakIterator} to iterate over
282     * line breaks using the user's default locale.
283     * See "<a href="../util/Locale.html#default_locale">Be wary of the default locale</a>".
284     * @return a new instance of {@code BreakIterator} using the default locale.
285     */
286    public static BreakIterator getLineInstance() {
287        return getLineInstance(Locale.getDefault());
288    }
289
290    /**
291     * Returns a new instance of {@code BreakIterator} to iterate over
292     * line breaks using the given locale.
293     *
294     * @param where
295     *            the given locale.
296     * @return a new instance of {@code BreakIterator} using the given locale.
297     * @throws NullPointerException if {@code where} is {@code null}.
298     */
299    public static BreakIterator getLineInstance(Locale where) {
300        return new RuleBasedBreakIterator(NativeBreakIterator.getLineInstance(where));
301    }
302
303    /**
304     * Returns a new instance of {@code BreakIterator} to iterate over
305     * sentence-breaks using the default locale.
306     * See "<a href="../util/Locale.html#default_locale">Be wary of the default locale</a>".
307     * @return a new instance of {@code BreakIterator} using the default locale.
308     */
309    public static BreakIterator getSentenceInstance() {
310        return getSentenceInstance(Locale.getDefault());
311    }
312
313    /**
314     * Returns a new instance of {@code BreakIterator} to iterate over
315     * sentence-breaks using the given locale.
316     *
317     * @param where
318     *            the given locale.
319     * @return a new instance of {@code BreakIterator} using the given locale.
320     * @throws NullPointerException if {@code where} is {@code null}.
321     */
322    public static BreakIterator getSentenceInstance(Locale where) {
323        return new RuleBasedBreakIterator(NativeBreakIterator.getSentenceInstance(where));
324    }
325
326    /**
327     * Returns a new instance of {@code BreakIterator} to iterate over
328     * word-breaks using the default locale.
329     * See "<a href="../util/Locale.html#default_locale">Be wary of the default locale</a>".
330     * @return a new instance of {@code BreakIterator} using the default locale.
331     */
332    public static BreakIterator getWordInstance() {
333        return getWordInstance(Locale.getDefault());
334    }
335
336    /**
337     * Returns a new instance of {@code BreakIterator} to iterate over
338     * word-breaks using the given locale.
339     *
340     * @param where
341     *            the given locale.
342     * @return a new instance of {@code BreakIterator} using the given locale.
343     * @throws NullPointerException if {@code where} is {@code null}.
344     */
345    public static BreakIterator getWordInstance(Locale where) {
346        return new RuleBasedBreakIterator(NativeBreakIterator.getWordInstance(where));
347    }
348
349    /**
350     * Indicates whether the given offset is a boundary position. If this method
351     * returns true, the current iteration position is set to the given
352     * position; if the function returns false, the current iteration position
353     * is set as though {@link #following(int)} had been called.
354     *
355     * @param offset
356     *            the given offset to check.
357     * @return {@code true} if the given offset is a boundary position; {@code
358     *         false} otherwise.
359     */
360    public boolean isBoundary(int offset) {
361        return wrapped.isBoundary(offset);
362    }
363
364    /**
365     * Returns the position of last boundary preceding the given offset, and
366     * sets the current position to the returned value, or {@code DONE} if the
367     * given offset specifies the starting position.
368     *
369     * @param offset
370     *            the given start position to be searched for.
371     * @return the position of the last boundary preceding the given offset.
372     * @throws IllegalArgumentException
373     *            if the offset is invalid.
374     */
375    public int preceding(int offset) {
376        return wrapped.preceding(offset);
377    }
378
379    /**
380     * Sets the new text string to be analyzed, the current position will be
381     * reset to the beginning of this new string, and the old string will be
382     * lost.
383     *
384     * @param newText
385     *            the new text string to be analyzed.
386     */
387    public void setText(String newText) {
388        if (newText == null) {
389            throw new NullPointerException("newText == null");
390        }
391        wrapped.setText(newText);
392    }
393
394    /**
395     * Returns this iterator's current position.
396     *
397     * @return this iterator's current position.
398     */
399    public abstract int current();
400
401    /**
402     * Sets this iterator's current position to the first boundary and returns
403     * that position.
404     *
405     * @return the position of the first boundary.
406     */
407    public abstract int first();
408
409    /**
410     * Sets the position of the first boundary to the one following the given
411     * offset and returns this position. Returns {@code DONE} if there is no
412     * boundary after the given offset.
413     *
414     * @param offset
415     *            the given position to be searched for.
416     * @return the position of the first boundary following the given offset.
417     * @throws IllegalArgumentException
418     *            if the offset is invalid.
419     */
420    public abstract int following(int offset);
421
422    /**
423     * Returns a {@code CharacterIterator} which represents the text being
424     * analyzed. Please note that the returned value is probably the internal
425     * iterator used by this object. If the invoker wants to modify the status
426     * of the returned iterator, it is recommended to first create a clone of
427     * the iterator returned.
428     *
429     * @return a {@code CharacterIterator} which represents the text being
430     *         analyzed.
431     */
432    public abstract CharacterIterator getText();
433
434    /**
435     * Sets this iterator's current position to the last boundary and returns
436     * that position.
437     *
438     * @return the position of last boundary.
439     */
440    public abstract int last();
441
442    /**
443     * Sets this iterator's current position to the next boundary after the
444     * current position, and returns this position. Returns {@code DONE} if no
445     * boundary was found after the current position.
446     *
447     * @return the position of last boundary.
448     */
449    public abstract int next();
450
451    /**
452     * Sets this iterator's current position to the next boundary after the
453     * given position, and returns that position. Returns {@code DONE} if no
454     * boundary was found after the given position.
455     *
456     * @param n
457     *            the given position.
458     * @return the position of last boundary.
459     */
460    public abstract int next(int n);
461
462    /**
463     * Sets this iterator's current position to the previous boundary before the
464     * current position and returns that position. Returns {@code DONE} if
465     * no boundary was found before the current position.
466     *
467     * @return the position of last boundary.
468     */
469    public abstract int previous();
470
471    /**
472     * Sets the new text to be analyzed by the given {@code CharacterIterator}.
473     * The position will be reset to the beginning of the new text, and other
474     * status information of this iterator will be kept.
475     *
476     * @param newText
477     *            the {@code CharacterIterator} referring to the text to be
478     *            analyzed.
479     */
480    public abstract void setText(CharacterIterator newText);
481
482    /**
483     * Returns a copy of this iterator.
484     */
485    @Override
486    public Object clone() {
487        try {
488            BreakIterator cloned = (BreakIterator) super.clone();
489            cloned.wrapped = (NativeBreakIterator) wrapped.clone();
490            return cloned;
491        } catch (CloneNotSupportedException e) {
492            throw new AssertionError(e);
493        }
494    }
495}
496