1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.  Oracle designates this
9 * particular file as subject to the "Classpath" exception as provided
10 * by Oracle in the LICENSE file that accompanied this code.
11 *
12 * This code is distributed in the hope that it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 * version 2 for more details (a copy is included in the LICENSE file that
16 * accompanied this code).
17 *
18 * You should have received a copy of the GNU General Public License version
19 * 2 along with this work; if not, write to the Free Software Foundation,
20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
23 * or visit www.oracle.com if you need additional information or have any
24 * questions.
25 */
26
27/*
28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
29 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
30 *
31 * The original version of this source code and documentation
32 * is copyrighted and owned by Taligent, Inc., a wholly-owned
33 * subsidiary of IBM. These materials are provided under terms
34 * of a License Agreement between Taligent and Sun. This technology
35 * is protected by multiple US and International patents.
36 *
37 * This notice and attribution to Taligent may not be removed.
38 * Taligent is a registered trademark of Taligent, Inc.
39 *
40 */
41
42package java.text;
43
44import java.util.Locale;
45
46
47// Android-changed: Discourage modification on CharacterIterator after setText. http://b/80456574
48/**
49 * The <code>BreakIterator</code> class implements methods for finding
50 * the location of boundaries in text. Instances of <code>BreakIterator</code>
51 * maintain a current position and scan over text
52 * returning the index of characters where boundaries occur.
53 * Internally, <code>BreakIterator</code> scans text using a
54 * <code>CharacterIterator</code>, and is thus able to scan text held
55 * by any object implementing that protocol. A <code>StringCharacterIterator</code>
56 * is used to scan <code>String</code> objects passed to <code>setText</code>.
57 * The <code>CharacterIterator</code> object must not be modified after having been
58 * passed to <code>setText</code>. If the text in the <code>CharacterIterator</code> object
59 * is changed, the caller must reset <code>BreakIterator</code> by calling
60 * <code>setText</code>.
61 *
62 * <p>
63 * You use the factory methods provided by this class to create
64 * instances of various types of break iterators. In particular,
65 * use <code>getWordInstance</code>, <code>getLineInstance</code>,
66 * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>
67 * to create <code>BreakIterator</code>s that perform
68 * word, line, sentence, and character boundary analysis respectively.
69 * A single <code>BreakIterator</code> can work only on one unit
70 * (word, line, sentence, and so on). You must use a different iterator
71 * for each unit boundary analysis you wish to perform.
72 *
73 * <p><a name="line"></a>
74 * Line boundary analysis determines where a text string can be
75 * broken when line-wrapping. The mechanism correctly handles
76 * punctuation and hyphenated words. Actual line breaking needs
77 * to also consider the available line width and is handled by
78 * higher-level software.
79 *
80 * <p><a name="sentence"></a>
81 * Sentence boundary analysis allows selection with correct interpretation
82 * of periods within numbers and abbreviations, and trailing punctuation
83 * marks such as quotation marks and parentheses.
84 *
85 * <p><a name="word"></a>
86 * Word boundary analysis is used by search and replace functions, as
87 * well as within text editing applications that allow the user to
88 * select words with a double click. Word selection provides correct
89 * interpretation of punctuation marks within and following
90 * words. Characters that are not part of a word, such as symbols
91 * or punctuation marks, have word-breaks on both sides.
92 *
93 * <p><a name="character"></a>
94 * Character boundary analysis allows users to interact with characters
95 * as they expect to, for example, when moving the cursor through a text
96 * string. Character boundary analysis provides correct navigation
97 * through character strings, regardless of how the character is stored.
98 * The boundaries returned may be those of supplementary characters,
99 * combining character sequences, or ligature clusters.
100 * For example, an accented character might be stored as a base character
101 * and a diacritical mark. What users consider to be a character can
102 * differ between languages.
103 *
104 * <p>
105 * The <code>BreakIterator</code> instances returned by the factory methods
106 * of this class are intended for use with natural languages only, not for
107 * programming language text. It is however possible to define subclasses
108 * that tokenize a programming language.
109 *
110 * <P>
111 * <strong>Examples</strong>:<P>
112 * Creating and using text boundaries:
113 * <blockquote>
114 * <pre>
115 * public static void main(String args[]) {
116 *      if (args.length == 1) {
117 *          String stringToExamine = args[0];
118 *          //print each word in order
119 *          BreakIterator boundary = BreakIterator.getWordInstance();
120 *          boundary.setText(stringToExamine);
121 *          printEachForward(boundary, stringToExamine);
122 *          //print each sentence in reverse order
123 *          boundary = BreakIterator.getSentenceInstance(Locale.US);
124 *          boundary.setText(stringToExamine);
125 *          printEachBackward(boundary, stringToExamine);
126 *          printFirst(boundary, stringToExamine);
127 *          printLast(boundary, stringToExamine);
128 *      }
129 * }
130 * </pre>
131 * </blockquote>
132 *
133 * Print each element in order:
134 * <blockquote>
135 * <pre>
136 * public static void printEachForward(BreakIterator boundary, String source) {
137 *     int start = boundary.first();
138 *     for (int end = boundary.next();
139 *          end != BreakIterator.DONE;
140 *          start = end, end = boundary.next()) {
141 *          System.out.println(source.substring(start,end));
142 *     }
143 * }
144 * </pre>
145 * </blockquote>
146 *
147 * Print each element in reverse order:
148 * <blockquote>
149 * <pre>
150 * public static void printEachBackward(BreakIterator boundary, String source) {
151 *     int end = boundary.last();
152 *     for (int start = boundary.previous();
153 *          start != BreakIterator.DONE;
154 *          end = start, start = boundary.previous()) {
155 *         System.out.println(source.substring(start,end));
156 *     }
157 * }
158 * </pre>
159 * </blockquote>
160 *
161 * Print first element:
162 * <blockquote>
163 * <pre>
164 * public static void printFirst(BreakIterator boundary, String source) {
165 *     int start = boundary.first();
166 *     int end = boundary.next();
167 *     System.out.println(source.substring(start,end));
168 * }
169 * </pre>
170 * </blockquote>
171 *
172 * Print last element:
173 * <blockquote>
174 * <pre>
175 * public static void printLast(BreakIterator boundary, String source) {
176 *     int end = boundary.last();
177 *     int start = boundary.previous();
178 *     System.out.println(source.substring(start,end));
179 * }
180 * </pre>
181 * </blockquote>
182 *
183 * Print the element at a specified position:
184 * <blockquote>
185 * <pre>
186 * public static void printAt(BreakIterator boundary, int pos, String source) {
187 *     int end = boundary.following(pos);
188 *     int start = boundary.previous();
189 *     System.out.println(source.substring(start,end));
190 * }
191 * </pre>
192 * </blockquote>
193 *
194 * Find the next word:
195 * <blockquote>
196 * <pre>{@code
197 * public static int nextWordStartAfter(int pos, String text) {
198 *     BreakIterator wb = BreakIterator.getWordInstance();
199 *     wb.setText(text);
200 *     int last = wb.following(pos);
201 *     int current = wb.next();
202 *     while (current != BreakIterator.DONE) {
203 *         for (int p = last; p < current; p++) {
204 *             if (Character.isLetter(text.codePointAt(p)))
205 *                 return last;
206 *         }
207 *         last = current;
208 *         current = wb.next();
209 *     }
210 *     return BreakIterator.DONE;
211 * }
212 * }</pre>
213 * (The iterator returned by BreakIterator.getWordInstance() is unique in that
214 * the break positions it returns don't represent both the start and end of the
215 * thing being iterated over.  That is, a sentence-break iterator returns breaks
216 * that each represent the end of one sentence and the beginning of the next.
217 * With the word-break iterator, the characters between two boundaries might be a
218 * word, or they might be the punctuation or whitespace between two words.  The
219 * above code uses a simple heuristic to determine which boundary is the beginning
220 * of a word: If the characters between this boundary and the next boundary
221 * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
222 * a Hangul syllable, a Kana character, etc.), then the text between this boundary
223 * and the next is a word; otherwise, it's the material between words.)
224 * </blockquote>
225 *
226 * @see CharacterIterator
227 *
228 */
229
230public abstract class BreakIterator implements Cloneable
231{
232    /**
233     * Constructor. BreakIterator is stateless and has no default behavior.
234     */
235    protected BreakIterator()
236    {
237    }
238
239    /**
240     * Create a copy of this iterator
241     * @return A copy of this
242     */
243    @Override
244    public Object clone()
245    {
246        try {
247            return super.clone();
248        }
249        catch (CloneNotSupportedException e) {
250            throw new InternalError(e);
251        }
252    }
253
254    /**
255     * DONE is returned by previous(), next(), next(int), preceding(int)
256     * and following(int) when either the first or last text boundary has been
257     * reached.
258     */
259    public static final int DONE = -1;
260
261    /**
262     * Returns the first boundary. The iterator's current position is set
263     * to the first text boundary.
264     * @return The character index of the first text boundary.
265     */
266    public abstract int first();
267
268    /**
269     * Returns the last boundary. The iterator's current position is set
270     * to the last text boundary.
271     * @return The character index of the last text boundary.
272     */
273    public abstract int last();
274
275    /**
276     * Returns the nth boundary from the current boundary. If either
277     * the first or last text boundary has been reached, it returns
278     * <code>BreakIterator.DONE</code> and the current position is set to either
279     * the first or last text boundary depending on which one is reached. Otherwise,
280     * the iterator's current position is set to the new boundary.
281     * For example, if the iterator's current position is the mth text boundary
282     * and three more boundaries exist from the current boundary to the last text
283     * boundary, the next(2) call will return m + 2. The new text position is set
284     * to the (m + 2)th text boundary. A next(4) call would return
285     * <code>BreakIterator.DONE</code> and the last text boundary would become the
286     * new text position.
287     * @param n which boundary to return.  A value of 0
288     * does nothing.  Negative values move to previous boundaries
289     * and positive values move to later boundaries.
290     * @return The character index of the nth boundary from the current position
291     * or <code>BreakIterator.DONE</code> if either first or last text boundary
292     * has been reached.
293     */
294    public abstract int next(int n);
295
296    /**
297     * Returns the boundary following the current boundary. If the current boundary
298     * is the last text boundary, it returns <code>BreakIterator.DONE</code> and
299     * the iterator's current position is unchanged. Otherwise, the iterator's
300     * current position is set to the boundary following the current boundary.
301     * @return The character index of the next text boundary or
302     * <code>BreakIterator.DONE</code> if the current boundary is the last text
303     * boundary.
304     * Equivalent to next(1).
305     * @see #next(int)
306     */
307    public abstract int next();
308
309    /**
310     * Returns the boundary preceding the current boundary. If the current boundary
311     * is the first text boundary, it returns <code>BreakIterator.DONE</code> and
312     * the iterator's current position is unchanged. Otherwise, the iterator's
313     * current position is set to the boundary preceding the current boundary.
314     * @return The character index of the previous text boundary or
315     * <code>BreakIterator.DONE</code> if the current boundary is the first text
316     * boundary.
317     */
318    public abstract int previous();
319
320    /**
321     * Returns the first boundary following the specified character offset. If the
322     * specified offset equals to the last text boundary, it returns
323     * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
324     * Otherwise, the iterator's current position is set to the returned boundary.
325     * The value returned is always greater than the offset or the value
326     * <code>BreakIterator.DONE</code>.
327     * @param offset the character offset to begin scanning.
328     * @return The first boundary after the specified offset or
329     * <code>BreakIterator.DONE</code> if the last text boundary is passed in
330     * as the offset.
331     * @exception  IllegalArgumentException if the specified offset is less than
332     * the first text boundary or greater than the last text boundary.
333     */
334    public abstract int following(int offset);
335
336    /**
337     * Returns the last boundary preceding the specified character offset. If the
338     * specified offset equals to the first text boundary, it returns
339     * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
340     * Otherwise, the iterator's current position is set to the returned boundary.
341     * The value returned is always less than the offset or the value
342     * <code>BreakIterator.DONE</code>.
343     * @param offset the character offset to begin scanning.
344     * @return The last boundary before the specified offset or
345     * <code>BreakIterator.DONE</code> if the first text boundary is passed in
346     * as the offset.
347     * @exception   IllegalArgumentException if the specified offset is less than
348     * the first text boundary or greater than the last text boundary.
349     * @since 1.2
350     */
351    public int preceding(int offset) {
352        // NOTE:  This implementation is here solely because we can't add new
353        // abstract methods to an existing class.  There is almost ALWAYS a
354        // better, faster way to do this.
355        int pos = following(offset);
356        while (pos >= offset && pos != DONE) {
357            pos = previous();
358        }
359        return pos;
360    }
361
362    /**
363     * Returns true if the specified character offset is a text boundary.
364     * @param offset the character offset to check.
365     * @return <code>true</code> if "offset" is a boundary position,
366     * <code>false</code> otherwise.
367     * @exception   IllegalArgumentException if the specified offset is less than
368     * the first text boundary or greater than the last text boundary.
369     * @since 1.2
370     */
371    public boolean isBoundary(int offset) {
372        // NOTE: This implementation probably is wrong for most situations
373        // because it fails to take into account the possibility that a
374        // CharacterIterator passed to setText() may not have a begin offset
375        // of 0.  But since the abstract BreakIterator doesn't have that
376        // knowledge, it assumes the begin offset is 0.  If you subclass
377        // BreakIterator, copy the SimpleTextBoundary implementation of this
378        // function into your subclass.  [This should have been abstract at
379        // this level, but it's too late to fix that now.]
380        if (offset == 0) {
381            return true;
382        }
383        int boundary = following(offset - 1);
384        if (boundary == DONE) {
385            throw new IllegalArgumentException();
386        }
387        return boundary == offset;
388    }
389
390    /**
391     * Returns character index of the text boundary that was most
392     * recently returned by next(), next(int), previous(), first(), last(),
393     * following(int) or preceding(int). If any of these methods returns
394     * <code>BreakIterator.DONE</code> because either first or last text boundary
395     * has been reached, it returns the first or last text boundary depending on
396     * which one is reached.
397     * @return The text boundary returned from the above methods, first or last
398     * text boundary.
399     * @see #next()
400     * @see #next(int)
401     * @see #previous()
402     * @see #first()
403     * @see #last()
404     * @see #following(int)
405     * @see #preceding(int)
406     */
407    public abstract int current();
408
409    /**
410     * Get the text being scanned
411     * @return the text being scanned
412     */
413    public abstract CharacterIterator getText();
414
415    /**
416     * Set a new text string to be scanned.  The current scan
417     * position is reset to first().
418     * @param newText new text to scan.
419     */
420    public void setText(String newText)
421    {
422        setText(new StringCharacterIterator(newText));
423    }
424
425    /**
426     * Set a new text for scanning.  The current scan
427     * position is reset to first().
428     * @param newText new text to scan.
429     */
430    public abstract void setText(CharacterIterator newText);
431
432    // Android-removed: Removed code related to BreakIteratorProvider support.
433
434    /**
435     * Returns a new <code>BreakIterator</code> instance
436     * for <a href="BreakIterator.html#word">word breaks</a>
437     * for the {@linkplain Locale#getDefault() default locale}.
438     * @return A break iterator for word breaks
439     */
440    public static BreakIterator getWordInstance()
441    {
442        return getWordInstance(Locale.getDefault());
443    }
444
445    /**
446     * Returns a new <code>BreakIterator</code> instance
447     * for <a href="BreakIterator.html#word">word breaks</a>
448     * for the given locale.
449     * @param locale the desired locale
450     * @return A break iterator for word breaks
451     * @exception NullPointerException if <code>locale</code> is null
452     */
453    public static BreakIterator getWordInstance(Locale locale)
454    {
455        // Android-changed: Switched to ICU.
456        return new IcuIteratorWrapper(
457                android.icu.text.BreakIterator.getWordInstance(locale));
458    }
459
460    /**
461     * Returns a new <code>BreakIterator</code> instance
462     * for <a href="BreakIterator.html#line">line breaks</a>
463     * for the {@linkplain Locale#getDefault() default locale}.
464     * @return A break iterator for line breaks
465     */
466    public static BreakIterator getLineInstance()
467    {
468        return getLineInstance(Locale.getDefault());
469    }
470
471    /**
472     * Returns a new <code>BreakIterator</code> instance
473     * for <a href="BreakIterator.html#line">line breaks</a>
474     * for the given locale.
475     * @param locale the desired locale
476     * @return A break iterator for line breaks
477     * @exception NullPointerException if <code>locale</code> is null
478     */
479    public static BreakIterator getLineInstance(Locale locale)
480    {
481        // Android-changed: Switched to ICU.
482        return new IcuIteratorWrapper(
483                android.icu.text.BreakIterator.getLineInstance(locale));
484    }
485
486    /**
487     * Returns a new <code>BreakIterator</code> instance
488     * for <a href="BreakIterator.html#character">character breaks</a>
489     * for the {@linkplain Locale#getDefault() default locale}.
490     * @return A break iterator for character breaks
491     */
492    public static BreakIterator getCharacterInstance()
493    {
494        return getCharacterInstance(Locale.getDefault());
495    }
496
497    /**
498     * Returns a new <code>BreakIterator</code> instance
499     * for <a href="BreakIterator.html#character">character breaks</a>
500     * for the given locale.
501     * @param locale the desired locale
502     * @return A break iterator for character breaks
503     * @exception NullPointerException if <code>locale</code> is null
504     */
505    public static BreakIterator getCharacterInstance(Locale locale)
506    {
507        // Android-changed: Switched to ICU.
508        return new IcuIteratorWrapper(
509                android.icu.text.BreakIterator.getCharacterInstance(locale));
510    }
511
512    /**
513     * Returns a new <code>BreakIterator</code> instance
514     * for <a href="BreakIterator.html#sentence">sentence breaks</a>
515     * for the {@linkplain Locale#getDefault() default locale}.
516     * @return A break iterator for sentence breaks
517     */
518    public static BreakIterator getSentenceInstance()
519    {
520        return getSentenceInstance(Locale.getDefault());
521    }
522
523    /**
524     * Returns a new <code>BreakIterator</code> instance
525     * for <a href="BreakIterator.html#sentence">sentence breaks</a>
526     * for the given locale.
527     * @param locale the desired locale
528     * @return A break iterator for sentence breaks
529     * @exception NullPointerException if <code>locale</code> is null
530     */
531    public static BreakIterator getSentenceInstance(Locale locale)
532    {
533        // Android-changed: Switched to ICU.
534        return new IcuIteratorWrapper(
535                android.icu.text.BreakIterator.getSentenceInstance(locale));
536    }
537
538    // Android-removed: Removed code related to BreakIteratorProvider support.
539
540    // Android-changed: Removed references to BreakIteratorProvider from JavaDoc.
541    /**
542     * Returns an array of all locales for which the
543     * <code>get*Instance</code> methods of this class can return
544     * localized instances.
545     *
546     * @return An array of locales for which localized
547     *         <code>BreakIterator</code> instances are available.
548     */
549    public static synchronized Locale[] getAvailableLocales()
550    {
551        // Android-changed: Switched to ICU.
552        return android.icu.text.BreakIterator.getAvailableLocales();
553    }
554}
555