1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 * Copyright (c) 1996, 2006, Oracle and/or its affiliates. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.  Oracle designates this
9 * particular file as subject to the "Classpath" exception as provided
10 * by Oracle in the LICENSE file that accompanied this code.
11 *
12 * This code is distributed in the hope that it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 * version 2 for more details (a copy is included in the LICENSE file that
16 * accompanied this code).
17 *
18 * You should have received a copy of the GNU General Public License version
19 * 2 along with this work; if not, write to the Free Software Foundation,
20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
23 * or visit www.oracle.com if you need additional information or have any
24 * questions.
25 */
26
27/*
28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
29 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
30 *
31 * The original version of this source code and documentation
32 * is copyrighted and owned by Taligent, Inc., a wholly-owned
33 * subsidiary of IBM. These materials are provided under terms
34 * of a License Agreement between Taligent and Sun. This technology
35 * is protected by multiple US and International patents.
36 *
37 * This notice and attribution to Taligent may not be removed.
38 * Taligent is a registered trademark of Taligent, Inc.
39 *
40 */
41
42package java.text;
43
44import java.util.Locale;
45
46
47/**
48 * The <code>BreakIterator</code> class implements methods for finding
49 * the location of boundaries in text. Instances of <code>BreakIterator</code>
50 * maintain a current position and scan over text
51 * returning the index of characters where boundaries occur.
52 * Internally, <code>BreakIterator</code> scans text using a
53 * <code>CharacterIterator</code>, and is thus able to scan text held
54 * by any object implementing that protocol. A <code>StringCharacterIterator</code>
55 * is used to scan <code>String</code> objects passed to <code>setText</code>.
56 *
57 * <p>
58 * You use the factory methods provided by this class to create
59 * instances of various types of break iterators. In particular,
60 * use <code>getWordInstance</code>, <code>getLineInstance</code>,
61 * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>
62 * to create <code>BreakIterator</code>s that perform
63 * word, line, sentence, and character boundary analysis respectively.
64 * A single <code>BreakIterator</code> can work only on one unit
65 * (word, line, sentence, and so on). You must use a different iterator
66 * for each unit boundary analysis you wish to perform.
67 *
68 * <p><a name="line"></a>
69 * Line boundary analysis determines where a text string can be
70 * broken when line-wrapping. The mechanism correctly handles
71 * punctuation and hyphenated words. Actual line breaking needs
72 * to also consider the available line width and is handled by
73 * higher-level software.
74 *
75 * <p><a name="sentence"></a>
76 * Sentence boundary analysis allows selection with correct interpretation
77 * of periods within numbers and abbreviations, and trailing punctuation
78 * marks such as quotation marks and parentheses.
79 *
80 * <p><a name="word"></a>
81 * Word boundary analysis is used by search and replace functions, as
82 * well as within text editing applications that allow the user to
83 * select words with a double click. Word selection provides correct
84 * interpretation of punctuation marks within and following
85 * words. Characters that are not part of a word, such as symbols
86 * or punctuation marks, have word-breaks on both sides.
87 *
88 * <p><a name="character"></a>
89 * Character boundary analysis allows users to interact with characters
90 * as they expect to, for example, when moving the cursor through a text
91 * string. Character boundary analysis provides correct navigation
92 * through character strings, regardless of how the character is stored.
93 * The boundaries returned may be those of supplementary characters,
94 * combining character sequences, or ligature clusters.
95 * For example, an accented character might be stored as a base character
96 * and a diacritical mark. What users consider to be a character can
97 * differ between languages.
98 *
99 * <p>
100 * The <code>BreakIterator</code> instances returned by the factory methods
101 * of this class are intended for use with natural languages only, not for
102 * programming language text. It is however possible to define subclasses
103 * that tokenize a programming language.
104 *
105 * <P>
106 * <strong>Examples</strong>:<P>
107 * Creating and using text boundaries:
108 * <blockquote>
109 * <pre>
110 * public static void main(String args[]) {
111 *      if (args.length == 1) {
112 *          String stringToExamine = args[0];
113 *          //print each word in order
114 *          BreakIterator boundary = BreakIterator.getWordInstance();
115 *          boundary.setText(stringToExamine);
116 *          printEachForward(boundary, stringToExamine);
117 *          //print each sentence in reverse order
118 *          boundary = BreakIterator.getSentenceInstance(Locale.US);
119 *          boundary.setText(stringToExamine);
120 *          printEachBackward(boundary, stringToExamine);
121 *          printFirst(boundary, stringToExamine);
122 *          printLast(boundary, stringToExamine);
123 *      }
124 * }
125 * </pre>
126 * </blockquote>
127 *
128 * Print each element in order:
129 * <blockquote>
130 * <pre>
131 * public static void printEachForward(BreakIterator boundary, String source) {
132 *     int start = boundary.first();
133 *     for (int end = boundary.next();
134 *          end != BreakIterator.DONE;
135 *          start = end, end = boundary.next()) {
136 *          System.out.println(source.substring(start,end));
137 *     }
138 * }
139 * </pre>
140 * </blockquote>
141 *
142 * Print each element in reverse order:
143 * <blockquote>
144 * <pre>
145 * public static void printEachBackward(BreakIterator boundary, String source) {
146 *     int end = boundary.last();
147 *     for (int start = boundary.previous();
148 *          start != BreakIterator.DONE;
149 *          end = start, start = boundary.previous()) {
150 *         System.out.println(source.substring(start,end));
151 *     }
152 * }
153 * </pre>
154 * </blockquote>
155 *
156 * Print first element:
157 * <blockquote>
158 * <pre>
159 * public static void printFirst(BreakIterator boundary, String source) {
160 *     int start = boundary.first();
161 *     int end = boundary.next();
162 *     System.out.println(source.substring(start,end));
163 * }
164 * </pre>
165 * </blockquote>
166 *
167 * Print last element:
168 * <blockquote>
169 * <pre>
170 * public static void printLast(BreakIterator boundary, String source) {
171 *     int end = boundary.last();
172 *     int start = boundary.previous();
173 *     System.out.println(source.substring(start,end));
174 * }
175 * </pre>
176 * </blockquote>
177 *
178 * Print the element at a specified position:
179 * <blockquote>
180 * <pre>
181 * public static void printAt(BreakIterator boundary, int pos, String source) {
182 *     int end = boundary.following(pos);
183 *     int start = boundary.previous();
184 *     System.out.println(source.substring(start,end));
185 * }
186 * </pre>
187 * </blockquote>
188 *
189 * Find the next word:
190 * <blockquote>
191 * <pre>
192 * public static int nextWordStartAfter(int pos, String text) {
193 *     BreakIterator wb = BreakIterator.getWordInstance();
194 *     wb.setText(text);
195 *     int last = wb.following(pos);
196 *     int current = wb.next();
197 *     while (current != BreakIterator.DONE) {
198 *         for (int p = last; p < current; p++) {
199 *             if (Character.isLetter(text.codePointAt(p)))
200 *                 return last;
201 *         }
202 *         last = current;
203 *         current = wb.next();
204 *     }
205 *     return BreakIterator.DONE;
206 * }
207 * </pre>
208 * (The iterator returned by BreakIterator.getWordInstance() is unique in that
209 * the break positions it returns don't represent both the start and end of the
210 * thing being iterated over.  That is, a sentence-break iterator returns breaks
211 * that each represent the end of one sentence and the beginning of the next.
212 * With the word-break iterator, the characters between two boundaries might be a
213 * word, or they might be the punctuation or whitespace between two words.  The
214 * above code uses a simple heuristic to determine which boundary is the beginning
215 * of a word: If the characters between this boundary and the next boundary
216 * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
217 * a Hangul syllable, a Kana character, etc.), then the text between this boundary
218 * and the next is a word; otherwise, it's the material between words.)
219 * </blockquote>
220 *
221 * @see CharacterIterator
222 *
223 */
224
225public abstract class BreakIterator implements Cloneable {
226
227    /**
228     * Constructor. BreakIterator is stateless and has no default behavior.
229     */
230    protected BreakIterator() {
231    }
232
233    /**
234     * Create a copy of this iterator
235     *
236     * @return A copy of this
237     */
238    @Override
239    public Object clone() {
240        try {
241            return super.clone();
242        } catch (CloneNotSupportedException e) {
243            throw new AssertionError(e);
244        }
245    }
246
247    /**
248     * DONE is returned by previous(), next(), next(int), preceding(int)
249     * and following(int) when either the first or last text boundary has been
250     * reached.
251     */
252    public static final int DONE = -1;
253
254    /**
255     * Returns the first boundary. The iterator's current position is set
256     * to the first text boundary.
257     *
258     * @return The character index of the first text boundary.
259     */
260    public abstract int first();
261
262    /**
263     * Returns the last boundary. The iterator's current position is set
264     * to the last text boundary.
265     *
266     * @return The character index of the last text boundary.
267     */
268    public abstract int last();
269
270    /**
271     * Returns the nth boundary from the current boundary. If either
272     * the first or last text boundary has been reached, it returns
273     * <code>BreakIterator.DONE</code> and the current position is set to either
274     * the first or last text boundary depending on which one is reached. Otherwise,
275     * the iterator's current position is set to the new boundary.
276     * For example, if the iterator's current position is the mth text boundary
277     * and three more boundaries exist from the current boundary to the last text
278     * boundary, the next(2) call will return m + 2. The new text position is set
279     * to the (m + 2)th text boundary. A next(4) call would return
280     * <code>BreakIterator.DONE</code> and the last text boundary would become the
281     * new text position.
282     *
283     * @param n which boundary to return.  A value of 0
284     *          does nothing.  Negative values move to previous boundaries
285     *          and positive values move to later boundaries.
286     * @return The character index of the nth boundary from the current position
287     * or <code>BreakIterator.DONE</code> if either first or last text boundary
288     * has been reached.
289     */
290    public abstract int next(int n);
291
292    /**
293     * Returns the boundary following the current boundary. If the current boundary
294     * is the last text boundary, it returns <code>BreakIterator.DONE</code> and
295     * the iterator's current position is unchanged. Otherwise, the iterator's
296     * current position is set to the boundary following the current boundary.
297     *
298     * @return The character index of the next text boundary or
299     * <code>BreakIterator.DONE</code> if the current boundary is the last text
300     * boundary.
301     * Equivalent to next(1).
302     * @see #next(int)
303     */
304    public abstract int next();
305
306    /**
307     * Returns the boundary preceding the current boundary. If the current boundary
308     * is the first text boundary, it returns <code>BreakIterator.DONE</code> and
309     * the iterator's current position is unchanged. Otherwise, the iterator's
310     * current position is set to the boundary preceding the current boundary.
311     *
312     * @return The character index of the previous text boundary or
313     * <code>BreakIterator.DONE</code> if the current boundary is the first text
314     * boundary.
315     */
316    public abstract int previous();
317
318    /**
319     * Returns the first boundary following the specified character offset. If the
320     * specified offset equals to the last text boundary, it returns
321     * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
322     * Otherwise, the iterator's current position is set to the returned boundary.
323     * The value returned is always greater than the offset or the value
324     * <code>BreakIterator.DONE</code>.
325     *
326     * @param offset the character offset to begin scanning.
327     * @return The first boundary after the specified offset or
328     * <code>BreakIterator.DONE</code> if the last text boundary is passed in
329     * as the offset.
330     * @throws IllegalArgumentException if the specified offset is less than
331     *                                  the first text boundary or greater than the last text
332     *                                  boundary.
333     */
334    public abstract int following(int offset);
335
336    /**
337     * Returns the last boundary preceding the specified character offset. If the
338     * specified offset equals to the first text boundary, it returns
339     * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
340     * Otherwise, the iterator's current position is set to the returned boundary.
341     * The value returned is always less than the offset or the value
342     * <code>BreakIterator.DONE</code>.
343     * @param offset the characater offset to begin scanning.
344     * @return The last boundary before the specified offset or
345     * <code>BreakIterator.DONE</code> if the first text boundary is passed in
346     * as the offset.
347     * @exception   IllegalArgumentException if the specified offset is less than
348     * the first text boundary or greater than the last text boundary.
349     * @since 1.2
350     */
351    public int preceding(int offset) {
352        // NOTE:  This implementation is here solely because we can't add new
353        // abstract methods to an existing class.  There is almost ALWAYS a
354        // better, faster way to do this.
355        int pos = following(offset);
356        while (pos >= offset && pos != DONE)
357            pos = previous();
358        return pos;
359    }
360
361    /**
362     * Returns true if the specified character offset is a text boundary.
363     * @param offset the character offset to check.
364     * @return <code>true</code> if "offset" is a boundary position,
365     * <code>false</code> otherwise.
366     * @exception   IllegalArgumentException if the specified offset is less than
367     * the first text boundary or greater than the last text boundary.
368     * @since 1.2
369     */
370    public boolean isBoundary(int offset) {
371        // NOTE: This implementation probably is wrong for most situations
372        // because it fails to take into account the possibility that a
373        // CharacterIterator passed to setText() may not have a begin offset
374        // of 0.  But since the abstract BreakIterator doesn't have that
375        // knowledge, it assumes the begin offset is 0.  If you subclass
376        // BreakIterator, copy the SimpleTextBoundary implementation of this
377        // function into your subclass.  [This should have been abstract at
378        // this level, but it's too late to fix that now.]
379        if (offset == 0) {
380            return true;
381        }
382        int boundary = following(offset - 1);
383        if (boundary == DONE) {
384            throw new IllegalArgumentException();
385        }
386        return boundary == offset;
387    }
388
389    /**
390     * Returns character index of the text boundary that was most
391     * recently returned by next(), next(int), previous(), first(), last(),
392     * following(int) or preceding(int). If any of these methods returns
393     * <code>BreakIterator.DONE</code> because either first or last text boundary
394     * has been reached, it returns the first or last text boundary depending on
395     * which one is reached.
396     *
397     * @return The text boundary returned from the above methods, first or last
398     * text boundary.
399     * @see #next()
400     * @see #next(int)
401     * @see #previous()
402     * @see #first()
403     * @see #last()
404     * @see #following(int)
405     * @see #preceding(int)
406     */
407    public abstract int current();
408
409    /**
410     * Get the text being scanned
411     *
412     * @return the text being scanned
413     */
414    public abstract CharacterIterator getText();
415
416    /**
417     * Set a new text string to be scanned.  The current scan
418     * position is reset to first().
419     *
420     * @param newText new text to scan.
421     */
422    public void setText(String newText) {
423        setText(new StringCharacterIterator(newText));
424    }
425
426    /**
427     * Set a new text for scanning.  The current scan
428     * position is reset to first().
429     *
430     * @param newText new text to scan.
431     */
432    public abstract void setText(CharacterIterator newText);
433
434    /**
435     * Returns a new <code>BreakIterator</code> instance
436     * for <a href="#word">word breaks</a>
437     * for the {@linkplain Locale#getDefault() default locale}.
438     *
439     * @return A break iterator for word breaks
440     */
441    public static BreakIterator getWordInstance() {
442        return getWordInstance(Locale.getDefault());
443    }
444
445    /**
446     * Returns a new <code>BreakIterator</code> instance
447     * for <a href="#word">word breaks</a>
448     * for the given locale.
449     *
450     * @param locale the desired locale
451     * @return A break iterator for word breaks
452     * @throws NullPointerException if <code>locale</code> is null
453     */
454    public static BreakIterator getWordInstance(Locale locale) {
455        return new IcuIteratorWrapper(
456                android.icu.text.BreakIterator.getWordInstance(locale));
457    }
458
459    /**
460     * Returns a new <code>BreakIterator</code> instance
461     * for <a href="#line">line breaks</a>
462     * for the {@linkplain Locale#getDefault() default locale}.
463     *
464     * @return A break iterator for line breaks
465     */
466    public static BreakIterator getLineInstance() {
467        return getLineInstance(Locale.getDefault());
468    }
469
470    /**
471     * Returns a new <code>BreakIterator</code> instance
472     * for <a href="#line">line breaks</a>
473     * for the given locale.
474     *
475     * @param locale the desired locale
476     * @return A break iterator for line breaks
477     * @throws NullPointerException if <code>locale</code> is null
478     */
479    public static BreakIterator getLineInstance(Locale locale) {
480        return new IcuIteratorWrapper(
481                android.icu.text.BreakIterator.getLineInstance(locale));
482    }
483
484    /**
485     * Returns a new <code>BreakIterator</code> instance
486     * for <a href="#character">character breaks</a>
487     * for the {@linkplain Locale#getDefault() default locale}.
488     *
489     * @return A break iterator for character breaks
490     */
491    public static BreakIterator getCharacterInstance() {
492        return getCharacterInstance(Locale.getDefault());
493    }
494
495    /**
496     * Returns a new <code>BreakIterator</code> instance
497     * for <a href="#character">character breaks</a>
498     * for the given locale.
499     *
500     * @param locale the desired locale
501     * @return A break iterator for character breaks
502     * @throws NullPointerException if <code>locale</code> is null
503     */
504    public static BreakIterator getCharacterInstance(Locale locale) {
505        return new IcuIteratorWrapper(
506                android.icu.text.BreakIterator.getCharacterInstance(locale));
507    }
508
509    /**
510     * Returns a new <code>BreakIterator</code> instance
511     * for <a href="#sentence">sentence breaks</a>
512     * for the {@linkplain Locale#getDefault() default locale}.
513     *
514     * @return A break iterator for sentence breaks
515     */
516    public static BreakIterator getSentenceInstance() {
517        return getSentenceInstance(Locale.getDefault());
518    }
519
520    /**
521     * Returns a new <code>BreakIterator</code> instance
522     * for <a href="#sentence">sentence breaks</a>
523     * for the given locale.
524     *
525     * @param locale the desired locale
526     * @return A break iterator for sentence breaks
527     * @throws NullPointerException if <code>locale</code> is null
528     */
529    public static BreakIterator getSentenceInstance(Locale locale) {
530        return new IcuIteratorWrapper(
531                android.icu.text.BreakIterator.getSentenceInstance(locale));
532    }
533
534    /**
535     * Returns an array of all locales for which the
536     * <code>get*Instance</code> methods of this class can return
537     * localized instances.
538     * The returned array represents the union of locales supported by the Java
539     * runtime and by installed
540     * {@link java.text.spi.BreakIteratorProvider BreakIteratorProvider} implementations.
541     * It must contain at least a <code>Locale</code>
542     * instance equal to {@link java.util.Locale#US Locale.US}.
543     *
544     * @return An array of locales for which localized
545     * <code>BreakIterator</code> instances are available.
546     */
547    public static synchronized Locale[] getAvailableLocales() {
548        return android.icu.text.BreakIterator.getAvailableLocales();
549    }
550}
551