1/*
2 *******************************************************************************
3 * Copyright (C) 1996-2015, International Business Machines Corporation and    *
4 * others. All Rights Reserved.                                                *
5 *******************************************************************************
6 */
7
8package com.ibm.icu.text;
9
10import java.lang.ref.SoftReference;
11import java.text.CharacterIterator;
12import java.text.StringCharacterIterator;
13import java.util.Locale;
14import java.util.MissingResourceException;
15
16import com.ibm.icu.impl.ICUDebug;
17import com.ibm.icu.util.ICUCloneNotSupportedException;
18import com.ibm.icu.util.ULocale;
19
20/**
21 * {@icuenhanced java.text.BreakIterator}.{@icu _usage_}
22 *
23 * <p>A class that locates boundaries in text.  This class defines a protocol for
24 * objects that break up a piece of natural-language text according to a set
25 * of criteria.  Instances or subclasses of BreakIterator can be provided, for
26 * example, to break a piece of text into words, sentences, or logical characters
27 * according to the conventions of some language or group of languages.
28 *
29 * We provide five built-in types of BreakIterator:
30 * <ul><li>getTitleInstance() returns a BreakIterator that locates boundaries
31 * between title breaks.
32 * <li>getSentenceInstance() returns a BreakIterator that locates boundaries
33 * between sentences.  This is useful for triple-click selection, for example.
34 * <li>getWordInstance() returns a BreakIterator that locates boundaries between
35 * words.  This is useful for double-click selection or "find whole words" searches.
36 * This type of BreakIterator makes sure there is a boundary position at the
37 * beginning and end of each legal word.  (Numbers count as words, too.)  Whitespace
38 * and punctuation are kept separate from real words.
39 * <li>getLineInstance() returns a BreakIterator that locates positions where it is
40 * legal for a text editor to wrap lines.  This is similar to word breaking, but
41 * not the same: punctuation and whitespace are generally kept with words (you don't
42 * want a line to start with whitespace, for example), and some special characters
43 * can force a position to be considered a line-break position or prevent a position
44 * from being a line-break position.
45 * <li>getCharacterInstance() returns a BreakIterator that locates boundaries between
46 * logical characters.  Because of the structure of the Unicode encoding, a logical
47 * character may be stored internally as more than one Unicode code point.  (A with an
48 * umlaut may be stored as an a followed by a separate combining umlaut character,
49 * for example, but the user still thinks of it as one character.)  This iterator allows
50 * various processes (especially text editors) to treat as characters the units of text
51 * that a user would think of as characters, rather than the units of text that the
52 * computer sees as "characters".</ul>
53 * The text boundary positions are found according to the rules
54 * described in Unicode Standard Annex #29, Text Boundaries, and
55 * Unicode Standard Annex #14, Line Breaking Properties.  These
 * are available at http://www.unicode.org/reports/tr29/ and
 * http://www.unicode.org/reports/tr14/, respectively.
58 * <p>
59 * BreakIterator's interface follows an "iterator" model (hence the name), meaning it
60 * has a concept of a "current position" and methods like first(), last(), next(),
61 * and previous() that update the current position.  All BreakIterators uphold the
62 * following invariants:
63 * <ul><li>The beginning and end of the text are always treated as boundary positions.
64 * <li>The current position of the iterator is always a boundary position (random-
65 * access methods move the iterator to the nearest boundary position before or
66 * after the specified position, not _to_ the specified position).
67 * <li>DONE is used as a flag to indicate when iteration has stopped.  DONE is only
68 * returned when the current position is the end of the text and the user calls next(),
69 * or when the current position is the beginning of the text and the user calls
70 * previous().
71 * <li>Break positions are numbered by the positions of the characters that follow
72 * them.  Thus, under normal circumstances, the position before the first character
73 * is 0, the position after the first character is 1, and the position after the
 * last character is equal to the length of the string (1 plus the index of the last character).
75 * <li>The client can change the position of an iterator, or the text it analyzes,
76 * at will, but cannot change the behavior.  If the user wants different behavior, he
77 * must instantiate a new iterator.</ul>
78 *
79 * BreakIterator accesses the text it analyzes through a CharacterIterator, which makes
80 * it possible to use BreakIterator to analyze text in any text-storage vehicle that
81 * provides a CharacterIterator interface.
82 *
83 * <b>Note:</b>  Some types of BreakIterator can take a long time to create, and
84 * instances of BreakIterator are not currently cached by the system.  For
85 * optimal performance, keep instances of BreakIterator around as long as makes
86 * sense.  For example, when word-wrapping a document, don't create and destroy a
87 * new BreakIterator for each line.  Create one break iterator for the whole document
88 * (or whatever stretch of text you're wrapping) and use it to do the whole job of
89 * wrapping the text.
90 *
91  * <P>
92 * <strong>Examples</strong>:<P>
93 * Creating and using text boundaries
94 * <blockquote>
95 * <pre>
96 * public static void main(String args[]) {
97 *      if (args.length == 1) {
98 *          String stringToExamine = args[0];
99 *          //print each word in order
100 *          BreakIterator boundary = BreakIterator.getWordInstance();
101 *          boundary.setText(stringToExamine);
102 *          printEachForward(boundary, stringToExamine);
103 *          //print each sentence in reverse order
104 *          boundary = BreakIterator.getSentenceInstance(Locale.US);
105 *          boundary.setText(stringToExamine);
106 *          printEachBackward(boundary, stringToExamine);
107 *          printFirst(boundary, stringToExamine);
108 *          printLast(boundary, stringToExamine);
109 *      }
110 * }
111 * </pre>
112 * </blockquote>
113 *
114 * Print each element in order
115 * <blockquote>
116 * <pre>
117 * public static void printEachForward(BreakIterator boundary, String source) {
118 *     int start = boundary.first();
119 *     for (int end = boundary.next();
120 *          end != BreakIterator.DONE;
121 *          start = end, end = boundary.next()) {
122 *          System.out.println(source.substring(start,end));
123 *     }
124 * }
125 * </pre>
126 * </blockquote>
127 *
128 * Print each element in reverse order
129 * <blockquote>
130 * <pre>
131 * public static void printEachBackward(BreakIterator boundary, String source) {
132 *     int end = boundary.last();
133 *     for (int start = boundary.previous();
134 *          start != BreakIterator.DONE;
135 *          end = start, start = boundary.previous()) {
136 *         System.out.println(source.substring(start,end));
137 *     }
138 * }
139 * </pre>
140 * </blockquote>
141 *
142 * Print first element
143 * <blockquote>
144 * <pre>
145 * public static void printFirst(BreakIterator boundary, String source) {
146 *     int start = boundary.first();
147 *     int end = boundary.next();
148 *     System.out.println(source.substring(start,end));
149 * }
150 * </pre>
151 * </blockquote>
152 *
153 * Print last element
154 * <blockquote>
155 * <pre>
156 * public static void printLast(BreakIterator boundary, String source) {
157 *     int end = boundary.last();
158 *     int start = boundary.previous();
159 *     System.out.println(source.substring(start,end));
160 * }
161 * </pre>
162 * </blockquote>
163 *
164 * Print the element at a specified position
165 * <blockquote>
166 * <pre>
167 * public static void printAt(BreakIterator boundary, int pos, String source) {
168 *     int end = boundary.following(pos);
169 *     int start = boundary.previous();
170 *     System.out.println(source.substring(start,end));
171 * }
172 * </pre>
173 * </blockquote>
174 *
175 * Find the next word
176 * <blockquote>
177 * <pre>
178 * public static int nextWordStartAfter(int pos, String text) {
179 *     BreakIterator wb = BreakIterator.getWordInstance();
180 *     wb.setText(text);
181 *     int last = wb.following(pos);
182 *     int current = wb.next();
183 *     while (current != BreakIterator.DONE) {
184 *         for (int p = last; p < current; p++) {
185 *             if (Character.isLetter(text.charAt(p)))
186 *                 return last;
187 *         }
188 *         last = current;
189 *         current = wb.next();
190 *     }
191 *     return BreakIterator.DONE;
192 * }
193 * </pre>
194 * (The iterator returned by BreakIterator.getWordInstance() is unique in that
195 * the break positions it returns don't represent both the start and end of the
196 * thing being iterated over.  That is, a sentence-break iterator returns breaks
197 * that each represent the end of one sentence and the beginning of the next.
198 * With the word-break iterator, the characters between two boundaries might be a
199 * word, or they might be the punctuation or whitespace between two words.  The
200 * above code uses a simple heuristic to determine which boundary is the beginning
201 * of a word: If the characters between this boundary and the next boundary
202 * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
203 * a Hangul syllable, a Kana character, etc.), then the text between this boundary
204 * and the next is a word; otherwise, it's the material between words.)
205 * </blockquote>
206 *
207 * @see CharacterIterator
208 * @stable ICU 2.0
209 *
210 */
211
212public abstract class BreakIterator implements Cloneable
213{
214
215    private static final boolean DEBUG = ICUDebug.enabled("breakiterator");
216
217    /**
218     * Default constructor.  There is no state that is carried by this abstract
219     * base class.
220     * @stable ICU 2.0
221     */
222    protected BreakIterator()
223    {
224    }
225
226    /**
227     * Clone method.  Creates another BreakIterator with the same behavior and
228     * current state as this one.
229     * @return The clone.
230     * @stable ICU 2.0
231     */
232    public Object clone()
233    {
234        try {
235            return super.clone();
236        }
237        catch (CloneNotSupportedException e) {
238            ///CLOVER:OFF
239            throw new ICUCloneNotSupportedException(e);
240            ///CLOVER:ON
241        }
242    }
243
244    /**
245     * DONE is returned by previous() and next() after all valid
246     * boundaries have been returned.
247     * @stable ICU 2.0
248     */
249    public static final int DONE = -1;
250
251    /**
252     * Set the iterator to the first boundary position.  This is always the beginning
253     * index of the text this iterator iterates over.  For example, if
254     * the iterator iterates over a whole string, this function will
255     * always return 0.
256     * @return The character offset of the beginning of the stretch of text
257     * being broken.
258     * @stable ICU 2.0
259     */
260    public abstract int first();
261
262    /**
263     * Set the iterator to the last boundary position.  This is always the "past-the-end"
264     * index of the text this iterator iterates over.  For example, if the
265     * iterator iterates over a whole string (call it "text"), this function
266     * will always return text.length().
267     * @return The character offset of the end of the stretch of text
268     * being broken.
269     * @stable ICU 2.0
270     */
271    public abstract int last();
272
273    /**
274     * Move the iterator by the specified number of steps in the text.
275     * A positive number moves the iterator forward; a negative number
276     * moves the iterator backwards. If this causes the iterator
277     * to move off either end of the text, this function returns DONE;
278     * otherwise, this function returns the position of the appropriate
279     * boundary.  Calling this function is equivalent to calling next() or
280     * previous() n times.
281     * @param n The number of boundaries to advance over (if positive, moves
282     * forward; if negative, moves backwards).
283     * @return The position of the boundary n boundaries from the current
284     * iteration position, or DONE if moving n boundaries causes the iterator
285     * to advance off either end of the text.
286     * @stable ICU 2.0
287     */
288    public abstract int next(int n);
289
290    /**
291     * Advances the iterator forward one boundary.  The current iteration
292     * position is updated to point to the next boundary position after the
293     * current position, and this is also the value that is returned.  If
294     * the current position is equal to the value returned by last(), or to
295     * DONE, this function returns DONE and sets the current position to
296     * DONE.
297     * @return The position of the first boundary position following the
298     * iteration position.
299     * @stable ICU 2.0
300     */
301    public abstract int next();
302
303    /**
304     * Move the iterator backward one boundary.  The current iteration
305     * position is updated to point to the last boundary position before
306     * the current position, and this is also the value that is returned.  If
307     * the current position is equal to the value returned by first(), or to
308     * DONE, this function returns DONE and sets the current position to
309     * DONE.
310     * @return The position of the last boundary position preceding the
311     * iteration position.
312     * @stable ICU 2.0
313     */
314    public abstract int previous();
315
316    /**
317     * Sets the iterator's current iteration position to be the first
318     * boundary position following the specified position.  (Whether the
319     * specified position is itself a boundary position or not doesn't
320     * matter-- this function always moves the iteration position to the
321     * first boundary after the specified position.)  If the specified
322     * position is the past-the-end position, returns DONE.
323     * @param offset The character position to start searching from.
324     * @return The position of the first boundary position following
325     * "offset" (whether or not "offset" itself is a boundary position),
326     * or DONE if "offset" is the past-the-end offset.
327     * @stable ICU 2.0
328     */
329    public abstract int following(int offset);
330
331    /**
332     * Sets the iterator's current iteration position to be the last
333     * boundary position preceding the specified position.  (Whether the
334     * specified position is itself a boundary position or not doesn't
335     * matter-- this function always moves the iteration position to the
336     * last boundary before the specified position.)  If the specified
337     * position is the starting position, returns DONE.
338     * @param offset The character position to start searching from.
339     * @return The position of the last boundary position preceding
340     * "offset" (whether of not "offset" itself is a boundary position),
341     * or DONE if "offset" is the starting offset of the iterator.
342     * @stable ICU 2.0
343     */
344    public int preceding(int offset) {
345        // NOTE:  This implementation is here solely because we can't add new
346        // abstract methods to an existing class.  There is almost ALWAYS a
347        // better, faster way to do this.
348        int pos = following(offset);
349        while (pos >= offset && pos != DONE)
350            pos = previous();
351        return pos;
352    }
353
354    /**
355     * Return true if the specified position is a boundary position.  If the
356     * function returns true, the current iteration position is set to the
357     * specified position; if the function returns false, the current
358     * iteration position is set as though following() had been called.
359     * @param offset the offset to check.
360     * @return True if "offset" is a boundary position.
361     * @stable ICU 2.0
362     */
363    public boolean isBoundary(int offset) {
364        // Again, this is the default implementation, which is provided solely because
365        // we couldn't add a new abstract method to an existing class.  The real
366        // implementations will usually need to do a little more work.
367        if (offset == 0) {
368            return true;
369        }
370        else
371            return following(offset - 1) == offset;
372    }
373
374    /**
375     * Return the iterator's current position.
376     * @return The iterator's current position.
377     * @stable ICU 2.0
378     */
379    public abstract int current();
380
381
382    /**
383     * Tag value for "words" that do not fit into any of other categories.
384     * Includes spaces and most punctuation.
385     * @stable ICU 53
386     */
387    public static final int WORD_NONE           = 0;
388
389    /**
390     * Upper bound for tags for uncategorized words.
391     * @stable ICU 53
392     */
393    public static final int WORD_NONE_LIMIT     = 100;
394
395    /**
396     * Tag value for words that appear to be numbers, lower limit.
397     * @stable ICU 53
398     */
399    public static final int WORD_NUMBER         = 100;
400
401    /**
402     * Tag value for words that appear to be numbers, upper limit.
403     * @stable ICU 53
404     */
405    public static final int WORD_NUMBER_LIMIT   = 200;
406
407    /**
408     * Tag value for words that contain letters, excluding
409     * hiragana, katakana or ideographic characters, lower limit.
410     * @stable ICU 53
411     */
412    public static final int WORD_LETTER         = 200;
413
414    /**
415     * Tag value for words containing letters, upper limit
416     * @stable ICU 53
417     */
418    public static final int WORD_LETTER_LIMIT   = 300;
419
420    /**
421     * Tag value for words containing kana characters, lower limit
422     * @stable ICU 53
423     */
424    public static final int WORD_KANA           = 300;
425
426    /**
427     * Tag value for words containing kana characters, upper limit
428     * @stable ICU 53
429     */
430    public static final int WORD_KANA_LIMIT     = 400;
431
432    /**
433     * Tag value for words containing ideographic characters, lower limit
434     * @stable ICU 53
435     */
436    public static final int WORD_IDEO           = 400;
437
438    /**
439     * Tag value for words containing ideographic characters, upper limit
440     * @stable ICU 53
441     */
442    public static final int WORD_IDEO_LIMIT     = 500;
443
444    /**
445     * For RuleBasedBreakIterators, return the status tag from the
446     * break rule that determined the most recently
447     * returned break position.
448     * <p>
449     * For break iterator types that do not support a rule status,
450     * a default value of 0 is returned.
451     * <p>
452     * @return The status from the break rule that determined the most recently
453     *         returned break position.
454     *
455     * @stable ICU 52
456     */
457
458    public int  getRuleStatus() {
459        return 0;
460    }
461
462    /**
463     * For RuleBasedBreakIterators, get the status (tag) values from the break rule(s)
464     * that determined the most recently returned break position.
465     * <p>
466     * For break iterator types that do not support rule status,
467     * no values are returned.
468     * <p>
469     * If the size  of the output array is insufficient to hold the data,
470     *  the output will be truncated to the available length.  No exception
471     *  will be thrown.
472     *
473     * @param fillInArray an array to be filled in with the status values.
474     * @return          The number of rule status values from rules that determined
475     *                  the most recent boundary returned by the break iterator.
476     *                  In the event that the array is too small, the return value
477     *                  is the total number of status values that were available,
478     *                  not the reduced number that were actually returned.
479     * @stable ICU 52
480     */
481    public int getRuleStatusVec(int[] fillInArray) {
482        if (fillInArray != null && fillInArray.length > 0) {
483            fillInArray[0] = 0;
484        }
485        return 1;
486    }
487
488    /**
489     * Returns a CharacterIterator over the text being analyzed.
490     * For at least some subclasses of BreakIterator, this is a reference
491     * to the <b>actual iterator being used</b> by the BreakIterator,
492     * and therefore, this function's return value should be treated as
493     * <tt>const</tt>.  No guarantees are made about the current position
494     * of this iterator when it is returned.  If you need to move that
495     * position to examine the text, clone this function's return value first.
496     * @return A CharacterIterator over the text being analyzed.
497     * @stable ICU 2.0
498     */
499    public abstract CharacterIterator getText();
500
501    /**
502     * Sets the iterator to analyze a new piece of text.  The new
503     * piece of text is passed in as a String, and the current
504     * iteration position is reset to the beginning of the string.
505     * (The old text is dropped.)
506     * @param newText A String containing the text to analyze with
507     * this BreakIterator.
508     * @stable ICU 2.0
509     */
510    public void setText(String newText)
511    {
512        setText(new StringCharacterIterator(newText));
513    }
514
515    /**
516     * Sets the iterator to analyze a new piece of text.  The
517     * BreakIterator is passed a CharacterIterator through which
518     * it will access the text itself.  The current iteration
519     * position is reset to the CharacterIterator's start index.
520     * (The old iterator is dropped.)
521     * @param newText A CharacterIterator referring to the text
522     * to analyze with this BreakIterator (the iterator's current
523     * position is ignored, but its other state is significant).
524     * @stable ICU 2.0
525     */
526    public abstract void setText(CharacterIterator newText);
527
528    /**
529     * {@icu}
530     * @stable ICU 2.4
531     */
532    public static final int KIND_CHARACTER = 0;
533    /**
534     * {@icu}
535     * @stable ICU 2.4
536     */
537    public static final int KIND_WORD = 1;
538    /**
539     * {@icu}
540     * @stable ICU 2.4
541     */
542    public static final int KIND_LINE = 2;
543    /**
544     * {@icu}
545     * @stable ICU 2.4
546     */
547    public static final int KIND_SENTENCE = 3;
548    /**
549     * {@icu}
550     * @stable ICU 2.4
551     */
552    public static final int KIND_TITLE = 4;
553
554    /**
555     * @since ICU 2.8
556     */
557    private static final int KIND_COUNT = 5;
558
559    private static final SoftReference<?>[] iterCache = new SoftReference<?>[5];
560
561    /**
562     * Returns a new instance of BreakIterator that locates word boundaries.
563     * This function assumes that the text being analyzed is in the default
564     * locale's language.
565     * @return An instance of BreakIterator that locates word boundaries.
566     * @stable ICU 2.0
567     */
568    public static BreakIterator getWordInstance()
569    {
570        return getWordInstance(ULocale.getDefault());
571    }
572
573    /**
574     * Returns a new instance of BreakIterator that locates word boundaries.
575     * @param where A locale specifying the language of the text to be
576     * analyzed.
577     * @return An instance of BreakIterator that locates word boundaries.
578     * @throws NullPointerException if <code>where</code> is null.
579     * @stable ICU 2.0
580     */
581    public static BreakIterator getWordInstance(Locale where)
582    {
583        return getBreakInstance(ULocale.forLocale(where), KIND_WORD);
584    }
585
586    /**
587     * {@icu} Returns a new instance of BreakIterator that locates word boundaries.
588     * @param where A locale specifying the language of the text to be
589     * analyzed.
590     * @return An instance of BreakIterator that locates word boundaries.
591     * @throws NullPointerException if <code>where</code> is null.
592     * @stable ICU 3.2
593     */
594    public static BreakIterator getWordInstance(ULocale where)
595    {
596        return getBreakInstance(where, KIND_WORD);
597    }
598
599    /**
600     * Returns a new instance of BreakIterator that locates legal line-
601     * wrapping positions.  This function assumes the text being broken
602     * is in the default locale's language.
603     * @return A new instance of BreakIterator that locates legal
604     * line-wrapping positions.
605     * @stable ICU 2.0
606     */
607    public static BreakIterator getLineInstance()
608    {
609        return getLineInstance(ULocale.getDefault());
610    }
611
612    /**
613     * Returns a new instance of BreakIterator that locates legal line-
614     * wrapping positions.
615     * @param where A Locale specifying the language of the text being broken.
616     * @return A new instance of BreakIterator that locates legal
617     * line-wrapping positions.
618     * @throws NullPointerException if <code>where</code> is null.
619     * @stable ICU 2.0
620     */
621    public static BreakIterator getLineInstance(Locale where)
622    {
623        return getBreakInstance(ULocale.forLocale(where), KIND_LINE);
624    }
625
626    /**
627     * {@icu} Returns a new instance of BreakIterator that locates legal line-
628     * wrapping positions.
629     * @param where A Locale specifying the language of the text being broken.
630     * @return A new instance of BreakIterator that locates legal
631     * line-wrapping positions.
632     * @throws NullPointerException if <code>where</code> is null.
633     * @stable ICU 3.2
634     */
635    public static BreakIterator getLineInstance(ULocale where)
636    {
637        return getBreakInstance(where, KIND_LINE);
638    }
639
640    /**
641     * Returns a new instance of BreakIterator that locates logical-character
642     * boundaries.  This function assumes that the text being analyzed is
643     * in the default locale's language.
644     * @return A new instance of BreakIterator that locates logical-character
645     * boundaries.
646     * @stable ICU 2.0
647     */
648    public static BreakIterator getCharacterInstance()
649    {
650        return getCharacterInstance(ULocale.getDefault());
651    }
652
653    /**
654     * Returns a new instance of BreakIterator that locates logical-character
655     * boundaries.
656     * @param where A Locale specifying the language of the text being analyzed.
657     * @return A new instance of BreakIterator that locates logical-character
658     * boundaries.
659     * @throws NullPointerException if <code>where</code> is null.
660     * @stable ICU 2.0
661     */
662    public static BreakIterator getCharacterInstance(Locale where)
663    {
664        return getBreakInstance(ULocale.forLocale(where), KIND_CHARACTER);
665    }
666
667    /**
668     * {@icu} Returns a new instance of BreakIterator that locates logical-character
669     * boundaries.
670     * @param where A Locale specifying the language of the text being analyzed.
671     * @return A new instance of BreakIterator that locates logical-character
672     * boundaries.
673     * @throws NullPointerException if <code>where</code> is null.
674     * @stable ICU 3.2
675     */
676    public static BreakIterator getCharacterInstance(ULocale where)
677    {
678        return getBreakInstance(where, KIND_CHARACTER);
679    }
680
681    /**
682     * Returns a new instance of BreakIterator that locates sentence boundaries.
683     * This function assumes the text being analyzed is in the default locale's
684     * language.
685     * @return A new instance of BreakIterator that locates sentence boundaries.
686     * @stable ICU 2.0
687     */
688    public static BreakIterator getSentenceInstance()
689    {
690        return getSentenceInstance(ULocale.getDefault());
691    }
692
693    /**
694     * Returns a new instance of BreakIterator that locates sentence boundaries.
695     * @param where A Locale specifying the language of the text being analyzed.
696     * @return A new instance of BreakIterator that locates sentence boundaries.
697     * @throws NullPointerException if <code>where</code> is null.
698     * @stable ICU 2.0
699     */
700    public static BreakIterator getSentenceInstance(Locale where)
701    {
702        return getBreakInstance(ULocale.forLocale(where), KIND_SENTENCE);
703    }
704
705    /**
706     * {@icu} Returns a new instance of BreakIterator that locates sentence boundaries.
707     * @param where A Locale specifying the language of the text being analyzed.
708     * @return A new instance of BreakIterator that locates sentence boundaries.
709     * @throws NullPointerException if <code>where</code> is null.
710     * @stable ICU 3.2
711     */
712    public static BreakIterator getSentenceInstance(ULocale where)
713    {
714        return getBreakInstance(where, KIND_SENTENCE);
715    }
716
717    /**
718     * {@icu} Returns a new instance of BreakIterator that locates title boundaries.
719     * This function assumes the text being analyzed is in the default locale's
720     * language. The iterator returned locates title boundaries as described for
721     * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
722     * please use a word boundary iterator. {@link #getWordInstance}
723     * @return A new instance of BreakIterator that locates title boundaries.
724     * @stable ICU 2.0
725     */
726    public static BreakIterator getTitleInstance()
727    {
728        return getTitleInstance(ULocale.getDefault());
729    }
730
731    /**
732     * {@icu} Returns a new instance of BreakIterator that locates title boundaries.
733     * The iterator returned locates title boundaries as described for
734     * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
735     * please use Word Boundary iterator.{@link #getWordInstance}
736     * @param where A Locale specifying the language of the text being analyzed.
737     * @return A new instance of BreakIterator that locates title boundaries.
738     * @throws NullPointerException if <code>where</code> is null.
739     * @stable ICU 2.0
740     */
741    public static BreakIterator getTitleInstance(Locale where)
742    {
743        return getBreakInstance(ULocale.forLocale(where), KIND_TITLE);
744    }
745
746    /**
747     * {@icu} Returns a new instance of BreakIterator that locates title boundaries.
748     * The iterator returned locates title boundaries as described for
749     * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
750     * please use Word Boundary iterator.{@link #getWordInstance}
751     * @param where A Locale specifying the language of the text being analyzed.
752     * @return A new instance of BreakIterator that locates title boundaries.
753     * @throws NullPointerException if <code>where</code> is null.
754     * @stable ICU 3.2
755s     */
756    public static BreakIterator getTitleInstance(ULocale where)
757    {
758        return getBreakInstance(where, KIND_TITLE);
759    }
760
761    /**
762     * {@icu} Registers a new break iterator of the indicated kind, to use in the given
763     * locale.  Clones of the iterator will be returned if a request for a break iterator
764     * of the given kind matches or falls back to this locale.
765     *
766     * <p>Because ICU may choose to cache BreakIterator objects internally, this must
767     * be called at application startup, prior to any calls to
768     * BreakIterator.getInstance to avoid undefined behavior.
769     *
770     * @param iter the BreakIterator instance to adopt.
771     * @param locale the Locale for which this instance is to be registered
772     * @param kind the type of iterator for which this instance is to be registered
773     * @return a registry key that can be used to unregister this instance
774     * @stable ICU 2.4
775     */
776    public static Object registerInstance(BreakIterator iter, Locale locale, int kind) {
777        return registerInstance(iter, ULocale.forLocale(locale), kind);
778    }
779
780    /**
781     * {@icu} Registers a new break iterator of the indicated kind, to use in the given
782     * locale.  Clones of the iterator will be returned if a request for a break iterator
783     * of the given kind matches or falls back to this locale.
784     *
785     * <p>Because ICU may choose to cache BreakIterator objects internally, this must
786     * be called at application startup, prior to any calls to
787     * BreakIterator.getInstance to avoid undefined behavior.
788     *
789     * @param iter the BreakIterator instance to adopt.
790     * @param locale the Locale for which this instance is to be registered
791     * @param kind the type of iterator for which this instance is to be registered
792     * @return a registry key that can be used to unregister this instance
793     * @stable ICU 3.2
794     */
795    public static Object registerInstance(BreakIterator iter, ULocale locale, int kind) {
796        // If the registered object matches the one in the cache, then
797        // flush the cached object.
798        if (iterCache[kind] != null) {
799            BreakIteratorCache cache = (BreakIteratorCache) iterCache[kind].get();
800            if (cache != null) {
801                if (cache.getLocale().equals(locale)) {
802                    iterCache[kind] = null;
803                }
804            }
805        }
806        return getShim().registerInstance(iter, locale, kind);
807    }
808
809    /**
810     * {@icu} Unregisters a previously-registered BreakIterator using the key returned
811     * from the register call.  Key becomes invalid after this call and should not be used
812     * again.
813     * @param key the registry key returned by a previous call to registerInstance
814     * @return true if the iterator for the key was successfully unregistered
815     * @stable ICU 2.4
816     */
817    public static boolean unregister(Object key) {
818        if (key == null) {
819            throw new IllegalArgumentException("registry key must not be null");
820        }
821        // TODO: we don't do code coverage for the following lines
822        // because in getBreakInstance we always instantiate the shim,
823        // and test execution is such that we always instantiate a
824        // breakiterator before we get to the break iterator tests.
825        // this is for modularization, and we could remove the
826        // dependencies in getBreakInstance by rewriting part of the
827        // LocaleData code, or perhaps by accepting it into the
828        // module.
829        ///CLOVER:OFF
830        if (shim != null) {
831            // Unfortunately, we don't know what is being unregistered
832            // -- what `kind' and what locale -- so we flush all
833            // caches.  This is safe but inefficient if people are
834            // actively registering and unregistering.
835            for (int kind=0; kind<KIND_COUNT; ++kind) {
836                iterCache[kind] = null;
837            }
838            return shim.unregister(key);
839        }
840        return false;
841        ///CLOVER:ON
842    }
843
844    // end of registration
845
846    /**
847     * Returns a particular kind of BreakIterator for a locale.
848     * Avoids writing a switch statement with getXYZInstance(where) calls.
849     * @internal
850     * @deprecated This API is ICU internal only.
851     */
852    @Deprecated
853    public static BreakIterator getBreakInstance(ULocale where, int kind) {
854        if (where == null) {
855            throw new NullPointerException("Specified locale is null");
856        }
857        if (iterCache[kind] != null) {
858            BreakIteratorCache cache = (BreakIteratorCache)iterCache[kind].get();
859            if (cache != null) {
860                if (cache.getLocale().equals(where)) {
861                    return cache.createBreakInstance();
862                }
863            }
864        }
865
866        // sigh, all to avoid linking in ICULocaleData...
867        BreakIterator result = getShim().createBreakIterator(where, kind);
868
869        BreakIteratorCache cache = new BreakIteratorCache(where, result);
870        iterCache[kind] = new SoftReference<BreakIteratorCache>(cache);
871        if (result instanceof RuleBasedBreakIterator) {
872            RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator)result;
873            rbbi.setBreakType(kind);
874        }
875
876        return result;
877    }
878
879
880    /**
881     * Returns a list of locales for which BreakIterators can be used.
882     * @return An array of Locales.  All of the locales in the array can
883     * be used when creating a BreakIterator.
884     * @stable ICU 2.6
885     */
886    public static synchronized Locale[] getAvailableLocales()
887    {
888        // to avoid linking ICULocaleData
889        return getShim().getAvailableLocales();
890    }
891
892    /**
893     * {@icu} Returns a list of locales for which BreakIterators can be used.
894     * @return An array of Locales.  All of the locales in the array can
895     * be used when creating a BreakIterator.
896     * @draft ICU 3.2 (retain)
897     * @provisional This API might change or be removed in a future release.
898     */
899    public static synchronized ULocale[] getAvailableULocales()
900    {
901        // to avoid linking ICULocaleData
902        return getShim().getAvailableULocales();
903    }
904
905    private static final class BreakIteratorCache {
906
907        private BreakIterator iter;
908        private ULocale where;
909
910        BreakIteratorCache(ULocale where, BreakIterator iter) {
911            this.where = where;
912            this.iter = (BreakIterator) iter.clone();
913        }
914
915        ULocale getLocale() {
916            return where;
917        }
918
919        BreakIterator createBreakInstance() {
920            return (BreakIterator) iter.clone();
921        }
922    }
923
924    static abstract class BreakIteratorServiceShim {
925        public abstract Object registerInstance(BreakIterator iter, ULocale l, int k);
926        public abstract boolean unregister(Object key);
927        public abstract Locale[] getAvailableLocales();
928        public abstract ULocale[] getAvailableULocales();
929        public abstract BreakIterator createBreakIterator(ULocale l, int k);
930    }
931
932    private static BreakIteratorServiceShim shim;
933    private static BreakIteratorServiceShim getShim() {
934        // Note: this instantiation is safe on loose-memory-model configurations
935        // despite lack of synchronization, since the shim instance has no state--
936        // it's all in the class init.  The worst problem is we might instantiate
937        // two shim instances, but they'll share the same state so that's ok.
938        if (shim == null) {
939            try {
940                Class<?> cls = Class.forName("com.ibm.icu.text.BreakIteratorFactory");
941                shim = (BreakIteratorServiceShim)cls.newInstance();
942            }
943            catch (MissingResourceException e)
944            {
945                throw e;
946            }
947            catch (Exception e) {
948                ///CLOVER:OFF
949                if(DEBUG){
950                    e.printStackTrace();
951                }
952                throw new RuntimeException(e.getMessage());
953                ///CLOVER:ON
954            }
955        }
956        return shim;
957    }
958
959    // -------- BEGIN ULocale boilerplate --------
960
961    /**
962     * {@icu} Returns the locale that was used to create this object, or null.
     * This may differ from the locale requested at the time of
964     * this object's creation.  For example, if an object is created
965     * for locale <tt>en_US_CALIFORNIA</tt>, the actual data may be
966     * drawn from <tt>en</tt> (the <i>actual</i> locale), and
967     * <tt>en_US</tt> may be the most specific locale that exists (the
968     * <i>valid</i> locale).
969     *
970     * <p>Note: The <i>actual</i> locale is returned correctly, but the <i>valid</i>
971     * locale is not, in most cases.
972     * @param type type of information requested, either {@link
973     * com.ibm.icu.util.ULocale#VALID_LOCALE} or {@link
974     * com.ibm.icu.util.ULocale#ACTUAL_LOCALE}.
975     * @return the information specified by <i>type</i>, or null if
976     * this object was not constructed from locale data.
977     * @see com.ibm.icu.util.ULocale
978     * @see com.ibm.icu.util.ULocale#VALID_LOCALE
979     * @see com.ibm.icu.util.ULocale#ACTUAL_LOCALE
980     * @draft ICU 2.8 (retain)
981     * @provisional This API might change or be removed in a future release.
982     */
983    public final ULocale getLocale(ULocale.Type type) {
984        return type == ULocale.ACTUAL_LOCALE ?
985            this.actualLocale : this.validLocale;
986    }
987
988    /**
989     * Set information about the locales that were used to create this
990     * object.  If the object was not constructed from locale data,
991     * both arguments should be set to null.  Otherwise, neither
992     * should be null.  The actual locale must be at the same level or
993     * less specific than the valid locale.  This method is intended
994     * for use by factories or other entities that create objects of
995     * this class.
996     * @param valid the most specific locale containing any resource
997     * data, or null
998     * @param actual the locale containing data used to construct this
999     * object, or null
1000     * @see com.ibm.icu.util.ULocale
1001     * @see com.ibm.icu.util.ULocale#VALID_LOCALE
1002     * @see com.ibm.icu.util.ULocale#ACTUAL_LOCALE
1003     */
1004    final void setLocale(ULocale valid, ULocale actual) {
1005        // Change the following to an assertion later
1006        if ((valid == null) != (actual == null)) {
1007            ///CLOVER:OFF
1008            throw new IllegalArgumentException();
1009            ///CLOVER:ON
1010        }
1011        // Another check we could do is that the actual locale is at
1012        // the same level or less specific than the valid locale.
1013        this.validLocale = valid;
1014        this.actualLocale = actual;
1015    }
1016
    /**
     * The most specific locale containing any resource data, or null.
     * Assigned by {@link #setLocale} and returned by {@link #getLocale}
     * for any requested type other than
     * {@link com.ibm.icu.util.ULocale#ACTUAL_LOCALE}.
     * @see com.ibm.icu.util.ULocale
     */
    private ULocale validLocale;

    /**
     * The locale containing data used to construct this object, or
     * null.  Assigned by {@link #setLocale} and returned by
     * {@link #getLocale} when
     * {@link com.ibm.icu.util.ULocale#ACTUAL_LOCALE} is requested.
     * @see com.ibm.icu.util.ULocale
     */
    private ULocale actualLocale;
1029
1030    // -------- END ULocale boilerplate --------
1031}
1032