/* ******************************************************************************* * Copyright (C) 1996-2012, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.text; import java.text.CharacterIterator; import java.text.StringCharacterIterator; import java.util.Locale; import com.ibm.icu.util.ULocale; /** * {@icuenhanced java.text.BreakIterator}.{@icu _usage_} * *

A class that locates boundaries in text. This class defines a protocol for * objects that break up a piece of natural-language text according to a set * of criteria. Instances or subclasses of BreakIterator can be provided, for * example, to break a piece of text into words, sentences, or logical characters * according to the conventions of some language or group of languages. * * We provide five built-in types of BreakIterator: *

* * BreakIterator's interface follows an "iterator" model (hence the name), meaning it * has a concept of a "current position" and methods like first(), last(), next(), * and previous() that update the current position. All BreakIterators uphold the * following invariants: * * * BreakIterator accesses the text it analyzes through a CharacterIterator, which makes * it possible to use BreakIterator to analyze text in any text-storage vehicle that * provides a CharacterIterator interface. * * Note: Some types of BreakIterator can take a long time to create, and * instances of BreakIterator are not currently cached by the system. For * optimal performance, keep instances of BreakIterator around as long as makes * sense. For example, when word-wrapping a document, don't create and destroy a * new BreakIterator for each line. Create one break iterator for the whole document * (or whatever stretch of text you're wrapping) and use it to do the whole job of * wrapping the text. * *

* Examples:

* Creating and using text boundaries *

*
 * public static void main(String args[]) {
 *      if (args.length == 1) {
 *          String stringToExamine = args[0];
 *          //print each word in order
 *          BreakIterator boundary = BreakIterator.getWordInstance();
 *          boundary.setText(stringToExamine);
 *          printEachForward(boundary, stringToExamine);
 *          //print each sentence in reverse order
 *          boundary = BreakIterator.getSentenceInstance(Locale.US);
 *          boundary.setText(stringToExamine);
 *          printEachBackward(boundary, stringToExamine);
 *          printFirst(boundary, stringToExamine);
 *          printLast(boundary, stringToExamine);
 *      }
 * }
 * 
*
* * Print each element in order *
*
 * public static void printEachForward(BreakIterator boundary, String source) {
 *     int start = boundary.first();
 *     for (int end = boundary.next();
 *          end != BreakIterator.DONE;
 *          start = end, end = boundary.next()) {
 *          System.out.println(source.substring(start,end));
 *     }
 * }
 * 
*
* * Print each element in reverse order *
*
 * public static void printEachBackward(BreakIterator boundary, String source) {
 *     int end = boundary.last();
 *     for (int start = boundary.previous();
 *          start != BreakIterator.DONE;
 *          end = start, start = boundary.previous()) {
 *         System.out.println(source.substring(start,end));
 *     }
 * }
 * 
*
* * Print first element *
*
 * public static void printFirst(BreakIterator boundary, String source) {
 *     int start = boundary.first();
 *     int end = boundary.next();
 *     System.out.println(source.substring(start,end));
 * }
 * 
*
* * Print last element *
*
 * public static void printLast(BreakIterator boundary, String source) {
 *     int end = boundary.last();
 *     int start = boundary.previous();
 *     System.out.println(source.substring(start,end));
 * }
 * 
*
* * Print the element at a specified position *
*
 * public static void printAt(BreakIterator boundary, int pos, String source) {
 *     int end = boundary.following(pos);
 *     int start = boundary.previous();
 *     System.out.println(source.substring(start,end));
 * }
 * 
*
* * Find the next word *
*
 * public static int nextWordStartAfter(int pos, String text) {
 *     BreakIterator wb = BreakIterator.getWordInstance();
 *     wb.setText(text);
 *     int last = wb.following(pos);
 *     int current = wb.next();
 *     while (current != BreakIterator.DONE) {
 *         for (int p = last; p < current; p++) {
 *             if (Character.isLetter(text.charAt(p)))
 *                 return last;
 *         }
 *         last = current;
 *         current = wb.next();
 *     }
 *     return BreakIterator.DONE;
 * }
 * 
* (The iterator returned by BreakIterator.getWordInstance() is unique in that * the break positions it returns don't represent both the start and end of the * thing being iterated over. That is, a sentence-break iterator returns breaks * that each represent the end of one sentence and the beginning of the next. * With the word-break iterator, the characters between two boundaries might be a * word, or they might be the punctuation or whitespace between two words. The * above code uses a simple heuristic to determine which boundary is the beginning * of a word: If the characters between this boundary and the next boundary * include at least one letter (this can be an alphabetical letter, a CJK ideograph, * a Hangul syllable, a Kana character, etc.), then the text between this boundary * and the next is a word; otherwise, it's the material between words.) *
* * @see CharacterIterator * @stable ICU 2.0 * */ public abstract class BreakIterator implements Cloneable { /** * Default constructor. There is no state that is carried by this abstract * base class. * @stable ICU 2.0 */ protected BreakIterator() { } /** * Clone method. Creates another BreakIterator with the same behavior and * current state as this one. * @return The clone. * @stable ICU 2.0 */ public Object clone() { try { return super.clone(); } catch (CloneNotSupportedException e) { ///CLOVER:OFF throw new IllegalStateException(); ///CLOVER:ON } } /** * DONE is returned by previous() and next() after all valid * boundaries have been returned. * @stable ICU 2.0 */ public static final int DONE = -1; /** * Return the first boundary position. This is always the beginning * index of the text this iterator iterates over. For example, if * the iterator iterates over a whole string, this function will * always return 0. This function also updates the iteration position * to point to the beginning of the text. * @return The character offset of the beginning of the stretch of text * being broken. * @stable ICU 2.0 */ public abstract int first(); /** * Return the last boundary position. This is always the "past-the-end" * index of the text this iterator iterates over. For example, if the * iterator iterates over a whole string (call it "text"), this function * will always return text.length(). This function also updated the * iteration position to point to the end of the text. * @return The character offset of the end of the stretch of text * being broken. * @stable ICU 2.0 */ public abstract int last(); /** * Advances the specified number of steps forward in the text (a negative * number, therefore, advances backwards). If this causes the iterator * to advance off either end of the text, this function returns DONE; * otherwise, this function returns the position of the appropriate * boundary. Calling this function is equivalent to calling next() or * previous() n times. * @param n The number of boundaries to advance over (if positive, moves * forward; if negative, moves backwards). * @return The position of the boundary n boundaries from the current * iteration position, or DONE if moving n boundaries causes the iterator * to advance off either end of the text. * @stable ICU 2.0 */ public abstract int next(int n); /** * Advances the iterator forward one boundary. The current iteration * position is updated to point to the next boundary position after the * current position, and this is also the value that is returned. If * the current position is equal to the value returned by last(), or to * DONE, this function returns DONE and sets the current position to * DONE. * @return The position of the first boundary position following the * iteration position. * @stable ICU 2.0 */ public abstract int next(); /** * Advances the iterator backward one boundary. The current iteration * position is updated to point to the last boundary position before * the current position, and this is also the value that is returned. If * the current position is equal to the value returned by first(), or to * DONE, this function returns DONE and sets the current position to * DONE. * @return The position of the last boundary position preceding the * iteration position. * @stable ICU 2.0 */ public abstract int previous(); /** * Sets the iterator's current iteration position to be the first * boundary position following the specified position. (Whether the * specified position is itself a boundary position or not doesn't * matter-- this function always moves the iteration position to the * first boundary after the specified position.) If the specified * position is the past-the-end position, returns DONE. * @param offset The character position to start searching from. * @return The position of the first boundary position following * "offset" (whether or not "offset" itself is a boundary position), * or DONE if "offset" is the past-the-end offset. * @stable ICU 2.0 */ public abstract int following(int offset); /** * Sets the iterator's current iteration position to be the last * boundary position preceding the specified position. (Whether the * specified position is itself a boundary position or not doesn't * matter-- this function always moves the iteration position to the * last boundary before the specified position.) If the specified * position is the starting position, returns DONE. * @param offset The character position to start searching from. * @return The position of the last boundary position preceding * "offset" (whether of not "offset" itself is a boundary position), * or DONE if "offset" is the starting offset of the iterator. * @stable ICU 2.0 */ public int preceding(int offset) { // NOTE: This implementation is here solely because we can't add new // abstract methods to an existing class. There is almost ALWAYS a // better, faster way to do this. int pos = following(offset); while (pos >= offset && pos != DONE) pos = previous(); return pos; } /** * Return true if the specfied position is a boundary position. If the * function returns true, the current iteration position is set to the * specified position; if the function returns false, the current * iteration position is set as though following() had been called. * @param offset the offset to check. * @return True if "offset" is a boundary position. * @stable ICU 2.0 */ public boolean isBoundary(int offset) { // Again, this is the default implementation, which is provided solely because // we couldn't add a new abstract method to an existing class. The real // implementations will usually need to do a little more work. if (offset == 0) { return true; } else return following(offset - 1) == offset; } /** * Return the iterator's current position. * @return The iterator's current position. * @stable ICU 2.0 */ public abstract int current(); /** * Returns a CharacterIterator over the text being analyzed. * For at least some subclasses of BreakIterator, this is a reference * to the actual iterator being used by the BreakIterator, * and therefore, this function's return value should be treated as * const. No guarantees are made about the current position * of this iterator when it is returned. If you need to move that * position to examine the text, clone this function's return value first. * @return A CharacterIterator over the text being analyzed. * @stable ICU 2.0 */ public abstract CharacterIterator getText(); /** * Sets the iterator to analyze a new piece of text. The new * piece of text is passed in as a String, and the current * iteration position is reset to the beginning of the string. * (The old text is dropped.) * @param newText A String containing the text to analyze with * this BreakIterator. * @stable ICU 2.0 */ public void setText(String newText) { setText(new StringCharacterIterator(newText)); } /** * Sets the iterator to analyze a new piece of text. The * BreakIterator is passed a CharacterIterator through which * it will access the text itself. The current iteration * position is reset to the CharacterIterator's start index. * (The old iterator is dropped.) * @param newText A CharacterIterator referring to the text * to analyze with this BreakIterator (the iterator's current * position is ignored, but its other state is significant). * @stable ICU 2.0 */ public abstract void setText(CharacterIterator newText); /** * {@icu} * @stable ICU 2.4 */ public static final int KIND_CHARACTER = 0; /** * {@icu} * @stable ICU 2.4 */ public static final int KIND_WORD = 1; /** * {@icu} * @stable ICU 2.4 */ public static final int KIND_LINE = 2; /** * {@icu} * @stable ICU 2.4 */ public static final int KIND_SENTENCE = 3; // /** // * {@icu} // * @stable ICU 2.4 // */ // public static final int KIND_TITLE = 4; /** * Returns a new instance of BreakIterator that locates word boundaries. * This function assumes that the text being analyzed is in the default * locale's language. * @return An instance of BreakIterator that locates word boundaries. * @stable ICU 2.0 */ public static BreakIterator getWordInstance() { return getWordInstance(Locale.getDefault()); } /** * Returns a new instance of BreakIterator that locates word boundaries. * @param where A locale specifying the language of the text to be * analyzed. * @return An instance of BreakIterator that locates word boundaries. * @stable ICU 2.0 */ public static BreakIterator getWordInstance(Locale where) { return getBreakInstance(where, KIND_WORD); } /** * {@icu} Returns a new instance of BreakIterator that locates word boundaries. * @param where A locale specifying the language of the text to be * analyzed. * @return An instance of BreakIterator that locates word boundaries. * @stable ICU 3.2 */ public static BreakIterator getWordInstance(ULocale where) { return getBreakInstance(where.toLocale(), KIND_WORD); } /** * Returns a new instance of BreakIterator that locates legal line- * wrapping positions. This function assumes the text being broken * is in the default locale's language. * @return A new instance of BreakIterator that locates legal * line-wrapping positions. * @stable ICU 2.0 */ public static BreakIterator getLineInstance() { return getLineInstance(Locale.getDefault()); } /** * Returns a new instance of BreakIterator that locates legal line- * wrapping positions. * @param where A Locale specifying the language of the text being broken. * @return A new instance of BreakIterator that locates legal * line-wrapping positions. * @stable ICU 2.0 */ public static BreakIterator getLineInstance(Locale where) { return getBreakInstance(where, KIND_LINE); } /** * {@icu} Returns a new instance of BreakIterator that locates legal line- * wrapping positions. * @param where A Locale specifying the language of the text being broken. * @return A new instance of BreakIterator that locates legal * line-wrapping positions. * @stable ICU 3.2 */ public static BreakIterator getLineInstance(ULocale where) { return getBreakInstance(where.toLocale(), KIND_LINE); } /** * Returns a new instance of BreakIterator that locates logical-character * boundaries. This function assumes that the text being analyzed is * in the default locale's language. * @return A new instance of BreakIterator that locates logical-character * boundaries. * @stable ICU 2.0 */ public static BreakIterator getCharacterInstance() { return getCharacterInstance(Locale.getDefault()); } /** * Returns a new instance of BreakIterator that locates logical-character * boundaries. * @param where A Locale specifying the language of the text being analyzed. * @return A new instance of BreakIterator that locates logical-character * boundaries. * @stable ICU 2.0 */ public static BreakIterator getCharacterInstance(Locale where) { return getBreakInstance(where, KIND_CHARACTER); } /** * {@icu} Returns a new instance of BreakIterator that locates logical-character * boundaries. * @param where A Locale specifying the language of the text being analyzed. * @return A new instance of BreakIterator that locates logical-character * boundaries. * @stable ICU 3.2 */ public static BreakIterator getCharacterInstance(ULocale where) { return getBreakInstance(where.toLocale(), KIND_CHARACTER); } /** * Returns a new instance of BreakIterator that locates sentence boundaries. * This function assumes the text being analyzed is in the default locale's * language. * @return A new instance of BreakIterator that locates sentence boundaries. * @stable ICU 2.0 */ public static BreakIterator getSentenceInstance() { return getSentenceInstance(Locale.getDefault()); } /** * Returns a new instance of BreakIterator that locates sentence boundaries. * @param where A Locale specifying the language of the text being analyzed. * @return A new instance of BreakIterator that locates sentence boundaries. * @stable ICU 2.0 */ public static BreakIterator getSentenceInstance(Locale where) { return getBreakInstance(where, KIND_SENTENCE); } /** * {@icu} Returns a new instance of BreakIterator that locates sentence boundaries. * @param where A Locale specifying the language of the text being analyzed. * @return A new instance of BreakIterator that locates sentence boundaries. * @stable ICU 3.2 */ public static BreakIterator getSentenceInstance(ULocale where) { return getBreakInstance(where.toLocale(), KIND_SENTENCE); } // /** // * {@icu} Returns a new instance of BreakIterator that locates title boundaries. // * This function assumes the text being analyzed is in the default locale's // * language. The iterator returned locates title boundaries as described for // * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration, // * please use a word boundary iterator. {@link #getWordInstance} // * @return A new instance of BreakIterator that locates title boundaries. // * @stable ICU 2.0 // */ // public static BreakIterator getTitleInstance() // { // return getTitleInstance(Locale.getDefault()); // } // /** // * {@icu} Returns a new instance of BreakIterator that locates title boundaries. // * The iterator returned locates title boundaries as described for // * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration, // * please use Word Boundary iterator.{@link #getWordInstance} // * @param where A Locale specifying the language of the text being analyzed. // * @return A new instance of BreakIterator that locates title boundaries. // * @stable ICU 2.0 // */ // public static BreakIterator getTitleInstance(Locale where) // { // return getBreakInstance(where, KIND_TITLE); // } // /** // * {@icu} Returns a new instance of BreakIterator that locates title boundaries. // * The iterator returned locates title boundaries as described for // * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration, // * please use Word Boundary iterator.{@link #getWordInstance} // * @param where A Locale specifying the language of the text being analyzed. // * @return A new instance of BreakIterator that locates title boundaries. // * @stable ICU 3.2 //s */ // public static BreakIterator getTitleInstance(ULocale where) // { // return getBreakInstance(where.toLocale(), KIND_TITLE); // } // /** // * {@icu} Registers a new break iterator of the indicated kind, to use in the given // * locale. Clones of the iterator will be returned if a request for a break iterator // * of the given kind matches or falls back to this locale. // * @param iter the BreakIterator instance to adopt. // * @param locale the Locale for which this instance is to be registered // * @param kind the type of iterator for which this instance is to be registered // * @return a registry key that can be used to unregister this instance // * @stable ICU 2.4 // */ // public static Object registerInstance(BreakIterator iter, Locale locale, int kind) { // throw new UnsupportedOperationException("Method not supported by com.ibm.icu.base"); // } // /** // * {@icu} Registers a new break iterator of the indicated kind, to use in the given // * locale. Clones of the iterator will be returned if a request for a break iterator // * of the given kind matches or falls back to this locale. // * @param iter the BreakIterator instance to adopt. // * @param locale the Locale for which this instance is to be registered // * @param kind the type of iterator for which this instance is to be registered // * @return a registry key that can be used to unregister this instance // * @stable ICU 3.2 // */ // public static Object registerInstance(BreakIterator iter, ULocale locale, int kind) { // throw new UnsupportedOperationException("Method not supported by com.ibm.icu.base"); // } // /** // * {@icu} Unregisters a previously-registered BreakIterator using the key returned // * from the register call. Key becomes invalid after this call and should not be used // * again. // * @param key the registry key returned by a previous call to registerInstance // * @return true if the iterator for the key was successfully unregistered // * @stable ICU 2.4 // */ // public static boolean unregister(Object key) { // throw new UnsupportedOperationException("Method not supported by com.ibm.icu.base"); // } // end of registration /** * Returns a particular kind of BreakIterator for a locale. * Avoids writing a switch statement with getXYZInstance(where) calls. * @internal * @deprecated This API is ICU internal only. */ public static BreakIterator getBreakInstance(ULocale where, int kind) { return getBreakInstance(where.toLocale(), KIND_SENTENCE); } private static BreakIterator getBreakInstance(Locale where, int kind) { java.text.BreakIterator br = null; switch(kind) { case KIND_CHARACTER: br = java.text.BreakIterator.getCharacterInstance(where); break; case KIND_WORD: br = java.text.BreakIterator.getWordInstance(where); break; case KIND_LINE: br = java.text.BreakIterator.getLineInstance(where); break; case KIND_SENTENCE: br = java.text.BreakIterator.getSentenceInstance(where); break; // case KIND_TITLE: throw new UnsupportedOperationException("Title break is not supported by com.ibm.icu.base"); } return new BreakIteratorHandle(br); } /** * Returns a list of locales for which BreakIterators can be used. * @return An array of Locales. All of the locales in the array can * be used when creating a BreakIterator. * @stable ICU 2.6 */ public static synchronized Locale[] getAvailableLocales() { return java.text.BreakIterator.getAvailableLocales(); } /** * {@icu} Returns a list of locales for which BreakIterators can be used. * @return An array of Locales. All of the locales in the array can * be used when creating a BreakIterator. * @draft ICU 3.2 (retain) * @provisional This API might change or be removed in a future release. */ public static synchronized ULocale[] getAvailableULocales() { Locale[] locales = java.text.BreakIterator.getAvailableLocales(); ULocale[] ulocales = new ULocale[locales.length]; for (int i = 0; i < locales.length; ++i) { ulocales[i] = ULocale.forLocale(locales[i]); } return ulocales; } // /** // * {@icu} Returns the locale that was used to create this object, or null. // * This may may differ from the locale requested at the time of // * this object's creation. For example, if an object is created // * for locale en_US_CALIFORNIA, the actual data may be // * drawn from en (the actual locale), and // * en_US may be the most specific locale that exists (the // * valid locale). // * // *

Note: The actual locale is returned correctly, but the valid // * locale is not, in most cases. // * @param type type of information requested, either {@link // * com.ibm.icu.util.ULocale#VALID_LOCALE} or {@link // * com.ibm.icu.util.ULocale#ACTUAL_LOCALE}. // * @return the information specified by type, or null if // * this object was not constructed from locale data. // * @see com.ibm.icu.util.ULocale // * @see com.ibm.icu.util.ULocale#VALID_LOCALE // * @see com.ibm.icu.util.ULocale#ACTUAL_LOCALE // * @draft ICU 2.8 (retain) // * @provisional This API might change or be removed in a future release. // */ // public final ULocale getLocale(ULocale.Type type) { // throw new UnsupportedOperationException("Method not supported by com.ibm.icu.base"); // } // forwarding implementation class static final class BreakIteratorHandle extends BreakIterator { /** * @internal */ public final java.text.BreakIterator breakIterator; /** * @internal * @param delegate the BreakIterator to which to delegate */ public BreakIteratorHandle(java.text.BreakIterator delegate) { this.breakIterator = delegate; } public int first() { return breakIterator.first(); } public int last() { return breakIterator.last(); } public int next(int n) { return breakIterator.next(n); } public int next() { return breakIterator.next(); } public int previous() { return breakIterator.previous(); } public int following(int offset) { return breakIterator.following(offset); } public int preceding(int offset) { return breakIterator.preceding(offset); } public boolean isBoundary(int offset) { return breakIterator.isBoundary(offset); } public int current() { return breakIterator.current(); } public CharacterIterator getText() { return breakIterator.getText(); } public void setText(CharacterIterator newText) { breakIterator.setText(newText); } /** * Return a string suitable for debugging. * @return a string suitable for debugging * @stable ICU 3.4.3 */ public String toString() { return breakIterator.toString(); } /** * Return a clone of this BreakIterator. * @return a clone of this BreakIterator * @stable ICU 3.4.3 */ public Object clone() { return new BreakIteratorHandle((java.text.BreakIterator)breakIterator.clone()); } /** * Return true if rhs is a BreakIterator with the same break behavior as this. * @return true if rhs equals this * @stable ICU 3.4.3 */ public boolean equals(Object rhs) { try { return breakIterator.equals(((BreakIteratorHandle)rhs).breakIterator); } catch (Exception e) { return false; } } /** * Return a hashCode. * @return a hashCode * @stable ICU 3.4.3 */ public int hashCode() { return breakIterator.hashCode(); } } }