/* GENERATED SOURCE. DO NOT MODIFY. */ // © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html#License /* ******************************************************************************* * Copyright (C) 2005-2016 International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ package android.icu.text; import static android.icu.impl.CharacterIteration.DONE32; import static android.icu.impl.CharacterIteration.next32; import static android.icu.impl.CharacterIteration.nextTrail32; import static android.icu.impl.CharacterIteration.previous32; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.nio.ByteBuffer; import java.text.CharacterIterator; import java.util.ArrayList; import java.util.List; import android.icu.impl.CharacterIteration; import android.icu.impl.ICUBinary; import android.icu.impl.ICUDebug; import android.icu.impl.Trie2; import android.icu.lang.UCharacter; import android.icu.lang.UProperty; import android.icu.lang.UScript; /** * Rule Based Break Iterator * This is a port of the C++ class RuleBasedBreakIterator from ICU4C. * * @hide Only a subset of ICU is exposed in Android */ public class RuleBasedBreakIterator extends BreakIterator { //======================================================================= // Constructors & Factories //======================================================================= /** * private constructor */ private RuleBasedBreakIterator() { fDictionaryCharCount = 0; synchronized(gAllBreakEngines) { fBreakEngines = new ArrayList(gAllBreakEngines); } } /** * Create a break iterator from a precompiled set of break rules. * * Creating a break iterator from the binary rules is much faster than * creating one from source rules. 
* * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. * Binary break iterator rules are not guaranteed to be compatible between * different versions of ICU. * * @param is an input stream supplying the compiled binary rules. * @throws IOException if there is an error while reading the rules from the InputStream. * @see #compileRules(String, OutputStream) */ public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { RuleBasedBreakIterator This = new RuleBasedBreakIterator(); This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); return This; } /** * Create a break iterator from a precompiled set of break rules. * * Creating a break iterator from the binary rules is much faster than * creating one from source rules. * * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. * Binary break iterator rules are not guaranteed to be compatible between * different versions of ICU. * * @param bytes a buffer supplying the compiled binary rules. * @throws IOException if there is an error while reading the rules from the buffer. * @see #compileRules(String, OutputStream) * @deprecated This API is ICU internal only. * @hide draft / provisional / internal are hidden on Android */ @Deprecated public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { RuleBasedBreakIterator This = new RuleBasedBreakIterator(); This.fRData = RBBIDataWrapper.get(bytes); return This; } /** * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. * @param rules The break rules to be used. 
*/ public RuleBasedBreakIterator(String rules) { this(); try { ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); compileRules(rules, ruleOS); fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); } catch (IOException e) { ///CLOVER:OFF // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, // causing bogus compiled rules to be produced, but with no compile error raised. RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: " + e.getMessage()); throw rte; ///CLOVER:ON } } //======================================================================= // Boilerplate //======================================================================= /** * Clones this iterator. * @return A newly-constructed RuleBasedBreakIterator with the same * behavior as this one. */ @Override public Object clone() { RuleBasedBreakIterator result; result = (RuleBasedBreakIterator)super.clone(); if (fText != null) { result.fText = (CharacterIterator)(fText.clone()); } synchronized (gAllBreakEngines) { result.fBreakEngines = new ArrayList(gAllBreakEngines); } result.fLookAheadMatches = new LookAheadResults(); result.fBreakCache = result.new BreakCache(fBreakCache); result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); return result; } /** * Returns true if both BreakIterators are of the same class, have the same * rules, and iterate over the same text. 
*/ @Override public boolean equals(Object that) { if (that == null) { return false; } if (this == that) { return true; } try { RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; if (fRData != other.fRData && (fRData == null || other.fRData == null)) { return false; } if (fRData != null && other.fRData != null && (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { return false; } if (fText == null && other.fText == null) { return true; } if (fText == null || other.fText == null || !fText.equals(other.fText)) { return false; } return fPosition == other.fPosition; } catch(ClassCastException e) { return false; } } /** * Returns the description (rules) used to create this iterator. * (In ICU4C, the same function is RuleBasedBreakIterator::getRules()) */ @Override public String toString() { String retStr = ""; if (fRData != null) { retStr = fRData.fRuleSource; } return retStr; } /** * Compute a hashcode for this BreakIterator * @return A hash code */ @Override public int hashCode() { return fRData.fRuleSource.hashCode(); } private static final int START_STATE = 1; // The state number of the starting state private static final int STOP_STATE = 0; // The state-transition value indicating "stop" // RBBIRunMode - the state machine runs an extra iteration at the beginning and end // of user text. A variable with this enum type keeps track of where we // are. The state machine only fetches user text input while in RUN mode. private static final int RBBI_START = 0; private static final int RBBI_RUN = 1; private static final int RBBI_END = 2; /* * The character iterator through which this BreakIterator accesses the text. */ private CharacterIterator fText = new java.text.StringCharacterIterator(""); /** * The rule data for this BreakIterator instance. Package private. */ RBBIDataWrapper fRData; /** * The iteration state - current position, rule status for the current position, * and whether the iterator ran off the end, yielding UBRK_DONE. 
* Current position is pinned to be 0 < position <= text.length.
     * Current position is always set to a boundary.
     *
     * The current position of the iterator. Pinned, 0 < fPosition <= text.length.
     * Never has the value UBRK_DONE (-1).
     *
     * NOTE(review): the lower bound is written as strict ("0 <"), but first() asserts that
     * fPosition equals the text's starting index, so the bound is presumably inclusive -
     * confirm and correct the wording.
     */
    private int fPosition;

    /**
     * Index of the Rule {tag} values for the most recent match.
     * Used as an index into fRData.fStatusTable by getRuleStatus().
     */
    private int fRuleStatusIndex;

    /**
     * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
     */
    private boolean fDone;

    /**
     * Cache of previously determined boundary positions.
     */
    private BreakCache fBreakCache = new BreakCache();

    /**
     * Counter for the number of characters encountered with the "dictionary"
     * flag set. Normal RBBI iterators don't use it, although the code
     * for updating it is live. Dictionary Based break iterators (a subclass
     * of us) access this field directly.
     *
     * NOTE(review): the field is declared private, so a subclass cannot access it
     * directly; the sentence above looks stale - confirm.
     * @hide draft / provisional / internal are hidden on Android
     */
    private int fDictionaryCharCount;

    // Cache used for dictionary-based breaking.
    // NOTE(review): its exact contents are not visible from this part of the file.
    private DictionaryCache fDictionaryCache = new DictionaryCache();

    /*
     * ICU debug argument name for RBBI
     */
    private static final String RBBI_DEBUG_ARG = "rbbi";

    /**
     * Debugging flag. Trace operation of state machine when true.
     * True only when the ICUDebug argument named by RBBI_DEBUG_ARG is enabled and
     * its value contains the substring "trace".
     */
    private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
            && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;

    /**
     * What kind of break iterator this is.
     * Defaulting BreakType to word gives reasonable dictionary behavior for
     * Break Iterators that are built from rules.
     */
    private int fBreakType = KIND_WORD;

    /**
     * The "default" break engine - just skips over ranges of dictionary words,
     * producing no breaks. Should only be used if characters need to be handled
     * by a dictionary but we have no dictionary implementation for them.
     *
     * Only one instance; shared by all break iterators.
     * Assigned once, in the class's static initializer.
     */
    private static final UnhandledBreakEngine gUnhandledBreakEngine;

    /**
     * List of all known break engines, common for all break iterators.
* Lazily updated as break engines are needed, because instantiation of * break engines is expensive. * * Because gAllBreakEngines can be referenced concurrently from different * BreakIterator instances, all access is synchronized. */ private static final List gAllBreakEngines; static { gUnhandledBreakEngine = new UnhandledBreakEngine(); gAllBreakEngines = new ArrayList(); gAllBreakEngines.add(gUnhandledBreakEngine); } /** * List of all known break engines. Similar to gAllBreakEngines, but local to a * break iterator, allowing it to be used without synchronization. */ private List fBreakEngines; /** * Dump the contents of the state table and character classes for this break iterator. * For debugging only. * @deprecated This API is ICU internal only. * @hide draft / provisional / internal are hidden on Android */ @Deprecated public void dump(java.io.PrintStream out) { if (out == null) { out = System.out; } this.fRData.dump(out); } /** * Compile a set of source break rules into the binary state tables used * by the break iterator engine. Creating a break iterator from precompiled * rules is much faster than creating one from source rules. * * Binary break rules are not guaranteed to be compatible between different * versions of ICU. * * * @param rules The source form of the break rules * @param ruleBinary An output stream to receive the compiled rules. * @throws IOException If there is an error writing the output. * @see #getInstanceFromCompiledRules(InputStream) */ public static void compileRules(String rules, OutputStream ruleBinary) throws IOException { RBBIRuleBuilder.compileRules(rules, ruleBinary); } //======================================================================= // BreakIterator overrides //======================================================================= /** * Sets the current iteration position to the beginning of the text. * (i.e., the CharacterIterator's starting offset). * @return The offset of the beginning of the text. 
*/ @Override public int first() { if (fText == null) { return BreakIterator.DONE; } fText.first(); int start = fText.getIndex(); if (!fBreakCache.seek(start)) { fBreakCache.populateNear(start); } fBreakCache.current(); assert(fPosition == start); return fPosition; } /** * Sets the current iteration position to the end of the text. * (i.e., the CharacterIterator's ending offset). * @return The text's past-the-end offset. */ @Override public int last() { if (fText == null) { return BreakIterator.DONE; } int endPos = fText.getEndIndex(); boolean endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. assert(endShouldBeBoundary); if (fPosition != endPos) { assert(fPosition == endPos); } return endPos; } /** * Advances the iterator either forward or backward the specified number of steps. * Negative values move backward, and positive values move forward. This is * equivalent to repeatedly calling next() or previous(). * @param n The number of steps to move. The sign indicates the direction * (negative is backwards, and positive is forwards). * @return The character offset of the boundary position n boundaries away from * the current one. */ @Override public int next(int n) { int result = 0; if (n > 0) { for (; n > 0 && result != DONE; --n) { result = next(); } } else if (n < 0) { for (; n < 0 && result != DONE; ++n) { result = previous(); } } else { result = current(); } return result; } /** * Advances the iterator to the next boundary position. * @return The position of the first boundary after this one. */ @Override public int next() { fBreakCache.next(); return fDone ? DONE : fPosition; } /** * Moves the iterator backwards, to the boundary preceding the current one. * @return The position of the boundary position immediately preceding the starting position. */ @Override public int previous() { fBreakCache.previous(); return fDone ? 
DONE : fPosition; } /** * Sets the iterator to refer to the first boundary position following * the specified position. * @param startPos The position from which to begin searching for a break position. * @return The position of the first break after the current position. */ @Override public int following(int startPos) { // if the supplied position is before the beginning, return the // text's starting offset if (startPos < fText.getBeginIndex()) { return first(); } // Move requested offset to a code point start. It might be on a trail surrogate. // Or it may be beyond the end of the text. startPos = CISetIndex32(fText, startPos); fBreakCache.following(startPos); return fDone ? DONE : fPosition; } /** * Sets the iterator to refer to the last boundary position before the * specified position. * @param offset The position to begin searching for a break from. * @return The position of the last boundary before the starting position. */ @Override public int preceding(int offset) { if (fText == null || offset > fText.getEndIndex()) { return last(); } else if (offset < fText.getBeginIndex()) { return first(); } // Move requested offset to a code point start. It might be on a trail surrogate. // int adjustedOffset = CISetIndex32(fText, offset); // TODO: restore to match ICU4C behavior. int adjustedOffset = offset; fBreakCache.preceding(adjustedOffset); return fDone ? DONE : fPosition; } /** * Throw IllegalArgumentException unless begin <= offset < end. */ protected static final void checkOffset(int offset, CharacterIterator text) { if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { throw new IllegalArgumentException("offset out of bounds"); } } /** * Returns true if the specified position is a boundary position. As a side * effect, leaves the iterator pointing to the first boundary position at * or after "offset". * @param offset the offset to check. * @return True if "offset" is a boundary position. 
*/ @Override public boolean isBoundary(int offset) { // TODO: behavior difference with ICU4C, which considers out-of-range offsets // to not be boundaries, and to not be errors. checkOffset(offset, fText); // Adjust offset to be on a code point boundary and not beyond the end of the text. // Note that isBoundary() is always be false for offsets that are not on code point boundaries. // But we still need the side effect of leaving iteration at the following boundary. int adjustedOffset = CISetIndex32(fText, offset); boolean result = false; if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) { result = (fBreakCache.current() == offset); } if (!result) { // Not on a boundary. isBoundary() must leave iterator on the following boundary. // fBreakCache.seek(), above, left us on the preceding boundary, so advance one. next(); } return result; } /** * Returns the current iteration position. Note that UBRK_DONE is never * returned from this function; if iteration has run to the end of a * string, current() will return the length of the string while * next() will return BreakIterator.DONE). * @return The current iteration position. */ @Override public int current() { return (fText != null) ? fPosition : BreakIterator.DONE; } /** * Return the status tag from the break rule that determined the most recently * returned break position. The values appear in the rule source * within brackets, {123}, for example. For rules that do not specify a * status, a default value of 0 is returned. If more than one rule applies, * the numerically largest of the possible status values is returned. *

* Of the standard types of ICU break iterators, only the word and line break
     * iterators provide status values.  The values are defined in
     * class RuleBasedBreakIterator, and allow distinguishing between words
     * that contain alphabetic letters, "words" that appear to be numbers,
     * punctuation and spaces, words containing ideographic characters, and
     * more.  Call getRuleStatus after obtaining a boundary
     * position from next(), previous(), or
     * any other break iterator function that returns a boundary position.
     *

* @return the status from the break rule that determined the most recently * returned break position. */ @Override public int getRuleStatus() { // Status records have this form: // Count N <-- fLastRuleStatusIndex points here. // Status val 0 // Status val 1 // ... // Status val N-1 <-- the value we need to return // The status values are sorted in ascending order. // This function returns the last (largest) of the array of status values. int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex]; int tagVal = fRData.fStatusTable[idx]; return tagVal; } /** * Get the status (tag) values from the break rule(s) that determined the most * recently returned break position. The values appear in the rule source * within brackets, {123}, for example. The default status value for rules * that do not explicitly provide one is zero. *

* The status values used by the standard ICU break rules are defined * as public constants in class RuleBasedBreakIterator. *

* If the size of the output array is insufficient to hold the data, * the output will be truncated to the available length. No exception * will be thrown. * * @param fillInArray an array to be filled in with the status values. * @return The number of rule status values from rules that determined * the most recent boundary returned by the break iterator. * In the event that the array is too small, the return value * is the total number of status values that were available, * not the reduced number that were actually returned. */ @Override public int getRuleStatusVec(int[] fillInArray) { int numStatusVals = fRData.fStatusTable[fRuleStatusIndex]; if (fillInArray != null) { int numToCopy = Math.min(numStatusVals, fillInArray.length); for (int i=0; i