// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html#License /* ******************************************************************************* * Copyright (C) 2005-2016 International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ package com.ibm.icu.text; import static com.ibm.icu.impl.CharacterIteration.DONE32; import static com.ibm.icu.impl.CharacterIteration.next32; import static com.ibm.icu.impl.CharacterIteration.nextTrail32; import static com.ibm.icu.impl.CharacterIteration.previous32; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.nio.ByteBuffer; import java.text.CharacterIterator; import java.util.ArrayList; import java.util.List; import com.ibm.icu.impl.CharacterIteration; import com.ibm.icu.impl.ICUBinary; import com.ibm.icu.impl.ICUDebug; import com.ibm.icu.impl.Trie2; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; import com.ibm.icu.lang.UScript; /** * Rule Based Break Iterator * This is a port of the C++ class RuleBasedBreakIterator from ICU4C. * * @stable ICU 2.0 */ public class RuleBasedBreakIterator extends BreakIterator { //======================================================================= // Constructors & Factories //======================================================================= /** * private constructor */ private RuleBasedBreakIterator() { fDictionaryCharCount = 0; synchronized(gAllBreakEngines) { fBreakEngines = new ArrayList(gAllBreakEngines); } } /** * Create a break iterator from a precompiled set of break rules. * * Creating a break iterator from the binary rules is much faster than * creating one from source rules. * * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. * Binary break iterator rules are not guaranteed to be compatible between * different versions of ICU. * * @param is an input stream supplying the compiled binary rules. * @throws IOException if there is an error while reading the rules from the InputStream. * @see #compileRules(String, OutputStream) * @stable ICU 4.8 */ public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { RuleBasedBreakIterator This = new RuleBasedBreakIterator(); This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); return This; } /** * Create a break iterator from a precompiled set of break rules. * * Creating a break iterator from the binary rules is much faster than * creating one from source rules. * * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. * Binary break iterator rules are not guaranteed to be compatible between * different versions of ICU. * * @param bytes a buffer supplying the compiled binary rules. * @throws IOException if there is an error while reading the rules from the buffer. * @see #compileRules(String, OutputStream) * @internal * @deprecated This API is ICU internal only. */ @Deprecated public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { RuleBasedBreakIterator This = new RuleBasedBreakIterator(); This.fRData = RBBIDataWrapper.get(bytes); return This; } /** * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. * @param rules The break rules to be used. * @stable ICU 2.2 */ public RuleBasedBreakIterator(String rules) { this(); try { ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); compileRules(rules, ruleOS); fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); } catch (IOException e) { ///CLOVER:OFF // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, // causing bogus compiled rules to be produced, but with no compile error raised. RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: " + e.getMessage()); throw rte; ///CLOVER:ON } } //======================================================================= // Boilerplate //======================================================================= /** * Clones this iterator. * @return A newly-constructed RuleBasedBreakIterator with the same * behavior as this one. * @stable ICU 2.0 */ @Override public Object clone() { RuleBasedBreakIterator result; result = (RuleBasedBreakIterator)super.clone(); if (fText != null) { result.fText = (CharacterIterator)(fText.clone()); } synchronized (gAllBreakEngines) { result.fBreakEngines = new ArrayList(gAllBreakEngines); } result.fLookAheadMatches = new LookAheadResults(); result.fBreakCache = result.new BreakCache(fBreakCache); result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); return result; } /** * Returns true if both BreakIterators are of the same class, have the same * rules, and iterate over the same text. * @stable ICU 2.0 */ @Override public boolean equals(Object that) { if (that == null) { return false; } if (this == that) { return true; } try { RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; if (fRData != other.fRData && (fRData == null || other.fRData == null)) { return false; } if (fRData != null && other.fRData != null && (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { return false; } if (fText == null && other.fText == null) { return true; } if (fText == null || other.fText == null || !fText.equals(other.fText)) { return false; } return fPosition == other.fPosition; } catch(ClassCastException e) { return false; } } /** * Returns the description (rules) used to create this iterator. * (In ICU4C, the same function is RuleBasedBreakIterator::getRules()) * @stable ICU 2.0 */ @Override public String toString() { String retStr = ""; if (fRData != null) { retStr = fRData.fRuleSource; } return retStr; } /** * Compute a hashcode for this BreakIterator * @return A hash code * @stable ICU 2.0 */ @Override public int hashCode() { return fRData.fRuleSource.hashCode(); } private static final int START_STATE = 1; // The state number of the starting state private static final int STOP_STATE = 0; // The state-transition value indicating "stop" // RBBIRunMode - the state machine runs an extra iteration at the beginning and end // of user text. A variable with this enum type keeps track of where we // are. The state machine only fetches user text input while in RUN mode. private static final int RBBI_START = 0; private static final int RBBI_RUN = 1; private static final int RBBI_END = 2; /* * The character iterator through which this BreakIterator accesses the text. */ private CharacterIterator fText = new java.text.StringCharacterIterator(""); /** * The rule data for this BreakIterator instance. Package private. */ RBBIDataWrapper fRData; /** * The iteration state - current position, rule status for the current position, * and whether the iterator ran off the end, yielding UBRK_DONE. * Current position is pinned to be 0 < position <= text.length. * Current position is always set to a boundary. * * The current position of the iterator. Pinned, 0 < fPosition <= text.length. * Never has the value UBRK_DONE (-1). */ private int fPosition; /** * Index of the Rule {tag} values for the most recent match. */ private int fRuleStatusIndex; /** * True when iteration has run off the end, and iterator functions should return UBRK_DONE. */ private boolean fDone; /** * Cache of previously determined boundary positions. */ private BreakCache fBreakCache = new BreakCache(); /** * Counter for the number of characters encountered with the "dictionary" * flag set. Normal RBBI iterators don't use it, although the code * for updating it is live. Dictionary Based break iterators (a subclass * of us) access this field directly. * @internal */ private int fDictionaryCharCount; private DictionaryCache fDictionaryCache = new DictionaryCache(); /* * ICU debug argument name for RBBI */ private static final String RBBI_DEBUG_ARG = "rbbi"; /** * Debugging flag. Trace operation of state machine when true. */ private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG) && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0; /** * What kind of break iterator this is. * Defaulting BreakType to word gives reasonable dictionary behavior for * Break Iterators that are built from rules. */ private int fBreakType = KIND_WORD; /** * The "default" break engine - just skips over ranges of dictionary words, * producing no breaks. Should only be used if characters need to be handled * by a dictionary but we have no dictionary implementation for them. * * Only one instance; shared by all break iterators. */ private static final UnhandledBreakEngine gUnhandledBreakEngine; /** * List of all known break engines, common for all break iterators. * Lazily updated as break engines are needed, because instantiation of * break engines is expensive. * * Because gAllBreakEngines can be referenced concurrently from different * BreakIterator instances, all access is synchronized. */ private static final List gAllBreakEngines; static { gUnhandledBreakEngine = new UnhandledBreakEngine(); gAllBreakEngines = new ArrayList(); gAllBreakEngines.add(gUnhandledBreakEngine); } /** * List of all known break engines. Similar to gAllBreakEngines, but local to a * break iterator, allowing it to be used without synchronization. */ private List fBreakEngines; /** * Dump the contents of the state table and character classes for this break iterator. * For debugging only. * @internal * @deprecated This API is ICU internal only. */ @Deprecated public void dump(java.io.PrintStream out) { if (out == null) { out = System.out; } this.fRData.dump(out); } /** * Compile a set of source break rules into the binary state tables used * by the break iterator engine. Creating a break iterator from precompiled * rules is much faster than creating one from source rules. * * Binary break rules are not guaranteed to be compatible between different * versions of ICU. * * * @param rules The source form of the break rules * @param ruleBinary An output stream to receive the compiled rules. * @throws IOException If there is an error writing the output. * @see #getInstanceFromCompiledRules(InputStream) * @stable ICU 4.8 */ public static void compileRules(String rules, OutputStream ruleBinary) throws IOException { RBBIRuleBuilder.compileRules(rules, ruleBinary); } //======================================================================= // BreakIterator overrides //======================================================================= /** * Sets the current iteration position to the beginning of the text. * (i.e., the CharacterIterator's starting offset). * @return The offset of the beginning of the text. * @stable ICU 2.0 */ @Override public int first() { if (fText == null) { return BreakIterator.DONE; } fText.first(); int start = fText.getIndex(); if (!fBreakCache.seek(start)) { fBreakCache.populateNear(start); } fBreakCache.current(); assert(fPosition == start); return fPosition; } /** * Sets the current iteration position to the end of the text. * (i.e., the CharacterIterator's ending offset). * @return The text's past-the-end offset. * @stable ICU 2.0 */ @Override public int last() { if (fText == null) { return BreakIterator.DONE; } int endPos = fText.getEndIndex(); boolean endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. assert(endShouldBeBoundary); if (fPosition != endPos) { assert(fPosition == endPos); } return endPos; } /** * Advances the iterator either forward or backward the specified number of steps. * Negative values move backward, and positive values move forward. This is * equivalent to repeatedly calling next() or previous(). * @param n The number of steps to move. The sign indicates the direction * (negative is backwards, and positive is forwards). * @return The character offset of the boundary position n boundaries away from * the current one. * @stable ICU 2.0 */ @Override public int next(int n) { int result = 0; if (n > 0) { for (; n > 0 && result != DONE; --n) { result = next(); } } else if (n < 0) { for (; n < 0 && result != DONE; ++n) { result = previous(); } } else { result = current(); } return result; } /** * Advances the iterator to the next boundary position. * @return The position of the first boundary after this one. * @stable ICU 2.0 */ @Override public int next() { fBreakCache.next(); return fDone ? DONE : fPosition; } /** * Moves the iterator backwards, to the boundary preceding the current one. * @return The position of the boundary position immediately preceding the starting position. * @stable ICU 2.0 */ @Override public int previous() { fBreakCache.previous(); return fDone ? DONE : fPosition; } /** * Sets the iterator to refer to the first boundary position following * the specified position. * @param startPos The position from which to begin searching for a break position. * @return The position of the first break after the current position. * @stable ICU 2.0 */ @Override public int following(int startPos) { // if the supplied position is before the beginning, return the // text's starting offset if (startPos < fText.getBeginIndex()) { return first(); } // Move requested offset to a code point start. It might be on a trail surrogate. // Or it may be beyond the end of the text. startPos = CISetIndex32(fText, startPos); fBreakCache.following(startPos); return fDone ? DONE : fPosition; } /** * Sets the iterator to refer to the last boundary position before the * specified position. * @param offset The position to begin searching for a break from. * @return The position of the last boundary before the starting position. * @stable ICU 2.0 */ @Override public int preceding(int offset) { if (fText == null || offset > fText.getEndIndex()) { return last(); } else if (offset < fText.getBeginIndex()) { return first(); } // Move requested offset to a code point start. It might be on a trail surrogate. // int adjustedOffset = CISetIndex32(fText, offset); // TODO: restore to match ICU4C behavior. int adjustedOffset = offset; fBreakCache.preceding(adjustedOffset); return fDone ? DONE : fPosition; } /** * Throw IllegalArgumentException unless begin <= offset < end. * @stable ICU 2.0 */ protected static final void checkOffset(int offset, CharacterIterator text) { if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { throw new IllegalArgumentException("offset out of bounds"); } } /** * Returns true if the specified position is a boundary position. As a side * effect, leaves the iterator pointing to the first boundary position at * or after "offset". * @param offset the offset to check. * @return True if "offset" is a boundary position. * @stable ICU 2.0 */ @Override public boolean isBoundary(int offset) { // TODO: behavior difference with ICU4C, which considers out-of-range offsets // to not be boundaries, and to not be errors. checkOffset(offset, fText); // Adjust offset to be on a code point boundary and not beyond the end of the text. // Note that isBoundary() is always be false for offsets that are not on code point boundaries. // But we still need the side effect of leaving iteration at the following boundary. int adjustedOffset = CISetIndex32(fText, offset); boolean result = false; if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) { result = (fBreakCache.current() == offset); } if (!result) { // Not on a boundary. isBoundary() must leave iterator on the following boundary. // fBreakCache.seek(), above, left us on the preceding boundary, so advance one. next(); } return result; } /** * Returns the current iteration position. Note that UBRK_DONE is never * returned from this function; if iteration has run to the end of a * string, current() will return the length of the string while * next() will return BreakIterator.DONE). * @return The current iteration position. * @stable ICU 2.0 */ @Override public int current() { return (fText != null) ? fPosition : BreakIterator.DONE; } /** * Return the status tag from the break rule that determined the most recently * returned break position. The values appear in the rule source * within brackets, {123}, for example. For rules that do not specify a * status, a default value of 0 is returned. If more than one rule applies, * the numerically largest of the possible status values is returned. *

* Of the standard types of ICU break iterators, only the word and line break * iterator provides status values. The values are defined in * class RuleBasedBreakIterator, and allow distinguishing between words * that contain alphabetic letters, "words" that appear to be numbers, * punctuation and spaces, words containing ideographic characters, and * more. Call getRuleStatus after obtaining a boundary * position from next(), previous(), or * any other break iterator functions that returns a boundary position. *

* @return the status from the break rule that determined the most recently * returned break position. * * @stable ICU 60 */ @Override public int getRuleStatus() { // Status records have this form: // Count N <-- fLastRuleStatusIndex points here. // Status val 0 // Status val 1 // ... // Status val N-1 <-- the value we need to return // The status values are sorted in ascending order. // This function returns the last (largest) of the array of status values. int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex]; int tagVal = fRData.fStatusTable[idx]; return tagVal; } /** * Get the status (tag) values from the break rule(s) that determined the most * recently returned break position. The values appear in the rule source * within brackets, {123}, for example. The default status value for rules * that do not explicitly provide one is zero. *

* The status values used by the standard ICU break rules are defined * as public constants in class RuleBasedBreakIterator. *

* If the size of the output array is insufficient to hold the data, * the output will be truncated to the available length. No exception * will be thrown. * * @param fillInArray an array to be filled in with the status values. * @return The number of rule status values from rules that determined * the most recent boundary returned by the break iterator. * In the event that the array is too small, the return value * is the total number of status values that were available, * not the reduced number that were actually returned. * @stable ICU 60 */ @Override public int getRuleStatusVec(int[] fillInArray) { int numStatusVals = fRData.fStatusTable[fRuleStatusIndex]; if (fillInArray != null) { int numToCopy = Math.min(numStatusVals, fillInArray.length); for (int i=0; i engine. script = UScript.HAN; } LanguageBreakEngine eng; try { switch (script) { case UScript.THAI: eng = new ThaiBreakEngine(); break; case UScript.LAO: eng = new LaoBreakEngine(); break; case UScript.MYANMAR: eng = new BurmeseBreakEngine(); break; case UScript.KHMER: eng = new KhmerBreakEngine(); break; case UScript.HAN: if (getBreakType() == KIND_WORD) { eng = new CjkBreakEngine(false); } else { gUnhandledBreakEngine.handleChar(c, getBreakType()); eng = gUnhandledBreakEngine; } break; case UScript.HANGUL: if (getBreakType() == KIND_WORD) { eng = new CjkBreakEngine(true); } else { gUnhandledBreakEngine.handleChar(c, getBreakType()); eng = gUnhandledBreakEngine; } break; default: gUnhandledBreakEngine.handleChar(c, getBreakType()); eng = gUnhandledBreakEngine; break; } } catch (IOException e) { eng = null; } if (eng != null && eng != gUnhandledBreakEngine) { gAllBreakEngines.add(eng); fBreakEngines.add(eng); } return eng; } // end synchronized(gAllBreakEngines) } private static final int kMaxLookaheads = 8; private static class LookAheadResults { int fUsedSlotLimit; int[] fPositions; int[] fKeys; LookAheadResults() { fUsedSlotLimit= 0; fPositions = new int[kMaxLookaheads]; fKeys = new int[kMaxLookaheads]; } int getPosition(int key) { for (int i=0; i= kMaxLookaheads) { assert(false); i = kMaxLookaheads - 1; } fKeys[i] = key; fPositions[i] = position; assert(fUsedSlotLimit == i); fUsedSlotLimit = i + 1; } void reset() { fUsedSlotLimit = 0; } }; private LookAheadResults fLookAheadMatches = new LookAheadResults(); /** * The State Machine Engine for moving forward is here. * This function is the heart of the RBBI run time engine. * * Input * fPosition, the position in the text to begin from. * Output * fPosition: the boundary following the starting position. * fDictionaryCharCount the number of dictionary characters encountered. * If > 0, the segment will be further subdivided * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. * * @return the new iterator position * * A note on supplementary characters and the position of underlying * Java CharacterIterator: Normally, a character iterator is positioned at * the char most recently returned by next(). Within this function, when * a supplementary char is being processed, the char iterator is left * sitting on the trail surrogate, in the middle of the code point. * This is different from everywhere else, where an iterator always * points at the lead surrogate of a supplementary. */ private int handleNext() { if (TRACE) { System.out.println("Handle Next pos char state category"); } // handleNext always sets the break tag value. // Set the default for it. fRuleStatusIndex = 0; fDictionaryCharCount = 0; // caches for quicker access CharacterIterator text = fText; Trie2 trie = fRData.fTrie; short[] stateTable = fRData.fFTable; int initialPosition = fPosition; text.setIndex(initialPosition); int result = initialPosition; // Set up the starting char int c = text.current(); if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { c = nextTrail32(text, c); if (c == DONE32) { fDone = true; return BreakIterator.DONE; } } // Set the initial state for the state machine int state = START_STATE; int row = fRData.getRowIndex(state); short category = 3; int flagsState = fRData.getStateTableFlags(stateTable); int mode = RBBI_RUN; if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { category = 2; mode = RBBI_START; if (TRACE) { System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); System.out.print(RBBIDataWrapper.intToHexString(c, 10)); System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); } } fLookAheadMatches.reset(); // loop until we reach the end of the text or transition to state 0 while (state != STOP_STATE) { if (c == DONE32) { // Reached end of input string. if (mode == RBBI_END) { // We have already run the loop one last time with the // character set to the pseudo {eof} value. Now it is time // to unconditionally bail out. break; } // Run the loop one last time with the fake end-of-input character category mode = RBBI_END; category = 1; } else if (mode == RBBI_RUN) { // Get the char category. An incoming category of 1 or 2 mens that // we are preset for doing the beginning or end of input, and // that we shouldn't get a category from an actual text input character. // // look up the current character's character category, which tells us // which column in the state table to look at. // category = (short) trie.get(c); // Check the dictionary bit in the character's category. // Counter is only used by dictionary based iterators (subclasses). // Chars that need to be handled by a dictionary have a flag bit set // in their category values. // if ((category & 0x4000) != 0) { fDictionaryCharCount++; // And off the dictionary flag bit. category &= ~0x4000; } if (TRACE) { System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); System.out.print(RBBIDataWrapper.intToHexString(c, 10)); System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); } // Advance to the next character. // If this is a beginning-of-input loop iteration, don't advance. // The next iteration will be processing the first real input character. c = text.next(); if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { c = nextTrail32(text, c); } } else { mode = RBBI_RUN; } // look up a state transition in the state table state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; row = fRData.getRowIndex(state); if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { // Match found, common case result = text.getIndex(); if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { // The iterator has been left in the middle of a surrogate pair. // We want the start of it. result--; } // Remember the break status (tag) values. fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; } int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING]; if (completedRule > 0) { // Lookahead match is completed int lookaheadResult = fLookAheadMatches.getPosition(completedRule); if (lookaheadResult >= 0) { fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; fPosition = lookaheadResult; return lookaheadResult; } } int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; if (rule != 0) { // At the position of a '/' in a look-ahead match. Record it. int pos = text.getIndex(); if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { // The iterator has been left in the middle of a surrogate pair. // We want the beginning of it. pos--; } fLookAheadMatches.setPosition(rule, pos); } } // End of state machine main loop // The state machine is done. Check whether it found a match... // If the iterator failed to advance in the match engine force it ahead by one. // This indicates a defect in the break rules, which should always match // at least one character. if (result == initialPosition) { if (TRACE) { System.out.println("Iterator did not move. Advancing by 1."); } text.setIndex(initialPosition); next32(text); result = text.getIndex(); fRuleStatusIndex = 0; } // Leave the iterator at our result position. // (we may have advanced beyond the last accepting position chasing after // longer matches that never completed.) fPosition = result; if (TRACE) { System.out.println("result = " + result); } return result; } /** * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules. * This locates a "Safe Position" from which the forward break rules * will operate correctly. A Safe Position is not necessarily a boundary itself. * * The logic of this function is very similar to handleNext(), above. * * @param fromPosition the position in the input text to begin the iteration. * @internal */ private int handlePrevious(int fromPosition) { if (fText == null) { return 0; } int state; int category = 0; int mode; int row; int c; int result = 0; int initialPosition = fromPosition; fLookAheadMatches.reset(); short[] stateTable = fRData.fSRTable; CISetIndex32(fText, fromPosition); if (fromPosition == fText.getBeginIndex()) { return BreakIterator.DONE; } // set up the starting char result = initialPosition; c = previous32(fText); // Set up the initial state for the state machine state = START_STATE; row = fRData.getRowIndex(state); category = 3; // TODO: obsolete? from the old start/run mode scheme? mode = RBBI_RUN; if ((fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { category = 2; mode = RBBI_START; } if (TRACE) { System.out.println("Handle Prev pos char state category "); } // loop until we reach the beginning of the text or transition to state 0 // mainLoop: for (;;) { if (c == DONE32) { // Reached end of input string. if (mode == RBBI_END) { // We have already done the {eof} iteration. Now is the time // to unconditionally bail out. break mainLoop; } mode = RBBI_END; category = 1; } if (mode == RBBI_RUN) { // look up the current character's category, which tells us // which column in the state table to look at. // // And off the dictionary flag bit. For reverse iteration it is not used. category = (short) fRData.fTrie.get(c); category &= ~0x4000; } if (TRACE) { System.out.print(" " + fText.getIndex() + " "); if (0x20 <= c && c < 0x7f) { System.out.print(" " + c + " "); } else { System.out.print(" " + Integer.toHexString(c) + " "); } System.out.println(" " + state + " " + category + " "); } // State Transition - move machine to its next state // state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; row = fRData.getRowIndex(state); if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { // Match found, common case, could have lookahead so we move // on to check it result = fText.getIndex(); } int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING]; if (completedRule > 0) { // Lookahead match is completed. int lookaheadResult = fLookAheadMatches.getPosition(completedRule); if (lookaheadResult >= 0) { result = lookaheadResult; break mainLoop; } } int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; if (rule != 0) { // At the position of a '/' in a look-ahead match. Record it. int pos = fText.getIndex(); fLookAheadMatches.setPosition(rule, pos); } if (state == STOP_STATE) { // Normal loop exit is here break mainLoop; } // then move iterator position backwards one character // if (mode == RBBI_RUN) { c = previous32(fText); } else { if (mode == RBBI_START) { mode = RBBI_RUN; } } } // End of the main loop. // The state machine is done. Check whether it found a match... // // If the iterator failed to move in the match engine, force it back by one code point. // (This really indicates a defect in the break rules. They should always match // at least one character.) if (result == initialPosition) { CISetIndex32(fText, initialPosition); previous32(fText); result = fText.getIndex(); } if (TRACE) { System.out.println("Result = " + result); } return result; } /** * Set the index of a CharacterIterator. * Pin the index to the valid range range of BeginIndex <= index <= EndIndex. * If the index points to a trail surrogate of a supplementary character, adjust it * to the start (lead surrogate) index. * * @param ci A CharacterIterator to set * @param index the index to set * @return the resulting index, possibly pinned or adjusted. */ private static int CISetIndex32(CharacterIterator ci, int index) { if (index <= ci.getBeginIndex()) { ci.first(); } else if (index >= ci.getEndIndex()) { ci.setIndex(ci.getEndIndex()); } else if (Character.isLowSurrogate(ci.setIndex(index))) { if (!Character.isHighSurrogate(ci.previous())) { ci.next(); } } return ci.getIndex(); } /* DictionaryCache stores the boundaries obtained from a run of dictionary characters. * Dictionary boundaries are moved first to this cache, then from here * to the main BreakCache, where they may inter-leave with non-dictionary * boundaries. The public BreakIterator API always fetches directly * from the main BreakCache, not from here. * * In common situations, the number of boundaries in a single dictionary run * should be quite small, it will be terminated by punctuation, spaces, * or any other non-dictionary characters. The main BreakCache may end * up with boundaries from multiple dictionary based runs. * * The boundaries are stored in a simple ArrayList (vector), with the * assumption that they will be accessed sequentially. */ class DictionaryCache { void reset() { fPositionInCache = -1; fStart = 0; fLimit = 0; fFirstRuleStatusIndex = 0; fOtherRuleStatusIndex = 0; fBreaks.removeAllElements(); }; boolean following(int fromPos) { if (fromPos >= fLimit || fromPos < fStart) { fPositionInCache = -1; return false; } // Sequential iteration, move from previous boundary to the following int r = 0; if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { ++fPositionInCache; if (fPositionInCache >= fBreaks.size()) { fPositionInCache = -1; return false; } r = fBreaks.elementAt(fPositionInCache); assert(r > fromPos); fBoundary = r; fStatusIndex = fOtherRuleStatusIndex; return true; } // Random indexing. Linear search for the boundary following the given position. for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { r= fBreaks.elementAt(fPositionInCache); if (r > fromPos) { fBoundary = r; fStatusIndex = fOtherRuleStatusIndex; return true; } } // Internal error. fStart <= fromPos < fLimit, but no cached boundary. assert(false); fPositionInCache = -1; return false; }; boolean preceding(int fromPos) { if (fromPos <= fStart || fromPos > fLimit) { fPositionInCache = -1; return false; } if (fromPos == fLimit) { fPositionInCache = fBreaks.size() - 1; if (fPositionInCache >= 0) { assert(fBreaks.elementAt(fPositionInCache) == fromPos); } } int r; if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { --fPositionInCache; r = fBreaks.elementAt(fPositionInCache); assert(r < fromPos); fBoundary = r; fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; return true; } if (fPositionInCache == 0) { fPositionInCache = -1; return false; } for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { r = fBreaks.elementAt(fPositionInCache); if (r < fromPos) { fBoundary = r; fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; return true; } } assert(false); fPositionInCache = -1; return false; }; /** * Populate the cache with the dictionary based boundaries within a region of text. * @param startPos The start position of a range of text * @param endPos The end position of a range of text * @param firstRuleStatus The rule status index that applies to the break at startPos * @param otherRuleStatus The rule status index that applies to boundaries other than startPos * @internal */ void populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus) { if ((endPos - startPos) <= 1) { return; } reset(); fFirstRuleStatusIndex = firstRuleStatus; fOtherRuleStatusIndex = otherRuleStatus; int rangeStart = startPos; int rangeEnd = endPos; int category; int current; int foundBreakCount = 0; // Loop through the text, looking for ranges of dictionary characters. // For each span, find the appropriate break engine, and ask it to find // any breaks within the span. fText.setIndex(rangeStart); int c = CharacterIteration.current32(fText); category = (short)fRData.fTrie.get(c); while(true) { while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) { c = CharacterIteration.next32(fText); // pre-increment category = (short)fRData.fTrie.get(c); } if (current >= rangeEnd) { break; } // We now have a dictionary character. Get the appropriate language object // to deal with it. LanguageBreakEngine lbe = getLanguageBreakEngine(c); // Ask the language object if there are any breaks. It will add them to the cache and // leave the text pointer on the other side of its range, ready to search for the next one. if (lbe != null) { foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreakType, fBreaks); } // Reload the loop variables for the next go-round c = CharacterIteration.current32(fText); category = (short)fRData.fTrie.get(c); } // If we found breaks, ensure that the first and last entries are // the original starting and ending position. And initialize the // cache iteration position to the first entry. // System.out.printf("foundBreakCount = %d%n", foundBreakCount); if (foundBreakCount > 0) { assert(foundBreakCount == fBreaks.size()); if (startPos < fBreaks.elementAt(0)) { // The dictionary did not place a boundary at the start of the segment of text. // Add one now. This should not commonly happen, but it would be easy for interactions // of the rules for dictionary segments and the break engine implementations to // inadvertently cause it. Cover it here, just in case. fBreaks.offer(startPos); } if (endPos > fBreaks.peek()) { fBreaks.push(endPos); } fPositionInCache = 0; // Note: Dictionary matching may extend beyond the original limit. fStart = fBreaks.elementAt(0); fLimit = fBreaks.peek(); } else { // there were no language-based breaks, even though the segment contained // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache // for this range will fail, and the calling code will fall back to the rule based boundaries. } }; DictionaryCache() { fPositionInCache = -1; fBreaks = new DictionaryBreakEngine.DequeI(); } /** * copy constructor. Used by RuleBasedBreakIterator.clone(). * * @param src the source object to be copied. */ DictionaryCache(DictionaryCache src) { try { fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } fPositionInCache = src.fPositionInCache; fStart = src.fStart; fLimit = src.fLimit; fFirstRuleStatusIndex = src.fFirstRuleStatusIndex; fOtherRuleStatusIndex = src.fOtherRuleStatusIndex; fBoundary = src.fBoundary; fStatusIndex = src.fStatusIndex; } // A data structure containing the boundaries themselves. Essentially a vector of raw ints. DictionaryBreakEngine.DequeI fBreaks; int fPositionInCache; // Index in fBreaks of last boundary returned by following() // // or preceding(). Optimizes sequential access. int fStart; // Text position of first boundary in cache. int fLimit; // Last boundary in cache. Which is the limit of the // // text segment being handled by the dictionary. int fFirstRuleStatusIndex; // Rule status info for first boundary. int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. int fBoundary; // Current boundary. Set by preceding(), following(). int fStatusIndex; // Current rule status index. Set by preceding, following(). }; /* * class BreakCache * * Cache of break boundary positions and rule status values. * Break iterator API functions, next(), previous(), etc., will use cached results * when possible, and otherwise cache new results as they are obtained. * * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. * * The cache is implemented as a single circular buffer. */ /* * size of the circular cache buffer. */ class BreakCache { BreakCache() { reset(); }; void reset(int pos, int ruleStatus) { fStartBufIdx = 0; fEndBufIdx = 0; fTextIdx = pos; fBufIdx = 0; fBoundaries[0] = pos; fStatuses[0] = (short)ruleStatus; } void reset() {reset(0, 0); }; void next() { if (fBufIdx == fEndBufIdx) { fDone = !populateFollowing(); fPosition = fTextIdx; fRuleStatusIndex = fStatuses[fBufIdx]; } else { fBufIdx = modChunkSize(fBufIdx + 1); fTextIdx = fPosition = fBoundaries[fBufIdx]; fRuleStatusIndex = fStatuses[fBufIdx]; } }; void previous() { int initialBufIdx = fBufIdx; if (fBufIdx == fStartBufIdx) { // At start of cache. Prepend to it. populatePreceding(); } else { // Cache already holds the next boundary fBufIdx = modChunkSize(fBufIdx - 1); fTextIdx = fBoundaries[fBufIdx]; } fDone = (fBufIdx == initialBufIdx); fPosition = fTextIdx; fRuleStatusIndex = fStatuses[fBufIdx]; return; }; // Move the iteration state to the position following the startPosition. // Input position must be pinned to the input length. void following(int startPos) { if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { // startPos is in the cache. Do a next() from that position. // TODO: an awkward set of interactions with bi->fDone // seek() does not clear it; it can't because of interactions with populateNear(). // next() does not clear it in the fast-path case, where everything matters. Maybe it should. // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. fDone = false; next(); } }; void preceding(int startPos) { if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { if (startPos == fTextIdx) { previous(); } else { // seek() leaves the BreakCache positioned at the preceding boundary // if the requested position is between two bounaries. // current() pushes the BreakCache position out to the BreakIterator itself. assert(startPos > fTextIdx); current(); } } return; }; /* * Update the state of the public BreakIterator (fBI) to reflect the * current state of the break iterator cache (this). */ int current() { fPosition = fTextIdx; fRuleStatusIndex = fStatuses[fBufIdx]; fDone = false; return fTextIdx; }; /** * Add boundaries to the cache near the specified position. * The given position need not be a boundary itself. * The input position must be within the range of the text, and * on a code point boundary. * If the requested position is a break boundary, leave the iteration * position on it. * If the requested position is not a boundary, leave the iteration * position on the preceding boundary and include both the the * preceding and following boundaries in the cache. * Additional boundaries, either preceding or following, may be added * to the cache as a side effect. * * Return false if the operation failed. */ boolean populateNear(int position) { assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); // Find a boundary somewhere in the vicinity of the requested position. // Depending on the safe rules and the text data, it could be either before, at, or after // the requested position. // If the requested position is not near already cached positions, clear the existing cache, // find a near-by boundary and begin new cache contents there. if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { int aBoundary = fText.getBeginIndex(); int ruleStatusIndex = 0; // TODO: check for position == length of text. Although may still need to back up to get rule status. if (position > aBoundary + 20) { int backupPos = handlePrevious(position); fPosition = backupPos; aBoundary = handleNext(); // Ignore dictionary, just finding a rule based boundary. ruleStatusIndex = fRuleStatusIndex; } reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. } // Fill in boundaries between existing cache content and the new requested position. if (fBoundaries[fEndBufIdx] < position) { // The last position in the cache precedes the requested position. // Add following position(s) to the cache. while (fBoundaries[fEndBufIdx] < position) { if (!populateFollowing()) { assert false; return false; } } fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. previous(); } return true; } if (fBoundaries[fStartBufIdx] > position) { // The first position in the cache is beyond the requested position. // back up more until we get a boundary <= the requested position. while (fBoundaries[fStartBufIdx] > position) { populatePreceding(); } fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. next(); } if (fTextIdx > position) { // If position is not itself a boundary, the next() loop above will overshoot. // Back up one, leaving cache position at the boundary preceding the requested position. previous(); } return true; } assert fTextIdx == position; return true; }; /** * Add boundary(s) to the cache following the current last boundary. * Return false if at the end of the text, and no more boundaries can be added. * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. */ boolean populateFollowing() { int fromPosition = fBoundaries[fEndBufIdx]; int fromRuleStatusIdx = fStatuses[fEndBufIdx]; int pos = 0; int ruleStatusIdx = 0; if (fDictionaryCache.following(fromPosition)) { addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); return true; } fPosition = fromPosition; pos = handleNext(); if (pos == BreakIterator.DONE) { return false; } ruleStatusIdx = fRuleStatusIndex; if (fDictionaryCharCount > 0) { // The text segment obtained from the rules includes dictionary characters. // Subdivide it, with subdivided results going into the dictionary cache. fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); if (fDictionaryCache.following(fromPosition)) { addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); return true; // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point. // But be careful with interactions with populateNear(). } } // Rule based segment did not include dictionary characters. // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, // meaning that we didn't take the return, above. // Add its end point to the cache. addFollowing(pos, ruleStatusIdx, UpdateCachePosition); // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. // for (int count=0; count<6; ++count) { pos = handleNext(); if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) { break; } addFollowing(pos, fRuleStatusIndex, RetainCachePosition); } return true; }; /** * Add one or more boundaries to the cache preceding the first currently cached boundary. * Leave the iteration position on the first added boundary. * Return false if no boundaries could be added (if at the start of the text.) */ boolean populatePreceding() { int textBegin = fText.getBeginIndex(); int fromPosition = fBoundaries[fStartBufIdx]; if (fromPosition == textBegin) { return false; } int position = textBegin; int positionStatusIdx = 0; if (fDictionaryCache.preceding(fromPosition)) { addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); return true; } int backupPosition = fromPosition; // Find a boundary somewhere preceding the first already-cached boundary do { backupPosition = backupPosition - 30; if (backupPosition <= textBegin) { backupPosition = textBegin; } else { backupPosition = handlePrevious(backupPosition); } if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) { position = textBegin; positionStatusIdx = 0; } else { fPosition = backupPosition; // TODO: pass starting position in a clearer way. position = handleNext(); positionStatusIdx = fRuleStatusIndex; } } while (position >= fromPosition); // Find boundaries between the one we just located and the first already-cached boundary // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.. fSideBuffer.removeAllElements(); fSideBuffer.push(position); fSideBuffer.push(positionStatusIdx); do { int prevPosition = fPosition = position; int prevStatusIdx = positionStatusIdx; position = handleNext(); positionStatusIdx = fRuleStatusIndex; if (position == BreakIterator.DONE) { break; } boolean segmentHandledByDictionary = false; if (fDictionaryCharCount != 0) { // Segment from the rules includes dictionary characters. // Subdivide it, with subdivided results going into the dictionary cache. int dictSegEndPosition = position; fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); while (fDictionaryCache.following(prevPosition)) { position = fDictionaryCache.fBoundary; positionStatusIdx = fDictionaryCache.fStatusIndex; segmentHandledByDictionary = true; assert(position > prevPosition); if (position >= fromPosition) { break; } assert(position <= dictSegEndPosition); fSideBuffer.push(position); fSideBuffer.push(positionStatusIdx); prevPosition = position; } assert(position==dictSegEndPosition || position>=fromPosition); } if (!segmentHandledByDictionary && position < fromPosition) { fSideBuffer.push(position); fSideBuffer.push(positionStatusIdx); } } while (position < fromPosition); // Move boundaries from the side buffer to the main circular buffer. boolean success = false; if (!fSideBuffer.isEmpty()) { positionStatusIdx = fSideBuffer.pop(); position = fSideBuffer.pop(); addPreceding(position, positionStatusIdx, UpdateCachePosition); success = true; } while (!fSideBuffer.isEmpty()) { positionStatusIdx = fSideBuffer.pop(); position = fSideBuffer.pop(); if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { // No space in circular buffer to hold a new preceding result while // also retaining the current cache (iteration) position. // Bailing out is safe; the cache will refill again if needed. break; } } return success; }; static final boolean RetainCachePosition = false; static final boolean UpdateCachePosition = true; /* * Add the boundary following the current position. * The current position can be left as it was, or changed to the newly added boundary, * as specified by the update parameter. */ void addFollowing(int position, int ruleStatusIdx, boolean update) { assert(position > fBoundaries[fEndBufIdx]); assert(ruleStatusIdx <= Short.MAX_VALUE); int nextIdx = modChunkSize(fEndBufIdx + 1); if (nextIdx == fStartBufIdx) { fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. } fBoundaries[nextIdx] = position; fStatuses[nextIdx] = (short)ruleStatusIdx; fEndBufIdx = nextIdx; if (update == UpdateCachePosition) { // Set current position to the newly added boundary. fBufIdx = nextIdx; fTextIdx = position; } else { // Retaining the original cache position. // Check if the added boundary wraps around the buffer, and would over-write the original position. // It's the responsibility of callers of this function to not add too many. assert(nextIdx != fBufIdx); } }; /* * Add the boundary preceding the current position. * The current position can be left as it was, or changed to the newly added boundary, * as specified by the update parameter. */ boolean addPreceding(int position, int ruleStatusIdx, boolean update) { assert(position < fBoundaries[fStartBufIdx]); assert(ruleStatusIdx <= Short.MAX_VALUE); int nextIdx = modChunkSize(fStartBufIdx - 1); if (nextIdx == fEndBufIdx) { if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { // Failure. The insertion of the new boundary would claim the buffer position that is the // current iteration position. And we also want to retain the current iteration position. // (The buffer is already completely full of entries that precede the iteration position.) return false; } fEndBufIdx = modChunkSize(fEndBufIdx - 1); } fBoundaries[nextIdx] = position; fStatuses[nextIdx] = (short)ruleStatusIdx; fStartBufIdx = nextIdx; if (update == UpdateCachePosition) { fBufIdx = nextIdx; fTextIdx = position; } return true; }; /** * Set the cache position to the specified position, or, if the position * falls between to cached boundaries, to the preceding boundary. * Fails if the requested position is outside of the range of boundaries currently held by the cache. * The startPosition must be on a code point boundary. * * Return true if successful, false if the specified position is after * the last cached boundary or before the first. */ boolean seek(int pos) { if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { return false; } if (pos == fBoundaries[fStartBufIdx]) { // Common case: seek(0), from BreakIterator::first() fBufIdx = fStartBufIdx; fTextIdx = fBoundaries[fBufIdx]; return true; } if (pos == fBoundaries[fEndBufIdx]) { fBufIdx = fEndBufIdx; fTextIdx = fBoundaries[fBufIdx]; return true; } int min = fStartBufIdx; int max = fEndBufIdx; while (min != max) { int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; probe = modChunkSize(probe); if (fBoundaries[probe] > pos) { max = probe; } else { min = modChunkSize(probe + 1); } } assert(fBoundaries[max] > pos); fBufIdx = modChunkSize(max - 1); fTextIdx = fBoundaries[fBufIdx]; assert(fTextIdx <= pos); return true; }; /** * copy constructor, used from RuleBasedBreakIterator.clone(). * * @param src */ BreakCache(BreakCache src) { fStartBufIdx = src.fStartBufIdx; fEndBufIdx = src.fEndBufIdx; fTextIdx = src.fTextIdx; fBufIdx = src.fBufIdx; fBoundaries = src.fBoundaries.clone(); fStatuses = src.fStatuses.clone(); fSideBuffer = new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents. } void dumpCache() { System.out.printf("fTextIdx:%d fBufIdx:%d%n", fTextIdx, fBufIdx); for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) { System.out.printf("%d %d%n", i, fBoundaries[i]); if (i == fEndBufIdx) { break; } } }; private final int modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; static final int CACHE_SIZE = 128; // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); int fStartBufIdx; int fEndBufIdx; // inclusive int fTextIdx; int fBufIdx; int[] fBoundaries = new int[CACHE_SIZE]; short[] fStatuses = new short[CACHE_SIZE]; DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI(); }; }