12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others. 22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 59e281ba4837cba4a1cf9523d6f8b0621b150063dScott Russell * Copyright (C) 2005-2016 International Business Machines Corporation and 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * others. All Rights Reserved. 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.text; 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport static com.ibm.icu.impl.CharacterIteration.DONE32; 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport static com.ibm.icu.impl.CharacterIteration.next32; 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport static com.ibm.icu.impl.CharacterIteration.nextTrail32; 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport static com.ibm.icu.impl.CharacterIteration.previous32; 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.io.ByteArrayOutputStream; 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.io.IOException; 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.io.InputStream; 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.io.OutputStream; 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport 
java.nio.ByteBuffer; 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.text.CharacterIterator; 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.concurrent.ConcurrentHashMap; 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Assert; 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.CharTrie; 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.CharacterIteration; 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.ICUBinary; 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.ICUDebug; 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.lang.UCharacter; 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.lang.UProperty; 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.lang.UScript; 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 3587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert * Rule Based Break Iterator 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This is a port of the C++ class RuleBasedBreakIterator from ICU4C. 
3787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert * 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic class RuleBasedBreakIterator extends BreakIterator { 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //======================================================================= 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Constructors & Factories 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //======================================================================= 4487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 4587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert /** 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * private constructor 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private RuleBasedBreakIterator() { 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastStatusIndexValid = true; 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fDictionaryCharCount = 0; 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fBreakEngines.put(-1, fUnhandledBreakEngine); 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Create a break iterator from a precompiled set of break rules. 5687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert * 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Creating a break iterator from the binary rules is much faster than 5887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert * creating one from source rules. 
5987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert * 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Binary break iterator rules are not guaranteed to be compatible between 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * different versions of ICU. 6387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert * 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param is an input stream supplying the compiled binary rules. 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @throws IOException if there is an error while reading the rules from the InputStream. 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #compileRules(String, OutputStream) 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 4.8 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 71aacdd6f022693689b3bf76f70670711f3254a441Fredrik Roubert This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); 7287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert return This; 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Create a break iterator from a precompiled set of break rules. 
777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Creating a break iterator from the binary rules is much faster than 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * creating one from source rules. 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Binary break iterator rules are not guaranteed to be compatible between 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * different versions of ICU. 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param bytes a buffer supplying the compiled binary rules. 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @throws IOException if there is an error while reading the rules from the buffer. 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @see #compileRules(String, OutputStream) 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @internal 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @deprecated This API is ICU internal only. 
907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Deprecated 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert This.fRData = RBBIDataWrapper.get(bytes); 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return This; 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param rules The break rules to be used. 
1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.2 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public RuleBasedBreakIterator(String rules) { 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this(); 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert try { 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compileRules(rules, ruleOS); 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } catch (IOException e) { 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ///CLOVER:OFF 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // causing bogus compiled rules to be produced, but with no compile error raised. 
1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: " 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert + e.getMessage()); 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw rte; 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ///CLOVER:ON 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //======================================================================= 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Boilerplate 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //======================================================================= 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Clones this iterator. 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return A newly-constructed RuleBasedBreakIterator with the same 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * behavior as this one. 
1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1302d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public Object clone() 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert { 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone(); 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fText != null) { 13587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert result.fText = (CharacterIterator)(fText.clone()); 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns true if both BreakIterators are of the same class, have the same 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * rules, and iterate over the same text. 
1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1452d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean equals(Object that) { 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (that == null) { 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (this == that) { 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert try { 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fRData != other.fRData && (fRData == null || other.fRData == null)) { 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (fRData != null && other.fRData != null && 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fText == null && other.fText == null) { 16387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert return true; 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fText == null || other.fText == null) { 16687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert return false; 
1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return fText.equals(other.fText); 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert catch(ClassCastException e) { 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the description (rules) used to create this iterator. 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (In ICU4C, the same function is RuleBasedBreakIterator::getRules()) 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1802d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public String toString() { 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String retStr = ""; 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fRData != null) { 1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert retStr = fRData.fRuleSource; 1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return retStr; 1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compute a hashcode for this BreakIterator 1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return A hash code 1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 
1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1942d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int hashCode() 1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert { 19787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert return fRData.fRuleSource.hashCode(); 1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int START_STATE = 1; // The state number of the starting state 2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int STOP_STATE = 0; // The state-transition value indicating "stop" 20387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // of user text. A variable with this enum type keeps track of where we 2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // are. The state machine only fetches user text input while in RUN mode. 2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int RBBI_START = 0; 2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int RBBI_RUN = 1; 2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int RBBI_END = 2; 2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The character iterator through which this BreakIterator accesses the text. 
*/
    private CharacterIterator fText = new java.text.StringCharacterIterator("");

    /**
     * The rule data for this BreakIterator instance.  Package private.
     */
    RBBIDataWrapper fRData;

    /*
     * Index of the Rule {tag} values for the most recent match.
     */
    private int fLastRuleStatusIndex;

    /*
     * Rule tag value valid flag.
     * Some iterator operations don't intrinsically set the correct tag value.
     * This flag lets us lazily compute the value if we are ever asked for it.
     */
    private boolean fLastStatusIndexValid;

    /**
     * Counter for the number of characters encountered with the "dictionary"
     *   flag set.  Normal RBBI iterators don't use it, although the code
     *   for updating it is live.  Dictionary Based break iterators (a subclass
     *   of us) access this field directly.
     * NOTE(review): the field is declared private, so a subclass cannot read it
     *   directly; the sentence above looks stale - confirm and update.
     * @internal
     */
    private int fDictionaryCharCount;

    /*
     * ICU debug argument name for RBBI
     */
    private static final String RBBI_DEBUG_ARG = "rbbi";

    /**
     * Debugging flag.  Trace operation of state machine when true.
     * True only when the "rbbi" debug argument is enabled and its value
     * contains the substring "trace".
     */
    private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
            && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;

    /**
     * What kind of break iterator this is.  Set to KIND_LINE by default,
     * since this produces sensible output.
     */
    private int fBreakType = KIND_LINE;

    /**
     * The "default" break engine - just skips over ranges of dictionary words,
     * producing no breaks. Should only be used if characters need to be handled
     * by a dictionary but we have no dictionary implementation for them.
     */
    private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine();

    /**
     * when a range of characters is divided up using the dictionary, the break
     * positions that are discovered are stored here, preventing us from having
     * to use either the dictionary or the state table again until the iterator
     * leaves this range of text
     */
    private int[] fCachedBreakPositions;

    /**
     * if fCachedBreakPositions is not null, this indicates which item in the
     * cache the current iteration
position refers to 2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int fPositionInCache; 2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 28087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 28187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert private final ConcurrentHashMap<Integer, LanguageBreakEngine> fBreakEngines = 2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert new ConcurrentHashMap<Integer, LanguageBreakEngine>(); 2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Dumps caches and performs other actions associated with a complete change 2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * in text or iteration position. 2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void reset() { 2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fCachedBreakPositions = null; 2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // fNumCachedBreakPositions = 0; 2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fDictionaryCharCount = 0; 2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fPositionInCache = 0; 2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Dump the contents of the state table and character classes for this break iterator. 2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * For debugging only. 2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @internal 2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @deprecated This API is ICU internal only. 
2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Deprecated 3012d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert public void dump(java.io.PrintStream out) { 3022d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert if (out == null) { 3032d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert out = System.out; 3042d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert } 3052d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert this.fRData.dump(out); 3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compile a set of source break rules into the binary state tables used 3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * by the break iterator engine. Creating a break iterator from precompiled 3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * rules is much faster than creating one from source rules. 31287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert * 3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Binary break rules are not guaranteed to be compatible between different 3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * versions of ICU. 31587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert * 31687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert * 3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param rules The source form of the break rules 3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param ruleBinary An output stream to receive the compiled rules. 3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @throws IOException If there is an error writing the output. 
* @see #getInstanceFromCompiledRules(InputStream)
     * @stable ICU 4.8
     */
    public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
        // Pure delegation: the rule builder does the compilation and writes the
        // result to the supplied stream.
        RBBIRuleBuilder.compileRules(rules, ruleBinary);
    }

    //=======================================================================
    // BreakIterator overrides
    //=======================================================================

    /**
     * Sets the current iteration position to the beginning of the text.
     * (i.e., the CharacterIterator's starting offset).
     * @return The offset of the beginning of the text.
3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3372d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int first() { 3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fCachedBreakPositions = null; 3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fDictionaryCharCount = 0; 3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fPositionInCache = 0; 3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastRuleStatusIndex = 0; 3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastStatusIndexValid = true; 3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fText == null) { 3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return BreakIterator.DONE; 3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.first(); 3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return fText.getIndex(); 3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 35087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Sets the current iteration position to the end of the text. 3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (i.e., the CharacterIterator's ending offset). 3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The text's past-the-end offset. 
3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3572d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int last() { 3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fCachedBreakPositions = null; 3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fDictionaryCharCount = 0; 3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fPositionInCache = 0; 3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fText == null) { 3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastRuleStatusIndex = 0; 3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastStatusIndexValid = true; 3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return BreakIterator.DONE; 3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // t.last() returns the offset of the last character, 3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // rather than the past-the-end offset 3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ... 3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // will work correctly. 
3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastStatusIndexValid = false; 3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int pos = fText.getEndIndex(); 3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.setIndex(pos); 3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return pos; 3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 37887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Advances the iterator either forward or backward the specified number of steps. 3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Negative values move backward, and positive values move forward. This is 3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * equivalent to repeatedly calling next() or previous(). 3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param n The number of steps to move. The sign indicates the direction 3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (negative is backwards, and positive is forwards). 3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The character offset of the boundary position n boundaries away from 3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the current one. 
3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 3892d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int next(int n) { 3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int result = current(); 3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (n > 0) { 3937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = next(); 3947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --n; 3957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (n < 0) { 3977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = previous(); 3987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++n; 3997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 4017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 40287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 4037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Advances the iterator to the next boundary position. 4057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The position of the first boundary after this one. 
4067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 4077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4082d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 4097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int next() { 4107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we have cached break positions and we're still in the range 4117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // covered by them, just move one step forward in the cache 4127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fCachedBreakPositions != null) { 4137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fPositionInCache < fCachedBreakPositions.length - 1) { 4147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++fPositionInCache; 4157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int pos = fCachedBreakPositions[fPositionInCache]; 4167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.setIndex(pos); 4177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return pos; 4187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert else { 4207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reset(); 4217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int startPos = current(); 4257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fDictionaryCharCount = 0; 4267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int result = handleNext(fRData.fFTable); 4277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fDictionaryCharCount > 0) { 4287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = checkDictionary(startPos, result, false); 4297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 
4307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 4317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 4347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * checkDictionary This function handles all processing of characters in 4357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the "dictionary" set. It will determine the appropriate 4367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * course of action, and possibly set up a cache in the 4377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * process. 4387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 4397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int checkDictionary(int startPos, int endPos, boolean reverse) { 44087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 4417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Reset the old break cache first. 4427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reset(); 4437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 44487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // note: code segment below assumes that dictionary chars are in the 4457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // startPos-endPos range 4467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // value returned should be next character in sequence 4477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((endPos - startPos) <= 1) { 4487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (reverse ? 
startPos : endPos); 4497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Starting from the starting point, scan towards the proposed result, 4527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // looking for the first dictionary character (which may be the one 4537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we're on, if we're starting in the middle of a range). 4547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.setIndex(reverse ? endPos : startPos); 4557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (reverse) { 4567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharacterIteration.previous32(fText); 4577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int rangeStart = startPos; 4607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int rangeEnd = endPos; 4617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int category; 4637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int current; 4647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert DictionaryBreakEngine.DequeI breaks = new DictionaryBreakEngine.DequeI(); 4657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int foundBreakCount = 0; 4667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c = CharacterIteration.current32(fText); 4677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert category = (short)fRData.fTrie.getCodePointValue(c); 4687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Is the character we're starting on a dictionary character? 
If so, we 4707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // need to back up to include the entire run; otherwise the results of 4717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the break algorithm will differ depending on where we start. Since 4727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the result is cached and there is typically a non-dictionary break 4737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // within a small number of words, there should be little performance impact. 4747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((category & 0x4000) != 0) { 4757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (reverse) { 4767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 4777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharacterIteration.next32(fText); 4787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = CharacterIteration.current32(fText); 4797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert category = (short)fRData.fTrie.getCodePointValue(c); 4807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while (c != CharacterIteration.DONE32 && ((category & 0x4000)) != 0); 48187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 4827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Back up to the last dictionary character 4837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rangeEnd = fText.getIndex(); 4847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (c == CharacterIteration.DONE32) { 4857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c = fText->last32(); 4867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: why was this if needed? 
4877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = CharacterIteration.previous32(fText); 4887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert else { 4907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = CharacterIteration.previous32(fText); 4917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert else { 4947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 4957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = CharacterIteration.previous32(fText); 4967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert category = (short)fRData.fTrie.getCodePointValue(c); 4977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (c != CharacterIteration.DONE32 && ((category & 0x4000) != 0)); 4997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Back up to the last dictionary character 5007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (c == CharacterIteration.DONE32) { 5017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // c = fText->first32(); 5027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = CharacterIteration.current32(fText); 5037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert else { 5057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharacterIteration.next32(fText); 5067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = CharacterIteration.current32(fText); 5077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rangeStart = fText.getIndex(); 5097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert category = 
(short)fRData.fTrie.getCodePointValue(c); 5117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 51387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 5147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Loop through the text, looking for ranges of dictionary characters. 5157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // For each span, find the appropriate break engine, and ask it to find 5167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // any breaks within the span. 5177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: we always do this in the forward direction, so that the break 5187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // cache is built in the right order. 5197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (reverse) { 5207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.setIndex(rangeStart); 5217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = CharacterIteration.current32(fText); 5227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert category = (short)fRData.fTrie.getCodePointValue(c); 5237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert LanguageBreakEngine lbe = null; 5257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(true) { 5267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) { 5277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharacterIteration.next32(fText); 5287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = CharacterIteration.current32(fText); 5297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert category = (short)fRData.fTrie.getCodePointValue(c); 5307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (current >= rangeEnd) { 
5327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 5337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 53487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 5357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We now have a dictionary character. Get the appropriate language object 5367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // to deal with it. 5377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lbe = getLanguageBreakEngine(c); 53887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 5397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Ask the language object if there are any breaks. It will leave the text 5407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // pointer on the other side of its range, ready to search for the next one. 5417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (lbe != null) { 5427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int startingIdx = fText.getIndex(); 5437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, false, fBreakType, breaks); 5447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert fText.getIndex() > startingIdx; 5457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 54687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 5477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Reload the loop variables for the next go-round 5487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = CharacterIteration.current32(fText); 5497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert category = (short)fRData.fTrie.getCodePointValue(c); 5507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 55187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 5527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If we found breaks, build a new break cache. 
The first and last entries must 5537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // be the original starting and ending position. 5547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (foundBreakCount > 0) { 5557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (foundBreakCount != breaks.size()) { 5567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.println("oops, foundBreakCount != breaks.size(). LBE = " + lbe.getClass()); 5577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert foundBreakCount == breaks.size(); 5597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (startPos < breaks.peekLast()) { 5607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert breaks.offer(startPos); 5617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (endPos > breaks.peek()) { 5637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert breaks.push(endPos); 5647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 56587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 5667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: get rid of this array, use results from the deque directly 5677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fCachedBreakPositions = new int[breaks.size()]; 56887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 5697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int i = 0; 5707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (breaks.size() > 0) { 5717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fCachedBreakPositions[i++] = breaks.pollLast(); 5727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 57387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 5747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If there are breaks, then by definition, we are replacing the original 
5757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // proposed break by one of the breaks we found. Use following() and 5767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // preceding() to do the work. They should never recurse in this case. 5777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (reverse) { 5787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return preceding(endPos); 5797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert else { 5817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return following(startPos); 5827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If we get here, there were no language-based breaks. Set the text pointer 5867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // to the original proposed break. 5877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.setIndex(reverse ? startPos : endPos); 5887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (reverse ? startPos : endPos); 58987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 5907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 59187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 59287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 5937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 5947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Moves the iterator backwards, to the last boundary preceding this one. 5957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The position of the last boundary position preceding this one. 
5967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 5977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5982d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 5997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int previous() { 6007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int result; 6017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int startPos; 60287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 6037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharacterIterator text = getText(); 6047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastStatusIndexValid = false; 6067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we have cached break positions and we're still in the range 6087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // covered by them, just move one step backward in the cache 6097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fCachedBreakPositions != null) { 6107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fPositionInCache > 0) { 6117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --fPositionInCache; 6127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If we're at the beginning of the cache, need to reevaluate the 6137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // rule status 6147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fPositionInCache <= 0) { 6157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastStatusIndexValid = false; 6167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int pos = fCachedBreakPositions[fPositionInCache]; 6187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text.setIndex(pos); 6197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 
pos; 6207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 6217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert reset(); 6227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we're already sitting at the beginning of the text, return DONE 6267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert startPos = current(); 6277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fText == null || startPos == fText.getBeginIndex()) { 6287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastRuleStatusIndex = 0; 6297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastStatusIndexValid = true; 6307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return BreakIterator.DONE; 6317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Rules with an exact reverse table are handled here. 6347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fRData.fSRTable != null || fRData.fSFTable != null) { 6357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = handlePrevious(fRData.fRTable); 6367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fDictionaryCharCount > 0) { 6377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = checkDictionary(result, startPos, true); 6387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 6407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // old rule syntax 6437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // set things up. 
handlePrevious() will back us up to some valid 6447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // break position before the current position (we back our internal 6457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // iterator up one step to prevent handlePrevious() from returning 6467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the current position), but not necessarily the last one before 6477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // where we started 6487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int start = current(); 6507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert previous32(fText); 6527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int lastResult = handlePrevious(fRData.fRTable); 6537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (lastResult == BreakIterator.DONE) { 6547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastResult = fText.getBeginIndex(); 6557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.setIndex(lastResult); 6567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = lastResult; 6587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int lastTag = 0; 6597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean breakTagValid = false; 6607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // iterate forward from the known break position until we pass our 6627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // starting point. 
The last break position before the starting 6637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // point is our return value 6647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (;;) { 6667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = next(); 6677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (result == BreakIterator.DONE || result >= start) { 6687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 6697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastResult = result; 6717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastTag = fLastRuleStatusIndex; 6727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert breakTagValid = true; 6737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // fLastBreakTag wants to have the value for section of text preceding 6767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the result position that we are to return (in lastResult.) If 6777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the backwards rules overshot and the above loop had to do two or more 6787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // handleNext()s to move up to the desired return position, we will have a valid 6797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // tag value. But, if handlePrevious() took us to exactly the correct result position, 6807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we wont have a tag value for that position, which is only set by handleNext(). 
6817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Set the current iteration position to be the last break position 6837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // before where we started, and then return that value. 6847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.setIndex(lastResult); 6857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() 6867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastStatusIndexValid = breakTagValid; 6877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return lastResult; 6887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 6917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Sets the iterator to refer to the first boundary position following 6927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the specified position. 6937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param offset The position from which to begin searching for a break position. 6947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The position of the first break after the current position. 
6957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 6967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 6972d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 6987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int following(int offset) { 6997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharacterIterator text = getText(); 7007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we have no cached break positions, or if "offset" is outside the 7027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // range covered by the cache, then dump the cache and call our 7037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // inherited following() method. This will call other methods in this 7047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // class that may refresh the cache. 7057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fCachedBreakPositions == null || offset < fCachedBreakPositions[0] || 7067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert offset >= fCachedBreakPositions[fCachedBreakPositions.length - 1]) { 7077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fCachedBreakPositions = null; 7087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return rulesFollowing(offset); 7097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // on the other hand, if "offset" is within the range covered by the 7127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // cache, then just search the cache for the first break position 7137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // after "offset" 7147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert else { 7157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fPositionInCache = 0; 
7167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (fPositionInCache < fCachedBreakPositions.length 7177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert && offset >= fCachedBreakPositions[fPositionInCache]) 7187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++fPositionInCache; 7197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text.setIndex(fCachedBreakPositions[fPositionInCache]); 7207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return text.getIndex(); 7217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 72387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 7247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int rulesFollowing(int offset) { 7257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if the offset passed in is already past the end of the text, 7267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // just return DONE; if it's before the beginning, return the 7277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // text's starting offset 7287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastRuleStatusIndex = 0; 7297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastStatusIndexValid = true; 7307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fText == null || offset >= fText.getEndIndex()) { 7317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert last(); 7327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return next(); 7337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert else if (offset < fText.getBeginIndex()) { 7357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return first(); 7367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // otherwise, set our internal iteration 
position (temporarily) 7397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // to the position passed in. If this is the _beginning_ position, 7407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // then we can just use next() to get our return value 7417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int result = 0; 7437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fRData.fSRTable != null) { 7457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Safe Point Reverse rules exist. 7467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This allows us to use the optimum algorithm. 7477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.setIndex(offset); 7487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // move forward one codepoint to prepare for moving back to a 7497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // safe point. 
7507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // this handles offset being between a supplementary character 7517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert next32(fText); 7527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // handlePrevious will move most of the time to < 1 boundary away 7537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert handlePrevious(fRData.fSRTable); 7547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = next(); 7557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (result <= offset) { 7567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = next(); 7577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 7597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fRData.fSFTable != null) { 7617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No Safe point reverse table, but there is a safe pt forward table. 
76287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // 7637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.setIndex(offset); 7647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert previous32(fText); 7657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // handle next will give result >= offset 7667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert handleNext(fRData.fSFTable); 7677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // previous will give result 0 or 1 boundary away from offset, 7687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // most of the time 7697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we have to 7707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int oldresult = previous(); 7717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (oldresult > offset) { 7727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = previous(); 7737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (result <= offset) { 7747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return oldresult; 7757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert oldresult = result; 7777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = next(); 7797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (result <= offset) { 7807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return next(); 7817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 7837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // otherwise, we have to sync up first. 
Use handlePrevious() to back 7857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // us up to a known break position before the specified position (if 7867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we can determine that the specified position is a break position, 7877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we don't back up at all). This may or may not be the last break 7887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // position at or before our starting position. Advance forward 7897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // from here until we've passed the starting position. The position 7907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we stop on will be the first break position after the specified one. 7917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // old rule syntax 7927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.setIndex(offset); 7947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (offset == fText.getBeginIndex()) { 7957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return next(); 7967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = previous(); 7987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (result != BreakIterator.DONE && result <= offset) { 8007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = next(); 8017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 8047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 8067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Sets the iterator to refer to the last 
boundary position before the 8077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * specified position. 8087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param offset The position to begin searching for a break from. 8097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The position of the last boundary before the starting position. 8107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 8117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8122d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 8137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int preceding(int offset) { 8147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharacterIterator text = getText(); 8157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we have no cached break positions, or "offset" is outside the 8177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // range covered by the cache, we can just call the inherited routine 8187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (which will eventually call other routines in this class that may 8197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // refresh the cache) 8207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fCachedBreakPositions == null || offset <= fCachedBreakPositions[0] || 8217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert offset > fCachedBreakPositions[fCachedBreakPositions.length - 1]) { 8227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fCachedBreakPositions = null; 8237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return rulesPreceding(offset); 8247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // on the other hand, if "offset" is within the range covered by the cache, 
8277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // then all we have to do is search the cache for the last break position 8287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // before "offset" 8297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert else { 8307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fPositionInCache = 0; 8317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (fPositionInCache < fCachedBreakPositions.length 8327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert && offset > fCachedBreakPositions[fPositionInCache]) 8337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++fPositionInCache; 8347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert --fPositionInCache; 8357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text.setIndex(fCachedBreakPositions[fPositionInCache]); 8367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return text.getIndex(); 8377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 83987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 8407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int rulesPreceding(int offset) { 8417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if the offset passed in is already past the end of the text, 8427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // just return DONE; if it's before the beginning, return the 8437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // text's starting offset 8457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fText == null || offset > fText.getEndIndex()) { 8467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // return BreakIterator::DONE; 8477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return last(); 8487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik 
Roubert else if (offset < fText.getBeginIndex()) { 8507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return first(); 8517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we start by updating the current iteration position to the 8547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // position specified by the caller, we can just use previous() 8557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // to carry out this operation 8567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int result; 8587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fRData.fSFTable != null) { 8597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /// todo synwee 8607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // new rule syntax 8617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.setIndex(offset); 8627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // move backwards one codepoint to prepare for moving forwards to a 8637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // safe point. 
8647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // this handles offset being between a supplementary character 8657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert previous32(fText); 8667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert handleNext(fRData.fSFTable); 8677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = previous(); 8687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (result >= offset) { 8697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = previous(); 8707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 8727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fRData.fSRTable != null) { 8747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // backup plan if forward safe table is not available 8757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.setIndex(offset); 8767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert next32(fText); 8777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // handle previous will give result <= offset 8787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert handlePrevious(fRData.fSRTable); 8797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // next will give result 0 or 1 boundary away from offset, 8817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // most of the time 8827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we have to 8837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int oldresult = next(); 8847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (oldresult < offset) { 8857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = next(); 8867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (result >= offset) { 8877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik 
Roubert return oldresult; 8887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert oldresult = result; 8907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = previous(); 8927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (result >= offset) { 8937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return previous(); 8947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 8967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // old rule syntax 8997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.setIndex(offset); 9007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return previous(); 9017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 9049e281ba4837cba4a1cf9523d6f8b0621b150063dScott Russell * Throw IllegalArgumentException unless begin <= offset < end. 
9057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 9067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 9077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert protected static final void checkOffset(int offset, CharacterIterator text) { 9087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { 9097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException("offset out of bounds"); 9107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 9157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns true if the specified position is a boundary position. As a side 9167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * effect, leaves the iterator pointing to the first boundary position at 9177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or after "offset". 9187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param offset the offset to check. 9197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return True if "offset" is a boundary position. 
9207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 9217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 9222d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 9237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean isBoundary(int offset) { 9247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert checkOffset(offset, fText); 9257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the beginning index of the iterator is always a boundary position by definition 9277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (offset == fText.getBeginIndex()) { 9287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert first(); // For side effects on current position, tag values. 9297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 9307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (offset == fText.getEndIndex()) { 9337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert last(); // For side effects on current position, tag values. 
9347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 9357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // otherwise, we can use following() on the position before the specified 9387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // one and return true if the position we get back is the one the user 9397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // specified 9407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // return following(offset - 1) == offset; 9427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: check whether it is safe to revert to the simpler offset-1 code 9437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The safe rules may take care of unpaired surrogates ok. 9447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.setIndex(offset); 9457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert previous32(fText); 9467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int pos = fText.getIndex(); 9477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean result = following(pos) == offset; 9487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 9497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 9527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the current iteration position. 9537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return The current iteration position. 
9547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 9557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 9562d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 9577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int current() { 9587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (fText != null) ? fText.getIndex() : BreakIterator.DONE; 9597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void makeRuleStatusValid() { 9627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fLastStatusIndexValid == false) { 9637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No cached status is available. 9647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int curr = current(); 9657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (curr == BreakIterator.DONE || curr == fText.getBeginIndex()) { 9667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // At start of text, or there is no text. Status is always zero. 9677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastRuleStatusIndex = 0; 9687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastStatusIndexValid = true; 9697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 9707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Not at start of text. Find status the tedious way. 
9717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int pa = fText.getIndex(); 9727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert first(); 9737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int pb = current(); 9747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (fText.getIndex() < pa) { 9757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pb = next(); 9767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Assert.assrt(pa == pb); 9787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Assert.assrt(fLastStatusIndexValid == true); 9807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Assert.assrt(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length); 9817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 9857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Return the status tag from the break rule that determined the most recently 9867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * returned break position. The values appear in the rule source 9877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * within brackets, {123}, for example. For rules that do not specify a 9887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * status, a default value of 0 is returned. If more than one rule applies, 9897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the numerically largest of the possible status values is returned. 
9907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 9917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Of the standard types of ICU break iterators, only the word break 9927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * iterator provides status values. The values are defined in 9937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * class RuleBasedBreakIterator, and allow distinguishing between words 9947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * that contain alphabetic letters, "words" that appear to be numbers, 9957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * punctuation and spaces, words containing ideographic characters, and 9967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * more. Call <code>getRuleStatus</code> after obtaining a boundary 9979e281ba4837cba4a1cf9523d6f8b0621b150063dScott Russell * position from <code>next()</code>, <code>previous()</code>, or 9987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * any other break iterator functions that returns a boundary position. 9997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 10007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return the status from the break rule that determined the most recently 10017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * returned break position. 10027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 10037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @draft ICU 3.0 (retain) 10047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @provisional This is a draft API and might change in a future release of ICU. 
10057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 10067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10072d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 10087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getRuleStatus() { 10097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert makeRuleStatusValid(); 10107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Status records have this form: 10117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Count N <-- fLastRuleStatusIndex points here. 10127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Status val 0 10137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Status val 1 10147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ... 10157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Status val N-1 <-- the value we need to return 10167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The status values are sorted in ascending order. 10177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This function returns the last (largest) of the array of status values. 10187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex]; 10197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int tagVal = fRData.fStatusTable[idx]; 10207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return tagVal; 10217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 102487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert * Get the status (tag) values from the break rule(s) that determined the most 10257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * recently returned break position. 
The values appear in the rule source 10267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * within brackets, {123}, for example. The default status value for rules 10277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * that do not explicitly provide one is zero. 10287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 10297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The status values used by the standard ICU break rules are defined 10307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * as public constants in class RuleBasedBreakIterator. 10317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <p> 10327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If the size of the output array is insufficient to hold the data, 10337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the output will be truncated to the available length. No exception 10347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * will be thrown. 10357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 103687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert * @param fillInArray an array to be filled in with the status values. 103787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert * @return The number of rule status values from rules that determined 10387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the most recent boundary returned by the break iterator. 10397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * In the event that the array is too small, the return value 10407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * is the total number of status values that were available, 10417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * not the reduced number that were actually returned. 
10427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @draft ICU 3.0 (retain) 10437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @provisional This is a draft API and might change in a future release of ICU. 10447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 10452d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 10467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getRuleStatusVec(int[] fillInArray) { 10477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert makeRuleStatusValid(); 10487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex]; 104987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (fillInArray != null) { 10507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int numToCopy = Math.min(numStatusVals, fillInArray.length); 10517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i=0; i<numToCopy; i++) { 10527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1]; 10537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return numStatusVals; 10567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 10597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Return a CharacterIterator over the text being analyzed. This version 10607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * of this method returns the actual CharacterIterator we're using internally. 10617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Changing the state of this iterator can have undefined consequences. 
If 10627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * you need to change it, clone it first. 10637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return An iterator over the text being analyzed. 10647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 10657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 10662d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 10677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public CharacterIterator getText() { 10687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return fText; 10697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 10727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Set the iterator to analyze a new piece of text. This function resets 10737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the current iteration position to the beginning of the text. 10747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param newText An iterator over the text to analyze. 
10757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @stable ICU 2.0 10767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 10772d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert @Override 10787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void setText(CharacterIterator newText) { 10797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText = newText; 10807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // first() resets the caches 10817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.first(); 10827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 10857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * package private 10867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 10877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void setBreakType(int type) { 10887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fBreakType = type; 10897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 10927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * package private 10937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 10947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int getBreakType() { 10957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return fBreakType; 10967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 10997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Control debug, trace and dump options. 
11007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @internal 11017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 11027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ? 11037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ICUDebug.value(RBBI_DEBUG_ARG) : null; 110487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 110587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 11067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private LanguageBreakEngine getLanguageBreakEngine(int c) { 11077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We have a dictionary character. 11097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Does an already instantiated break engine handle it? 11107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (LanguageBreakEngine candidate : fBreakEngines.values()) { 11117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (candidate.handles(c, fBreakType)) { 11127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return candidate; 11137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // if we don't have an existing engine, build one. 11177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); 11187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { 11197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Katakana, Hiragana and Han are handled by the same dictionary engine. 11207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Fold them together for mapping from script -> engine. 
11217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert script = UScript.HAN; 11227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 112387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 11247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert LanguageBreakEngine eng = fBreakEngines.get(script); 11257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /* 11267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (eng != null && !eng.handles(c, fBreakType)) { 11277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fUnhandledBreakEngine.handleChar(c, getBreakType()); 11287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert eng = fUnhandledBreakEngine; 11297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else */ { 11307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert try { 11317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switch (script) { 11327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case UScript.THAI: 11337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert eng = new ThaiBreakEngine(); 11347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 11357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case UScript.LAO: 11367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert eng = new LaoBreakEngine(); 11377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 11387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case UScript.MYANMAR: 11397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert eng = new BurmeseBreakEngine(); 11407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 11417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case UScript.KHMER: 11427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert eng = new KhmerBreakEngine(); 11437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 11447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case UScript.HAN: 11457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if 
(getBreakType() == KIND_WORD) { 11467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert eng = new CjkBreakEngine(false); 11477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert else { 11497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fUnhandledBreakEngine.handleChar(c, getBreakType()); 11507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert eng = fUnhandledBreakEngine; 11517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 11537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case UScript.HANGUL: 11547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (getBreakType() == KIND_WORD) { 11557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert eng = new CjkBreakEngine(true); 11567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 11577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fUnhandledBreakEngine.handleChar(c, getBreakType()); 11587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert eng = fUnhandledBreakEngine; 11597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 11617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert default: 11627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fUnhandledBreakEngine.handleChar(c, getBreakType()); 11637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert eng = fUnhandledBreakEngine; 11647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 11657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } catch (IOException e) { 11677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert eng = null; 11687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 
11717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (eng != null && eng != fUnhandledBreakEngine) { 11727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng); 11737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (existingEngine != null) { 11747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // There was a race & another thread was first to register an engine for this script. 11757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Use theirs and discard the one we just created. 11767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert eng = existingEngine; 11777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // assert eng.handles(c, fBreakType); 11797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return eng; 11817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 118387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert private static final int kMaxLookaheads = 8; 118487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert private static class LookAheadResults { 118587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert int fUsedSlotLimit; 118687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert int[] fPositions; 118787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert int[] fKeys; 118887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 118987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert LookAheadResults() { 119087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert fUsedSlotLimit= 0; 119187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert fPositions = new int[kMaxLookaheads]; 119287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert fKeys = new int[kMaxLookaheads]; 119387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 
} 119487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 119587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert int getPosition(int key) { 119687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert for (int i=0; i<fUsedSlotLimit; ++i) { 119787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (fKeys[i] == key) { 119887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert return fPositions[i]; 119987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 120087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 120187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert assert(false); 120287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert return -1; 120387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 120487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 120587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert void setPosition(int key, int position) { 120687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert int i; 120787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert for (i=0; i<fUsedSlotLimit; ++i) { 120887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (fKeys[i] == key) { 120987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert fPositions[i] = position; 121087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert return; 121187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 121287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 121387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (i >= kMaxLookaheads) { 121487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert assert(false); 121587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert i = kMaxLookaheads - 1; 121687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 121787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert fKeys[i] = key; 121887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert fPositions[i] = position; 121987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert assert(fUsedSlotLimit == i); 
122087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert fUsedSlotLimit = i + 1; 122187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 122287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 122387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert void reset() { 122487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert fUsedSlotLimit = 0; 122587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 122687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert }; 122787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert private LookAheadResults fLookAheadMatches = new LookAheadResults(); 122887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 12297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 12317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The State Machine Engine for moving forward is here. 12327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This function is the heart of the RBBI run time engine. 123387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert * 12347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param stateTable 12357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return the new iterator position 123687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert * 12377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A note on supplementary characters and the position of underlying 12387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Java CharacterIterator: Normally, a character iterator is positioned at 12397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the char most recently returned by next(). Within this function, when 12407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * a supplementary char is being processed, the char iterator is left 12417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * sitting on the trail surrogate, in the middle of the code point. 
12427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This is different from everywhere else, where an iterator always 12437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * points at the lead surrogate of a supplementary. 12447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 12457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int handleNext(short stateTable[]) { 12467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (TRACE) { 12477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.println("Handle Next pos char state category"); 12487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No matter what, handleNext alway correctly sets the break tag value. 12517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastStatusIndexValid = true; 12527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastRuleStatusIndex = 0; 12537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // caches for quicker access 12557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharacterIterator text = fText; 12567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharTrie trie = fRData.fTrie; 12577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Set up the starting char 12597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c = text.current(); 12607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 12617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = nextTrail32(text, c); 12627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (c == DONE32) { 12637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return BreakIterator.DONE; 12647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik 
Roubert } 12657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int initialPosition = text.getIndex(); 12677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int result = initialPosition; 12687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Set the initial state for the state machine 12707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int state = START_STATE; 127187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert int row = fRData.getRowIndex(state); 12727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert short category = 3; 12737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int flagsState = fRData.getStateTableFlags(stateTable); 12747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int mode = RBBI_RUN; 12757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { 12767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert category = 2; 12777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mode = RBBI_START; 12787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (TRACE) { 127987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 12807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 12817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 12827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 128487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert fLookAheadMatches.reset(); 12857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // loop until we reach 
the end of the text or transition to state 0 12877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (state != STOP_STATE) { 12887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (c == DONE32) { 12897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Reached end of input string. 12907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (mode == RBBI_END) { 12917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We have already run the loop one last time with the 12927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // character set to the pseudo {eof} value. Now it is time 12937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // to unconditionally bail out. 12947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 12957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Run the loop one last time with the fake end-of-input character category 12977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mode = RBBI_END; 12987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert category = 1; 12997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert else if (mode == RBBI_RUN) { 13017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Get the char category. An incoming category of 1 or 2 mens that 13027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // we are preset for doing the beginning or end of input, and 13037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // that we shouldn't get a category from an actual text input character. 
13047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 13057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // look up the current character's character category, which tells us 13077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // which column in the state table to look at. 13087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 13097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert category = (short) trie.getCodePointValue(c); 131087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 13117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Check the dictionary bit in the character's category. 13127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Counter is only used by dictionary based iterators (subclasses). 13137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Chars that need to be handled by a dictionary have a flag bit set 13147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // in their category values. 13157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 13167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((category & 0x4000) != 0) { 13177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fDictionaryCharCount++; 13187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // And off the dictionary flag bit. 
13197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert category &= ~0x4000; 13207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (TRACE) { 132387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 13247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 13257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 13267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 132887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // Advance to the next character. 13297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If this is a beginning-of-input loop iteration, don't advance. 13307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The next iteration will be processing the first real input character. 
13312d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert c = text.next(); 13327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 13337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = nextTrail32(text, c); 13347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert else { 13377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mode = RBBI_RUN; 13387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // look up a state transition in the state table 13417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 134287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert row = fRData.getRowIndex(state); 13437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { 13457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Match found, common case 13467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = text.getIndex(); 13477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 13487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The iterator has been left in the middle of a surrogate pair. 13497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We want the start of it. 13507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result--; 13517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Remember the break status (tag) values. 
13547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; 13557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 135787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING]; 135887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (completedRule > 0) { 135987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // Lookahead match is completed 136087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert int lookaheadResult = fLookAheadMatches.getPosition(completedRule); 136187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (lookaheadResult >= 0) { 136287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; 136387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert text.setIndex(lookaheadResult); 136487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert return lookaheadResult; 13657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 136687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 13677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 136887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; 136987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (rule != 0) { 137087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // At the position of a '/' in a look-ahead match. Record it. 137187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert int pos = text.getIndex(); 13727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 13737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The iterator has been left in the middle of a surrogate pair. 
13747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We want the beginning of it. 137587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert pos--; 13767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 137787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert fLookAheadMatches.setPosition(rule, pos); 13787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 138087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 13817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } // End of state machine main loop 13827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The state machine is done. Check whether it found a match... 13847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If the iterator failed to advance in the match engine force it ahead by one. 13867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This indicates a defect in the break rules, which should always match 13877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // at least one character. 138887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 13897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (result == initialPosition) { 13907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (TRACE) { 13917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.println("Iterator did not move. 
Advancing by 1."); 13927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text.setIndex(initialPosition); 13947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert next32(text); 13957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = text.getIndex(); 13967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert else { 13987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Leave the iterator at our result position. 13997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (we may have advanced beyond the last accepting position chasing after 14007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // longer matches that never completed.) 14017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert text.setIndex(result); 14027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (TRACE) { 14047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.println("result = " + result); 14057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 14077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int handlePrevious(short stateTable[]) { 14107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fText == null || stateTable == null) { 14117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 14127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 141387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 14147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int state; 14157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int category = 0; 14167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int 
mode; 141787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert int row; 14187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c; 14197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int result = 0; 14207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int initialPosition = 0; 142187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert fLookAheadMatches.reset(); 142287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 14237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // handlePrevious() never gets the rule status. 14247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Flag the status as invalid; if the user ever asks for status, we will need 14257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // to back up, then re-find the break position using handleNext(), which does 14267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // get the status value. 14277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastStatusIndexValid = false; 14287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fLastRuleStatusIndex = 0; 142987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 14307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // set up the starting char 14317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert initialPosition = fText.getIndex(); 14327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = initialPosition; 14337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = previous32(fText); 143487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 14357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Set up the initial state for the state machine 14367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert state = START_STATE; 14377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert row = fRData.getRowIndex(state); 14387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert category = 3; // TODO: obsolete? from the old start/run mode scheme? 
14397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mode = RBBI_RUN; 14407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { 14417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert category = 2; 14427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mode = RBBI_START; 14437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 144487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 14457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (TRACE) { 14467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.println("Handle Prev pos char state category "); 14477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 144887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 14497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // loop until we reach the beginning of the text or transition to state 0 14507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 14517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mainLoop: for (;;) { 145287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (c == DONE32) { 145387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // Reached end of input string. 145487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (mode == RBBI_END || fRData.fHeader.fVersion == 1) { 145587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // Either this is the old (ICU 3.2 and earlier) format data which 145687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // does not support explicit support for matching {eof}, or 145787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // we have already done the {eof} iteration. Now is the time 145887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // to unconditionally bail out. 
145987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (result == initialPosition) { 146087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // Ran off start, no match found. 146187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // Move one position (towards the start, since we are doing previous.) 146287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert fText.setIndex(initialPosition); 146387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert previous32(fText); 14647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 146587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert break mainLoop; 14667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 146787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert mode = RBBI_END; 146887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert category = 1; 146987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 147087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 147187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (mode == RBBI_RUN) { 147287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // look up the current character's category, which tells us 147387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // which column in the state table to look at. 147487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // 147587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert category = (short) fRData.fTrie.getCodePointValue(c); 147687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 147787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // Check the dictionary bit in the character's category. 147887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // Counter is only used by dictionary based iterators (subclasses). 147987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // Chars that need to be handled by a dictionary have a flag bit set 148087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // in their category values. 
14817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 148287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if ((category & 0x4000) != 0) { 148387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert fDictionaryCharCount++; 148487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // And off the dictionary flag bit. 148587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert category &= ~0x4000; 14867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 148787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 148887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 148987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 149087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (TRACE) { 149187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert System.out.print(" " + fText.getIndex() + " "); 149287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (0x20 <= c && c < 0x7f) { 149387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert System.out.print(" " + c + " "); 149487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } else { 149587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert System.out.print(" " + Integer.toHexString(c) + " "); 149687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 149787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert System.out.println(" " + state + " " + category + " "); 149887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 149987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 150087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // State Transition - move machine to its next state 150187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // 150287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 150387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert row = fRData.getRowIndex(state); 150487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 
150587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { 150687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // Match found, common case, could have lookahead so we move 150787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // on to check it 150887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert result = fText.getIndex(); 150987255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 151087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 151187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 151287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING]; 151387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (completedRule > 0) { 151487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // Lookahead match is completed. 151587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert int lookaheadResult = fLookAheadMatches.getPosition(completedRule); 151687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (lookaheadResult >= 0) { 151787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert result = lookaheadResult; 151887255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert break mainLoop; 15197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 152087255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 152187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; 152287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert if (rule != 0) { 152387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert // At the position of a '/' in a look-ahead match. Record it. 
152487255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert int pos = fText.getIndex(); 152587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert fLookAheadMatches.setPosition(rule, pos); 152687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert } 152787255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 15287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (state == STOP_STATE) { 15297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Normal loop exit is here 15307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break mainLoop; 15317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 153287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 15337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // then move iterator position backwards one character 15347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 15357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (mode == RBBI_RUN) { 15367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = previous32(fText); 15377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 15387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (mode == RBBI_START) { 15397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert mode = RBBI_RUN; 15407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 154287255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 154387255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 15447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } // End of the main loop. 154587255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 15467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The state machine is done. Check whether it found a match... 15477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 15487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If the iterator failed to advance in the match engine, force it ahead by one. 
15497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (This really indicates a defect in the break rules. They should always match 15507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // at least one character.) 15517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (result == initialPosition) { 15527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = fText.setIndex(initialPosition); 15537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert previous32(fText); 15547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert result = fText.getIndex(); 15557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 155687255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 15577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fText.setIndex(result); 15587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (TRACE) { 15597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.println("Result = " + result); 15607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 156187255a3fc79cc94374b5b8adc76a86e251ac7d3eFredrik Roubert 15627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return result; 15637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 15647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 15657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1566