1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 2005-2016 International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 */
9
10package com.ibm.icu.text;
11
12import static com.ibm.icu.impl.CharacterIteration.DONE32;
13import static com.ibm.icu.impl.CharacterIteration.next32;
14import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
15import static com.ibm.icu.impl.CharacterIteration.previous32;
16
17import java.io.ByteArrayOutputStream;
18import java.io.IOException;
19import java.io.InputStream;
20import java.io.OutputStream;
21import java.nio.ByteBuffer;
22import java.text.CharacterIterator;
23import java.util.ArrayList;
24import java.util.List;
25
26import com.ibm.icu.impl.CharacterIteration;
27import com.ibm.icu.impl.ICUBinary;
28import com.ibm.icu.impl.ICUDebug;
29import com.ibm.icu.impl.Trie2;
30import com.ibm.icu.lang.UCharacter;
31import com.ibm.icu.lang.UProperty;
32import com.ibm.icu.lang.UScript;
33
34/**
35 * Rule Based Break Iterator
36 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
37 *
38 * @stable ICU 2.0
39 */
40public class RuleBasedBreakIterator extends BreakIterator {
41    //=======================================================================
42    // Constructors & Factories
43    //=======================================================================
44
45    /**
46     * private constructor
47     */
48    private RuleBasedBreakIterator() {
49        fDictionaryCharCount  = 0;
50        synchronized(gAllBreakEngines) {
51            fBreakEngines = new ArrayList<LanguageBreakEngine>(gAllBreakEngines);
52        }
53    }
54
55    /**
56     * Create a break iterator from a precompiled set of break rules.
57     *
58     * Creating a break iterator from the binary rules is much faster than
59     * creating one from source rules.
60     *
61     * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
62     * Binary break iterator rules are not guaranteed to be compatible between
63     * different versions of ICU.
64     *
65     * @param is an input stream supplying the compiled binary rules.
66     * @throws IOException if there is an error while reading the rules from the InputStream.
67     * @see    #compileRules(String, OutputStream)
68     * @stable ICU 4.8
69     */
70    public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
71        RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
72        This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
73        return This;
74    }
75
76    /**
77     * Create a break iterator from a precompiled set of break rules.
78     *
79     * Creating a break iterator from the binary rules is much faster than
80     * creating one from source rules.
81     *
82     * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
83     * Binary break iterator rules are not guaranteed to be compatible between
84     * different versions of ICU.
85     *
86     * @param bytes a buffer supplying the compiled binary rules.
87     * @throws IOException if there is an error while reading the rules from the buffer.
88     * @see    #compileRules(String, OutputStream)
89     * @internal
90     * @deprecated This API is ICU internal only.
91     */
92    @Deprecated
93    public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
94        RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
95        This.fRData = RBBIDataWrapper.get(bytes);
96        return This;
97    }
98
99    /**
100     * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
101     * @param rules The break rules to be used.
102     * @stable ICU 2.2
103     */
104    public RuleBasedBreakIterator(String rules)  {
105        this();
106        try {
107            ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
108            compileRules(rules, ruleOS);
109            fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
110        } catch (IOException e) {
111            ///CLOVER:OFF
112            // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
113            //  causing bogus compiled rules to be produced, but with no compile error raised.
114            RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
115                    + e.getMessage());
116            throw rte;
117            ///CLOVER:ON
118        }
119    }
120
121    //=======================================================================
122    // Boilerplate
123    //=======================================================================
124
125    /**
126     * Clones this iterator.
127     * @return A newly-constructed RuleBasedBreakIterator with the same
128     * behavior as this one.
129     * @stable ICU 2.0
130     */
131    @Override
132    public Object clone()  {
133        RuleBasedBreakIterator result;
134        result = (RuleBasedBreakIterator)super.clone();
135        if (fText != null) {
136            result.fText = (CharacterIterator)(fText.clone());
137        }
138        synchronized (gAllBreakEngines)  {
139            result.fBreakEngines = new ArrayList<LanguageBreakEngine>(gAllBreakEngines);
140        }
141        result.fLookAheadMatches = new LookAheadResults();
142        result.fBreakCache = result.new BreakCache(fBreakCache);
143        result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache);
144        return result;
145    }
146
147
148    /**
149     * Returns true if both BreakIterators are of the same class, have the same
150     * rules, and iterate over the same text.
151     * @stable ICU 2.0
152     */
153    @Override
154    public boolean equals(Object that) {
155        if (that == null) {
156            return false;
157        }
158        if (this == that) {
159            return true;
160        }
161        try {
162            RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
163            if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
164                return false;
165            }
166            if (fRData != null && other.fRData != null &&
167                    (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
168                return false;
169            }
170            if (fText == null && other.fText == null) {
171                return true;
172            }
173            if (fText == null || other.fText == null || !fText.equals(other.fText)) {
174                return false;
175            }
176            return fPosition == other.fPosition;
177        }
178        catch(ClassCastException e) {
179            return false;
180        }
181     }
182
183    /**
184     * Returns the description (rules) used to create this iterator.
185     * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
186     * @stable ICU 2.0
187     */
188    @Override
189    public String toString() {
190        String retStr = "";
191        if (fRData != null) {
192            retStr =  fRData.fRuleSource;
193        }
194        return retStr;
195    }
196
197    /**
198     * Compute a hashcode for this BreakIterator
199     * @return A hash code
200     * @stable ICU 2.0
201     */
202    @Override
203    public int hashCode()
204    {
205        return fRData.fRuleSource.hashCode();
206    }
207
208
209    private static final int  START_STATE = 1;     // The state number of the starting state
210    private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"
211
212    // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
213    //               of user text.  A variable with this enum type keeps track of where we
214    //               are.  The state machine only fetches user text input while in RUN mode.
215    private static final int  RBBI_START  = 0;
216    private static final int  RBBI_RUN    = 1;
217    private static final int  RBBI_END    = 2;
218
219    /*
220     * The character iterator through which this BreakIterator accesses the text.
221     */
222    private CharacterIterator   fText = new java.text.StringCharacterIterator("");
223
224    /**
225     * The rule data for this BreakIterator instance. Package private.
226     */
227    RBBIDataWrapper             fRData;
228
229    /**
230     *  The iteration state - current position, rule status for the current position,
231     *                        and whether the iterator ran off the end, yielding UBRK_DONE.
232     *                        Current position is pinned to be 0 < position <= text.length.
233     *                        Current position is always set to a boundary.
234     *
235     *  The current  position of the iterator. Pinned, 0 < fPosition <= text.length.
236     *  Never has the value UBRK_DONE (-1).
237     */
238    private int                fPosition;
239
240    /**
241     * Index of the Rule {tag} values for the most recent match.
242     */
243    private int                fRuleStatusIndex;
244
245    /**
246     * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
247     */
248    private boolean            fDone;
249
250    /**
251     *   Cache of previously determined boundary positions.
252     */
253    private BreakCache         fBreakCache = new BreakCache();
254
255
256    /**
257     * Counter for the number of characters encountered with the "dictionary"
258     *   flag set.  Normal RBBI iterators don't use it, although the code
259     *   for updating it is live.  Dictionary Based break iterators (a subclass
260     *   of us) access this field directly.
261     * @internal
262     */
263    private int fDictionaryCharCount;
264
265    private DictionaryCache     fDictionaryCache = new DictionaryCache();
266
267    /*
268     * ICU debug argument name for RBBI
269     */
270    private static final String RBBI_DEBUG_ARG = "rbbi";
271
272    /**
273     * Debugging flag.  Trace operation of state machine when true.
274     */
275    private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
276            && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
277
278    /**
279     * What kind of break iterator this is.
280     * Defaulting BreakType to word gives reasonable dictionary behavior for
281     * Break Iterators that are built from rules.
282     */
283    private int fBreakType = KIND_WORD;
284
285    /**
286     * The "default" break engine - just skips over ranges of dictionary words,
287     * producing no breaks. Should only be used if characters need to be handled
288     * by a dictionary but we have no dictionary implementation for them.
289     *
290     * Only one instance; shared by all break iterators.
291     */
292    private static final UnhandledBreakEngine gUnhandledBreakEngine;
293
294    /**
295     * List of all known break engines, common for all break iterators.
296     * Lazily updated as break engines are needed, because instantiation of
297     * break engines is expensive.
298     *
299     * Because gAllBreakEngines can be referenced concurrently from different
300     * BreakIterator instances, all access is synchronized.
301     */
302    private static final List<LanguageBreakEngine> gAllBreakEngines;
303
304    static {
305        gUnhandledBreakEngine = new UnhandledBreakEngine();
306        gAllBreakEngines = new ArrayList<LanguageBreakEngine>();
307        gAllBreakEngines.add(gUnhandledBreakEngine);
308    }
309
310    /**
311     * List of all known break engines. Similar to gAllBreakEngines, but local to a
312     * break iterator, allowing it to be used without synchronization.
313     */
314    private List<LanguageBreakEngine> fBreakEngines;
315
316    /**
317     * Dump the contents of the state table and character classes for this break iterator.
318     * For debugging only.
319     * @internal
320     * @deprecated This API is ICU internal only.
321     */
322    @Deprecated
323    public void dump(java.io.PrintStream out) {
324        if (out == null) {
325            out = System.out;
326        }
327        this.fRData.dump(out);
328    }
329
330    /**
331     * Compile a set of source break rules into the binary state tables used
332     * by the break iterator engine.  Creating a break iterator from precompiled
333     * rules is much faster than creating one from source rules.
334     *
335     * Binary break rules are not guaranteed to be compatible between different
336     * versions of ICU.
337     *
338     *
339     * @param rules  The source form of the break rules
340     * @param ruleBinary  An output stream to receive the compiled rules.
341     * @throws IOException If there is an error writing the output.
342     * @see #getInstanceFromCompiledRules(InputStream)
343     * @stable ICU 4.8
344     */
345    public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
346        RBBIRuleBuilder.compileRules(rules, ruleBinary);
347    }
348
349    //=======================================================================
350    // BreakIterator overrides
351    //=======================================================================
352
353    /**
354     * Sets the current iteration position to the beginning of the text.
355     * (i.e., the CharacterIterator's starting offset).
356     * @return The offset of the beginning of the text.
357     * @stable ICU 2.0
358     */
359    @Override
360    public int first() {
361        if (fText == null) {
362            return BreakIterator.DONE;
363        }
364        fText.first();
365        int start =  fText.getIndex();
366        if (!fBreakCache.seek(start)) {
367            fBreakCache.populateNear(start);
368        }
369        fBreakCache.current();
370        assert(fPosition == start);
371        return fPosition;
372    }
373
374    /**
375     * Sets the current iteration position to the end of the text.
376     * (i.e., the CharacterIterator's ending offset).
377     * @return The text's past-the-end offset.
378     * @stable ICU 2.0
379     */
380    @Override
381    public int last() {
382        if (fText == null) {
383            return BreakIterator.DONE;
384        }
385        int endPos = fText.getEndIndex();
386        boolean endShouldBeBoundary = isBoundary(endPos);      // Has side effect of setting iterator position.
387        assert(endShouldBeBoundary);
388        if (fPosition != endPos) {
389            assert(fPosition == endPos);
390        }
391        return endPos;
392    }
393
394    /**
395     * Advances the iterator either forward or backward the specified number of steps.
396     * Negative values move backward, and positive values move forward.  This is
397     * equivalent to repeatedly calling next() or previous().
398     * @param n The number of steps to move.  The sign indicates the direction
399     * (negative is backwards, and positive is forwards).
400     * @return The character offset of the boundary position n boundaries away from
401     * the current one.
402     * @stable ICU 2.0
403     */
404    @Override
405    public int next(int n) {
406        int result = 0;
407        if (n > 0) {
408            for (; n > 0 && result != DONE; --n) {
409                result = next();
410            }
411        } else if (n < 0) {
412            for (; n < 0 && result != DONE; ++n) {
413                result = previous();
414            }
415        } else {
416            result = current();
417        }
418        return result;
419    }
420
421    /**
422     * Advances the iterator to the next boundary position.
423     * @return The position of the first boundary after this one.
424     * @stable ICU 2.0
425     */
426    @Override
427    public int next() {
428        fBreakCache.next();
429        return fDone ? DONE : fPosition;
430    }
431
432    /**
433     * Moves the iterator backwards, to the boundary preceding the current one.
434     * @return The position of the boundary position immediately preceding the starting position.
435     * @stable ICU 2.0
436     */
437    @Override
438    public int previous() {
439        fBreakCache.previous();
440        return fDone ? DONE : fPosition;
441    }
442
443    /**
444     * Sets the iterator to refer to the first boundary position following
445     * the specified position.
446     * @param startPos The position from which to begin searching for a break position.
447     * @return The position of the first break after the current position.
448     * @stable ICU 2.0
449     */
450    @Override
451    public int following(int startPos) {
452        // if the supplied position is before the beginning, return the
453        // text's starting offset
454        if (startPos < fText.getBeginIndex()) {
455            return first();
456        }
457
458        // Move requested offset to a code point start. It might be on a trail surrogate.
459        // Or it may be beyond the end of the text.
460        startPos = CISetIndex32(fText, startPos);
461        fBreakCache.following(startPos);
462        return fDone ? DONE : fPosition;
463    }
464
465
466    /**
467     * Sets the iterator to refer to the last boundary position before the
468     * specified position.
469     * @param offset The position to begin searching for a break from.
470     * @return The position of the last boundary before the starting position.
471     * @stable ICU 2.0
472     */
473    @Override
474    public int preceding(int offset) {
475        if (fText == null || offset > fText.getEndIndex()) {
476            return last();
477        } else if (offset < fText.getBeginIndex()) {
478            return first();
479        }
480
481        // Move requested offset to a code point start. It might be on a trail surrogate.
482        // int adjustedOffset = CISetIndex32(fText, offset);    // TODO: restore to match ICU4C behavior.
483        int adjustedOffset = offset;
484        fBreakCache.preceding(adjustedOffset);
485        return fDone ? DONE : fPosition;
486
487    }
488
489
490    /**
491     * Throw IllegalArgumentException unless begin &lt;= offset &lt; end.
492     * @stable ICU 2.0
493     */
494    protected static final void checkOffset(int offset, CharacterIterator text) {
495        if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
496            throw new IllegalArgumentException("offset out of bounds");
497        }
498    }
499
500
501    /**
502     * Returns true if the specified position is a boundary position.  As a side
503     * effect, leaves the iterator pointing to the first boundary position at
504     * or after "offset".
505     * @param offset the offset to check.
506     * @return True if "offset" is a boundary position.
507     * @stable ICU 2.0
508     */
509    @Override
510    public boolean isBoundary(int offset) {
511        // TODO: behavior difference with ICU4C, which considers out-of-range offsets
512        //       to not be boundaries, and to not be errors.
513        checkOffset(offset, fText);
514
515        // Adjust offset to be on a code point boundary and not beyond the end of the text.
516        // Note that isBoundary() is always be false for offsets that are not on code point boundaries.
517        // But we still need the side effect of leaving iteration at the following boundary.
518        int adjustedOffset = CISetIndex32(fText, offset);
519
520        boolean result = false;
521        if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) {
522            result = (fBreakCache.current() == offset);
523        }
524
525        if (!result) {
526            // Not on a boundary. isBoundary() must leave iterator on the following boundary.
527            // fBreakCache.seek(), above, left us on the preceding boundary, so advance one.
528            next();
529        }
530        return result;
531
532    }
533
534    /**
535     * Returns the current iteration position.  Note that UBRK_DONE is never
536     * returned from this function; if iteration has run to the end of a
537     * string, current() will return the length of the string while
538     * next() will return BreakIterator.DONE).
539     * @return The current iteration position.
540     * @stable ICU 2.0
541     */
542    @Override
543    public int current() {
544        return (fText != null) ? fPosition : BreakIterator.DONE;
545    }
546
547
548    /**
549     * Return the status tag from the break rule that determined the most recently
550     * returned break position.  The values appear in the rule source
551     * within brackets, {123}, for example.  For rules that do not specify a
552     * status, a default value of 0 is returned.  If more than one rule applies,
553     * the numerically largest of the possible status values is returned.
554     * <p>
555     * Of the standard types of ICU break iterators, only the word and line break
556     * iterator provides status values.  The values are defined in
557     * class RuleBasedBreakIterator, and allow distinguishing between words
558     * that contain alphabetic letters, "words" that appear to be numbers,
559     * punctuation and spaces, words containing ideographic characters, and
560     * more.  Call <code>getRuleStatus</code> after obtaining a boundary
561     * position from <code>next()</code>, <code>previous()</code>, or
562     * any other break iterator functions that returns a boundary position.
563     * <p>
564     * @return the status from the break rule that determined the most recently
565     * returned break position.
566     *
567     * @stable ICU 60
568     */
569
570    @Override
571    public int  getRuleStatus() {
572        //   Status records have this form:
573        //           Count N         <--  fLastRuleStatusIndex points here.
574        //           Status val 0
575        //           Status val 1
576        //              ...
577        //           Status val N-1  <--  the value we need to return
578        //   The status values are sorted in ascending order.
579        //   This function returns the last (largest) of the array of status values.
580        int  idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex];
581        int  tagVal = fRData.fStatusTable[idx];
582        return tagVal;
583    }
584
585    /**
586     * Get the status (tag) values from the break rule(s) that determined the most
587     * recently returned break position.  The values appear in the rule source
588     * within brackets, {123}, for example.  The default status value for rules
589     * that do not explicitly provide one is zero.
590     * <p>
591     * The status values used by the standard ICU break rules are defined
592     * as public constants in class RuleBasedBreakIterator.
593     * <p>
594     * If the size  of the output array is insufficient to hold the data,
595     *  the output will be truncated to the available length.  No exception
596     *  will be thrown.
597     *
598     * @param fillInArray an array to be filled in with the status values.
599     * @return          The number of rule status values from rules that determined
600     *                  the most recent boundary returned by the break iterator.
601     *                  In the event that the array is too small, the return value
602     *                  is the total number of status values that were available,
603     *                  not the reduced number that were actually returned.
604     * @stable ICU 60
605     */
606    @Override
607    public int getRuleStatusVec(int[] fillInArray) {
608        int numStatusVals = fRData.fStatusTable[fRuleStatusIndex];
609        if (fillInArray != null) {
610            int numToCopy = Math.min(numStatusVals, fillInArray.length);
611            for (int i=0; i<numToCopy; i++) {
612                fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1];
613            }
614        }
615        return numStatusVals;
616    }
617
618    /**
619     * Return a CharacterIterator over the text being analyzed.  This version
620     * of this method returns the actual CharacterIterator we're using internally.
621     * Changing the state of this iterator can have undefined consequences.  If
622     * you need to change it, clone it first.
623     * @return An iterator over the text being analyzed.
624     * @stable ICU 2.0
625     */
626    @Override
627    public CharacterIterator getText() {
628        return fText;
629    }
630
631    /**
632     * Set the iterator to analyze a new piece of text.  This function resets
633     * the current iteration position to the beginning of the text.
634     * @param newText An iterator over the text to analyze.
635     * @stable ICU 2.0
636     */
637    @Override
638    public void setText(CharacterIterator newText) {
639        if (newText != null) {
640            fBreakCache.reset(newText.getBeginIndex(), 0);
641        } else {
642            fBreakCache.reset();
643        }
644        fDictionaryCache.reset();
645        fText = newText;
646        this.first();
647    }
648
649    /**
650     * package private
651     */
652    void setBreakType(int type) {
653        fBreakType = type;
654    }
655
656    /**
657     * package private
658     */
659    int getBreakType() {
660        return fBreakType;
661    }
662
663    /**
664     * Control debug, trace and dump options.
665     * @internal
666     */
667    static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
668                                        ICUDebug.value(RBBI_DEBUG_ARG) : null;
669
670
671    private LanguageBreakEngine getLanguageBreakEngine(int c) {
672
673        // We have a dictionary character.
674        // Does an already instantiated break engine handle it?
675        for (LanguageBreakEngine candidate : fBreakEngines) {
676            if (candidate.handles(c, fBreakType)) {
677                return candidate;
678            }
679        }
680
681        synchronized (gAllBreakEngines) {
682            // This break iterator's list of break engines didn't handle the character.
683            // Check the global list, another break iterator may have instantiated the
684            // desired engine.
685            for (LanguageBreakEngine candidate : gAllBreakEngines) {
686                if (candidate.handles(c, fBreakType)) {
687                    fBreakEngines.add(candidate);
688                    return candidate;
689                }
690            }
691
692            // The global list doesn't have an existing engine, build one.
693            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
694            if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
695                // Katakana, Hiragana and Han are handled by the same dictionary engine.
696                // Fold them together for mapping from script -> engine.
697                script = UScript.HAN;
698            }
699
700            LanguageBreakEngine eng;
701            try {
702                switch (script) {
703                case UScript.THAI:
704                    eng = new ThaiBreakEngine();
705                    break;
706                case UScript.LAO:
707                    eng = new LaoBreakEngine();
708                    break;
709                case UScript.MYANMAR:
710                    eng = new BurmeseBreakEngine();
711                    break;
712                case UScript.KHMER:
713                    eng = new KhmerBreakEngine();
714                    break;
715                case UScript.HAN:
716                    if (getBreakType() == KIND_WORD) {
717                        eng = new CjkBreakEngine(false);
718                    }
719                    else {
720                        gUnhandledBreakEngine.handleChar(c, getBreakType());
721                        eng = gUnhandledBreakEngine;
722                    }
723                    break;
724                case UScript.HANGUL:
725                    if (getBreakType() == KIND_WORD) {
726                        eng = new CjkBreakEngine(true);
727                    } else {
728                        gUnhandledBreakEngine.handleChar(c, getBreakType());
729                        eng = gUnhandledBreakEngine;
730                    }
731                    break;
732                default:
733                    gUnhandledBreakEngine.handleChar(c, getBreakType());
734                    eng = gUnhandledBreakEngine;
735                    break;
736                }
737            } catch (IOException e) {
738                eng = null;
739            }
740
741            if (eng != null && eng != gUnhandledBreakEngine) {
742                gAllBreakEngines.add(eng);
743                fBreakEngines.add(eng);
744            }
745            return eng;
746        }   // end synchronized(gAllBreakEngines)
747    }
748
749    private static final int kMaxLookaheads = 8;
750    private static class LookAheadResults {
751        int      fUsedSlotLimit;
752        int[]    fPositions;
753        int[]    fKeys;
754
755        LookAheadResults() {
756            fUsedSlotLimit= 0;
757            fPositions = new int[kMaxLookaheads];
758            fKeys = new int[kMaxLookaheads];
759        }
760
761        int getPosition(int key) {
762            for (int i=0; i<fUsedSlotLimit; ++i) {
763                if (fKeys[i] == key) {
764                    return fPositions[i];
765                }
766            }
767            assert(false);
768            return -1;
769        }
770
771        void setPosition(int key, int position) {
772            int i;
773            for (i=0; i<fUsedSlotLimit; ++i) {
774                if (fKeys[i] == key) {
775                    fPositions[i] = position;
776                    return;
777                }
778            }
779            if (i >= kMaxLookaheads) {
780                assert(false);
781                i = kMaxLookaheads - 1;
782            }
783            fKeys[i] = key;
784            fPositions[i] = position;
785            assert(fUsedSlotLimit == i);
786            fUsedSlotLimit = i + 1;
787        }
788
789        void reset() {
790            fUsedSlotLimit = 0;
791        }
792    };
793    private LookAheadResults fLookAheadMatches = new LookAheadResults();
794
795
796    /**
797     * The State Machine Engine for moving forward is here.
798     * This function is the heart of the RBBI run time engine.
799     *
800     * Input
801     *    fPosition, the position in the text to begin from.
802     * Output
803     *    fPosition:           the boundary following the starting position.
804     *    fDictionaryCharCount the number of dictionary characters encountered.
805     *                         If > 0, the segment will be further subdivided
806     *    fRuleStatusIndex     Info from the state table indicating which rules caused the boundary.
807     *
808     * @return the new iterator position
809     *
810     * A note on supplementary characters and the position of underlying
811     * Java CharacterIterator:   Normally, a character iterator is positioned at
812     * the char most recently returned by next().  Within this function, when
813     * a supplementary char is being processed, the char iterator is left
814     * sitting on the trail surrogate, in the middle of the code point.
815     * This is different from everywhere else, where an iterator always
816     * points at the lead surrogate of a supplementary.
817     */
818    private int handleNext() {
819        if (TRACE) {
820            System.out.println("Handle Next   pos      char  state category");
821        }
822
823        // handleNext always sets the break tag value.
824        // Set the default for it.
825        fRuleStatusIndex  = 0;
826        fDictionaryCharCount = 0;
827
828        // caches for quicker access
829        CharacterIterator text = fText;
830        Trie2 trie = fRData.fTrie;
831
832        short[] stateTable  = fRData.fFTable;
833        int initialPosition = fPosition;
834        text.setIndex(initialPosition);
835        int result          = initialPosition;
836
837        // Set up the starting char
838        int c = text.current();
839        if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
840            c = nextTrail32(text, c);
841            if (c == DONE32) {
842                fDone = true;
843                return BreakIterator.DONE;
844            }
845        }
846
847        // Set the initial state for the state machine
848        int state           = START_STATE;
849        int row             = fRData.getRowIndex(state);
850        short category      = 3;
851        int flagsState      = fRData.getStateTableFlags(stateTable);
852        int mode            = RBBI_RUN;
853        if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
854            category = 2;
855            mode     = RBBI_START;
856            if (TRACE) {
857                System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
858                System.out.print(RBBIDataWrapper.intToHexString(c, 10));
859                System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
860            }
861        }
862        fLookAheadMatches.reset();
863
864        // loop until we reach the end of the text or transition to state 0
865        while (state != STOP_STATE) {
866            if (c == DONE32) {
867                // Reached end of input string.
868                if (mode == RBBI_END) {
869                    // We have already run the loop one last time with the
870                    // character set to the pseudo {eof} value. Now it is time
871                    // to unconditionally bail out.
872                    break;
873                }
874                // Run the loop one last time with the fake end-of-input character category
875                mode = RBBI_END;
876                category = 1;
877            }
878            else if (mode == RBBI_RUN) {
879                // Get the char category.  An incoming category of 1 or 2 mens that
880                //      we are preset for doing the beginning or end of input, and
881                //      that we shouldn't get a category from an actual text input character.
882                //
883
884                // look up the current character's character category, which tells us
885                // which column in the state table to look at.
886                //
887                category = (short) trie.get(c);
888
889                // Check the dictionary bit in the character's category.
890                //    Counter is only used by dictionary based iterators (subclasses).
891                //    Chars that need to be handled by a dictionary have a flag bit set
892                //    in their category values.
893                //
894                if ((category & 0x4000) != 0)  {
895                    fDictionaryCharCount++;
896                    //  And off the dictionary flag bit.
897                    category &= ~0x4000;
898                }
899
900                if (TRACE) {
901                    System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
902                    System.out.print(RBBIDataWrapper.intToHexString(c, 10));
903                    System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
904                }
905
906                // Advance to the next character.
907                // If this is a beginning-of-input loop iteration, don't advance.
908                //    The next iteration will be processing the first real input character.
909                c = text.next();
910                if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
911                    c = nextTrail32(text, c);
912                }
913            }
914            else {
915                mode = RBBI_RUN;
916            }
917
918            // look up a state transition in the state table
919            state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
920            row   = fRData.getRowIndex(state);
921
922            if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
923                // Match found, common case
924                result = text.getIndex();
925                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
926                    // The iterator has been left in the middle of a surrogate pair.
927                    // We want the start of it.
928                    result--;
929                }
930
931                //  Remember the break status (tag) values.
932                fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
933            }
934
935            int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
936            if (completedRule > 0) {
937                // Lookahead match is completed
938                int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
939                if (lookaheadResult >= 0) {
940                    fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
941                    fPosition = lookaheadResult;
942                    return lookaheadResult;
943                }
944            }
945
946            int rule =  stateTable[row + RBBIDataWrapper.LOOKAHEAD];
947            if (rule != 0) {
948                // At the position of a '/' in a look-ahead match. Record it.
949                int  pos = text.getIndex();
950                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
951                    // The iterator has been left in the middle of a surrogate pair.
952                    // We want the beginning  of it.
953                    pos--;
954                }
955                fLookAheadMatches.setPosition(rule, pos);
956            }
957
958
959        }        // End of state machine main loop
960
961        // The state machine is done.  Check whether it found a match...
962
963        // If the iterator failed to advance in the match engine force it ahead by one.
964        // This indicates a defect in the break rules, which should always match
965        // at least one character.
966
967        if (result == initialPosition) {
968            if (TRACE) {
969                System.out.println("Iterator did not move. Advancing by 1.");
970            }
971            text.setIndex(initialPosition);
972            next32(text);
973            result = text.getIndex();
974            fRuleStatusIndex = 0;
975        }
976
977        // Leave the iterator at our result position.
978        //   (we may have advanced beyond the last accepting position chasing after
979        //    longer matches that never completed.)
980        fPosition = result;
981
982        if (TRACE) {
983            System.out.println("result = " + result);
984        }
985        return result;
986    }
987
988    /**
989     * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
990     * This locates a "Safe Position" from which the forward break rules
991     * will operate correctly. A Safe Position is not necessarily a boundary itself.
992     *
993     * The logic of this function is very similar to handleNext(), above.
994     *
995     * @param fromPosition the position in the input text to begin the iteration.
996     * @internal
997     */
998    private int handlePrevious(int fromPosition) {
999        if (fText == null) {
1000            return 0;
1001        }
1002
1003        int            state;
1004        int            category           = 0;
1005        int            mode;
1006        int            row;
1007        int            c;
1008        int            result             = 0;
1009        int            initialPosition    = fromPosition;
1010        fLookAheadMatches.reset();
1011        short[] stateTable = fRData.fSRTable;
1012        CISetIndex32(fText, fromPosition);
1013        if (fromPosition == fText.getBeginIndex()) {
1014            return BreakIterator.DONE;
1015        }
1016
1017        // set up the starting char
1018        result          = initialPosition;
1019        c               = previous32(fText);
1020
1021        // Set up the initial state for the state machine
1022        state = START_STATE;
1023        row = fRData.getRowIndex(state);
1024        category = 3;   // TODO:  obsolete?  from the old start/run mode scheme?
1025        mode     = RBBI_RUN;
1026        if ((fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
1027            category = 2;
1028            mode     = RBBI_START;
1029        }
1030
1031        if (TRACE) {
1032            System.out.println("Handle Prev   pos   char  state category ");
1033        }
1034
1035        // loop until we reach the beginning of the text or transition to state 0
1036        //
1037        mainLoop: for (;;) {
1038            if (c == DONE32) {
1039                // Reached end of input string.
1040                if (mode == RBBI_END) {
1041                    // We have already done the {eof} iteration.  Now is the time
1042                    // to unconditionally bail out.
1043                    break mainLoop;
1044                }
1045                mode = RBBI_END;
1046                category = 1;
1047            }
1048
1049            if (mode == RBBI_RUN) {
1050                // look up the current character's category, which tells us
1051                // which column in the state table to look at.
1052                //
1053                //  And off the dictionary flag bit. For reverse iteration it is not used.
1054                category = (short) fRData.fTrie.get(c);
1055                category &= ~0x4000;
1056            }
1057
1058            if (TRACE) {
1059                System.out.print("             " + fText.getIndex() + "   ");
1060                if (0x20 <= c && c < 0x7f) {
1061                    System.out.print("  " + c + "  ");
1062                } else {
1063                    System.out.print(" " + Integer.toHexString(c) + " ");
1064                }
1065                System.out.println(" " + state + "  " + category + " ");
1066            }
1067
1068            // State Transition - move machine to its next state
1069            //
1070            state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
1071            row = fRData.getRowIndex(state);
1072
1073            if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
1074                // Match found, common case, could have lookahead so we move
1075                // on to check it
1076                result = fText.getIndex();
1077            }
1078
1079
1080            int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
1081            if (completedRule > 0) {
1082                // Lookahead match is completed.
1083                int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
1084                if (lookaheadResult >= 0) {
1085                    result = lookaheadResult;
1086                    break mainLoop;
1087                }
1088            }
1089            int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
1090            if (rule != 0) {
1091                // At the position of a '/' in a look-ahead match. Record it.
1092                int pos = fText.getIndex();
1093                fLookAheadMatches.setPosition(rule, pos);
1094            }
1095
1096            if (state == STOP_STATE) {
1097                // Normal loop exit is here
1098                break mainLoop;
1099            }
1100
1101            // then move iterator position backwards one character
1102            //
1103            if (mode == RBBI_RUN) {
1104                c = previous32(fText);
1105            } else {
1106                if (mode == RBBI_START) {
1107                    mode = RBBI_RUN;
1108                }
1109            }
1110
1111
1112        }   // End of the main loop.
1113
1114        // The state machine is done.  Check whether it found a match...
1115        //
1116        // If the iterator failed to move in the match engine, force it back by one code point.
1117        //   (This really indicates a defect in the break rules.  They should always match
1118        //    at least one character.)
1119        if (result == initialPosition) {
1120            CISetIndex32(fText, initialPosition);
1121            previous32(fText);
1122            result = fText.getIndex();
1123        }
1124
1125        if (TRACE) {
1126            System.out.println("Result = " + result);
1127        }
1128
1129        return result;
1130    }
1131
1132    /**
1133     * Set the index of a CharacterIterator.
1134     * Pin the index to the valid range range of BeginIndex <= index <= EndIndex.
1135     * If the index points to a trail surrogate of a supplementary character, adjust it
1136     * to the start (lead surrogate) index.
1137     *
1138     * @param ci A CharacterIterator to set
1139     * @param index the index to set
1140     * @return the resulting index, possibly pinned or adjusted.
1141     */
1142    private static int CISetIndex32(CharacterIterator ci, int index) {
1143        if (index <= ci.getBeginIndex()) {
1144            ci.first();
1145        } else if (index >= ci.getEndIndex()) {
1146            ci.setIndex(ci.getEndIndex());
1147        } else if (Character.isLowSurrogate(ci.setIndex(index))) {
1148            if (!Character.isHighSurrogate(ci.previous())) {
1149                ci.next();
1150            }
1151        }
1152        return ci.getIndex();
1153    }
1154
1155    /* DictionaryCache  stores the boundaries obtained from a run of dictionary characters.
1156     *                 Dictionary boundaries are moved first to this cache, then from here
1157     *                 to the main BreakCache, where they may inter-leave with non-dictionary
1158     *                 boundaries. The public BreakIterator API always fetches directly
1159     *                 from the main BreakCache, not from here.
1160     *
1161     *                 In common situations, the number of boundaries in a single dictionary run
1162     *                 should be quite small, it will be terminated by punctuation, spaces,
1163     *                 or any other non-dictionary characters. The main BreakCache may end
1164     *                 up with boundaries from multiple dictionary based runs.
1165     *
1166     *                 The boundaries are stored in a simple ArrayList (vector), with the
1167     *                 assumption that they will be accessed sequentially.
1168     */
1169    class DictionaryCache  {
1170
1171         void reset() {
1172             fPositionInCache = -1;
1173             fStart = 0;
1174             fLimit = 0;
1175             fFirstRuleStatusIndex = 0;
1176             fOtherRuleStatusIndex = 0;
1177             fBreaks.removeAllElements();
1178         };
1179
1180         boolean following(int fromPos) {
1181             if (fromPos >= fLimit || fromPos < fStart) {
1182                 fPositionInCache = -1;
1183                 return false;
1184             }
1185
1186             // Sequential iteration, move from previous boundary to the following
1187
1188             int r = 0;
1189             if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1190                 ++fPositionInCache;
1191                 if (fPositionInCache >= fBreaks.size()) {
1192                     fPositionInCache = -1;
1193                     return false;
1194                 }
1195                 r = fBreaks.elementAt(fPositionInCache);
1196                 assert(r > fromPos);
1197                 fBoundary = r;
1198                 fStatusIndex = fOtherRuleStatusIndex;
1199                 return true;
1200             }
1201
1202             // Random indexing. Linear search for the boundary following the given position.
1203
1204             for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
1205                 r= fBreaks.elementAt(fPositionInCache);
1206                 if (r > fromPos) {
1207                     fBoundary = r;
1208                     fStatusIndex = fOtherRuleStatusIndex;
1209                     return true;
1210                 }
1211             }
1212
1213             // Internal error. fStart <= fromPos < fLimit, but no cached boundary.
1214             assert(false);
1215             fPositionInCache = -1;
1216             return false;
1217         };
1218
1219         boolean preceding(int fromPos) {
1220             if (fromPos <= fStart || fromPos > fLimit) {
1221                 fPositionInCache = -1;
1222                 return false;
1223             }
1224
1225             if (fromPos == fLimit) {
1226                 fPositionInCache = fBreaks.size() - 1;
1227                 if (fPositionInCache >= 0) {
1228                     assert(fBreaks.elementAt(fPositionInCache) == fromPos);
1229                 }
1230             }
1231
1232             int r;
1233             if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1234                 --fPositionInCache;
1235                 r = fBreaks.elementAt(fPositionInCache);
1236                 assert(r < fromPos);
1237                 fBoundary = r;
1238                 fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1239                 return true;
1240             }
1241
1242             if (fPositionInCache == 0) {
1243                 fPositionInCache = -1;
1244                 return false;
1245             }
1246
1247             for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
1248                 r = fBreaks.elementAt(fPositionInCache);
1249                 if (r < fromPos) {
1250                     fBoundary = r;
1251                     fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1252                     return true;
1253                 }
1254             }
1255             assert(false);
1256             fPositionInCache = -1;
1257             return false;
1258         };
1259
1260        /**
1261         * Populate the cache with the dictionary based boundaries within a region of text.
1262         * @param startPos  The start position of a range of text
1263         * @param endPos    The end position of a range of text
1264         * @param firstRuleStatus The rule status index that applies to the break at startPos
1265         * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
1266         * @internal
1267         */
1268        void populateDictionary(int startPos, int endPos,
1269                                int firstRuleStatus, int otherRuleStatus) {
1270            if ((endPos - startPos) <= 1) {
1271                return;
1272            }
1273
1274            reset();
1275            fFirstRuleStatusIndex = firstRuleStatus;
1276            fOtherRuleStatusIndex = otherRuleStatus;
1277
1278            int rangeStart = startPos;
1279            int rangeEnd = endPos;
1280
1281            int         category;
1282            int         current;
1283            int         foundBreakCount = 0;
1284
1285            // Loop through the text, looking for ranges of dictionary characters.
1286            // For each span, find the appropriate break engine, and ask it to find
1287            // any breaks within the span.
1288
1289            fText.setIndex(rangeStart);
1290            int     c = CharacterIteration.current32(fText);
1291            category = (short)fRData.fTrie.get(c);
1292
1293            while(true) {
1294                while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) {
1295                    c = CharacterIteration.next32(fText);    // pre-increment
1296                    category = (short)fRData.fTrie.get(c);
1297                }
1298                if (current >= rangeEnd) {
1299                    break;
1300                }
1301
1302                // We now have a dictionary character. Get the appropriate language object
1303                // to deal with it.
1304                LanguageBreakEngine lbe = getLanguageBreakEngine(c);
1305
1306                // Ask the language object if there are any breaks. It will add them to the cache and
1307                // leave the text pointer on the other side of its range, ready to search for the next one.
1308                if (lbe != null) {
1309                    foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreakType, fBreaks);
1310                }
1311
1312                // Reload the loop variables for the next go-round
1313                c = CharacterIteration.current32(fText);
1314                category = (short)fRData.fTrie.get(c);
1315            }
1316
1317            // If we found breaks, ensure that the first and last entries are
1318            // the original starting and ending position. And initialize the
1319            // cache iteration position to the first entry.
1320
1321            // System.out.printf("foundBreakCount = %d%n", foundBreakCount);
1322            if (foundBreakCount > 0) {
1323                assert(foundBreakCount == fBreaks.size());
1324                if (startPos < fBreaks.elementAt(0)) {
1325                    // The dictionary did not place a boundary at the start of the segment of text.
1326                    // Add one now. This should not commonly happen, but it would be easy for interactions
1327                    // of the rules for dictionary segments and the break engine implementations to
1328                    // inadvertently cause it. Cover it here, just in case.
1329                    fBreaks.offer(startPos);
1330                }
1331                if (endPos > fBreaks.peek()) {
1332                    fBreaks.push(endPos);
1333                }
1334                fPositionInCache = 0;
1335                // Note: Dictionary matching may extend beyond the original limit.
1336                fStart = fBreaks.elementAt(0);
1337                fLimit = fBreaks.peek();
1338            } else {
1339                // there were no language-based breaks, even though the segment contained
1340                // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
1341                // for this range will fail, and the calling code will fall back to the rule based boundaries.
1342            }
1343
1344        };
1345
1346
1347        DictionaryCache() {
1348            fPositionInCache = -1;
1349            fBreaks = new DictionaryBreakEngine.DequeI();
1350        }
1351
1352        /**
1353         * copy constructor. Used by RuleBasedBreakIterator.clone().
1354         *
1355         * @param src the source object to be copied.
1356         */
1357        DictionaryCache(DictionaryCache src)  {
1358            try {
1359                fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone();
1360            }
1361            catch (CloneNotSupportedException e) {
1362                throw new RuntimeException(e);
1363            }
1364            fPositionInCache      = src.fPositionInCache;
1365            fStart                = src.fStart;
1366            fLimit                = src.fLimit;
1367            fFirstRuleStatusIndex = src.fFirstRuleStatusIndex;
1368            fOtherRuleStatusIndex = src.fOtherRuleStatusIndex;
1369            fBoundary             = src.fBoundary;
1370            fStatusIndex          = src.fStatusIndex;
1371        }
1372
1373        // A data structure containing the boundaries themselves. Essentially a vector of raw ints.
1374        DictionaryBreakEngine.DequeI fBreaks;
1375        int             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
1376        //                                      //    or preceding(). Optimizes sequential access.
1377        int             fStart;                 // Text position of first boundary in cache.
1378        int             fLimit;                 // Last boundary in cache. Which is the limit of the
1379        //                                      //    text segment being handled by the dictionary.
1380        int             fFirstRuleStatusIndex;  // Rule status info for first boundary.
1381        int             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.
1382        int             fBoundary;              // Current boundary. Set by preceding(), following().
1383        int             fStatusIndex;           // Current rule status index. Set by preceding, following().
1384    };
1385
1386
1387
1388
1389/*
1390 * class BreakCache
1391 *
1392 * Cache of break boundary positions and rule status values.
1393 * Break iterator API functions, next(), previous(), etc., will use cached results
1394 * when possible, and otherwise cache new results as they are obtained.
1395 *
1396 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
1397 *
1398 * The cache is implemented as a single circular buffer.
1399 */
1400
1401/*
1402 * size of the circular cache buffer.
1403 */
1404
1405class BreakCache {
1406
1407    BreakCache() {
1408        reset();
1409    };
1410
1411    void  reset(int pos, int ruleStatus) {
1412        fStartBufIdx = 0;
1413        fEndBufIdx = 0;
1414        fTextIdx = pos;
1415        fBufIdx = 0;
1416        fBoundaries[0] = pos;
1417        fStatuses[0] = (short)ruleStatus;
1418    }
1419
1420    void  reset() {reset(0, 0); };
1421
1422    void  next() {
1423        if (fBufIdx == fEndBufIdx) {
1424            fDone = !populateFollowing();
1425            fPosition = fTextIdx;
1426            fRuleStatusIndex = fStatuses[fBufIdx];
1427        } else {
1428            fBufIdx = modChunkSize(fBufIdx + 1);
1429            fTextIdx = fPosition = fBoundaries[fBufIdx];
1430            fRuleStatusIndex = fStatuses[fBufIdx];
1431        }
1432    };
1433
1434    void  previous() {
1435        int initialBufIdx = fBufIdx;
1436        if (fBufIdx == fStartBufIdx) {
1437            // At start of cache. Prepend to it.
1438            populatePreceding();
1439        } else {
1440            // Cache already holds the next boundary
1441            fBufIdx = modChunkSize(fBufIdx - 1);
1442            fTextIdx = fBoundaries[fBufIdx];
1443        }
1444        fDone = (fBufIdx == initialBufIdx);
1445        fPosition = fTextIdx;
1446        fRuleStatusIndex = fStatuses[fBufIdx];
1447        return;
1448    };
1449
1450    // Move the iteration state to the position following the startPosition.
1451    // Input position must be pinned to the input length.
1452    void following(int startPos) {
1453        if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1454            // startPos is in the cache. Do a next() from that position.
1455            // TODO: an awkward set of interactions with bi->fDone
1456            //       seek() does not clear it; it can't because of interactions with populateNear().
1457            //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
1458            //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
1459            fDone = false;
1460            next();
1461        }
1462
1463    };
1464
1465    void  preceding(int startPos) {
1466        if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1467            if (startPos == fTextIdx) {
1468                previous();
1469            } else {
1470                // seek() leaves the BreakCache positioned at the preceding boundary
1471                //        if the requested position is between two bounaries.
1472                // current() pushes the BreakCache position out to the BreakIterator itself.
1473                assert(startPos > fTextIdx);
1474                current();
1475            }
1476        }
1477        return;
1478    };
1479
1480    /*
1481     * Update the state of the public BreakIterator (fBI) to reflect the
1482     * current state of the break iterator cache (this).
1483     */
1484    int current() {
1485        fPosition = fTextIdx;
1486        fRuleStatusIndex = fStatuses[fBufIdx];
1487        fDone = false;
1488        return fTextIdx;
1489    };
1490
1491    /**
1492     * Add boundaries to the cache near the specified position.
1493     * The given position need not be a boundary itself.
1494     * The input position must be within the range of the text, and
1495     * on a code point boundary.
1496     * If the requested position is a break boundary, leave the iteration
1497     * position on it.
1498     * If the requested position is not a boundary, leave the iteration
1499     * position on the preceding boundary and include both the the
1500     * preceding and following boundaries in the cache.
1501     * Additional boundaries, either preceding or following, may be added
1502     * to the cache as a side effect.
1503     *
1504     * Return false if the operation failed.
1505     */
1506    boolean populateNear(int position) {
1507        assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
1508
1509        // Find a boundary somewhere in the vicinity of the requested position.
1510        // Depending on the safe rules and the text data, it could be either before, at, or after
1511        // the requested position.
1512
1513
1514        // If the requested position is not near already cached positions, clear the existing cache,
1515        // find a near-by boundary and begin new cache contents there.
1516
1517        if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
1518            int aBoundary = fText.getBeginIndex();
1519            int ruleStatusIndex = 0;
1520            // TODO: check for position == length of text. Although may still need to back up to get rule status.
1521            if (position > aBoundary + 20) {
1522                int backupPos = handlePrevious(position);
1523                fPosition = backupPos;
1524                aBoundary = handleNext();                // Ignore dictionary, just finding a rule based boundary.
1525                ruleStatusIndex = fRuleStatusIndex;
1526            }
1527            reset(aBoundary, ruleStatusIndex);               // Reset cache to hold aBoundary as a single starting point.
1528        }
1529
1530        // Fill in boundaries between existing cache content and the new requested position.
1531
1532        if (fBoundaries[fEndBufIdx] < position) {
1533            // The last position in the cache precedes the requested position.
1534            // Add following position(s) to the cache.
1535            while (fBoundaries[fEndBufIdx] < position) {
1536                if (!populateFollowing()) {
1537                    assert false;
1538                    return false;
1539                }
1540            }
1541            fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
1542            fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
1543            while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
1544                previous();
1545            }
1546            return true;
1547        }
1548
1549        if (fBoundaries[fStartBufIdx] > position) {
1550            // The first position in the cache is beyond the requested position.
1551            // back up more until we get a boundary <= the requested position.
1552            while (fBoundaries[fStartBufIdx] > position) {
1553                populatePreceding();
1554            }
1555            fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
1556            fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
1557            while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
1558                next();
1559            }
1560            if (fTextIdx > position) {
1561                // If position is not itself a boundary, the next() loop above will overshoot.
1562                // Back up one, leaving cache position at the boundary preceding the requested position.
1563                previous();
1564            }
1565            return true;
1566        }
1567
1568        assert fTextIdx == position;
1569        return true;
1570
1571    };
1572
1573    /**
1574     *  Add boundary(s) to the cache following the current last boundary.
1575     *  Return false if at the end of the text, and no more boundaries can be added.
1576     *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
1577     */
1578    boolean populateFollowing() {
1579        int fromPosition = fBoundaries[fEndBufIdx];
1580        int fromRuleStatusIdx = fStatuses[fEndBufIdx];
1581        int pos = 0;
1582        int ruleStatusIdx = 0;
1583
1584        if (fDictionaryCache.following(fromPosition)) {
1585            addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1586            return true;
1587        }
1588
1589        fPosition = fromPosition;
1590        pos = handleNext();
1591        if (pos == BreakIterator.DONE) {
1592            return false;
1593        }
1594
1595        ruleStatusIdx = fRuleStatusIndex;
1596        if (fDictionaryCharCount > 0) {
1597            // The text segment obtained from the rules includes dictionary characters.
1598            // Subdivide it, with subdivided results going into the dictionary cache.
1599            fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
1600            if (fDictionaryCache.following(fromPosition)) {
1601                addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1602                return true;
1603                // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point.
1604                //       But be careful with interactions with populateNear().
1605            }
1606        }
1607
1608        // Rule based segment did not include dictionary characters.
1609        // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
1610        //    meaning that we didn't take the return, above.
1611        // Add its end point to the cache.
1612        addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
1613
1614        // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
1615        //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
1616        //
1617        for (int count=0; count<6; ++count) {
1618            pos = handleNext();
1619            if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) {
1620                break;
1621            }
1622            addFollowing(pos, fRuleStatusIndex, RetainCachePosition);
1623        }
1624        return true;
1625    };
1626
1627    /**
1628     *  Add one or more boundaries to the cache preceding the first currently cached boundary.
1629     *  Leave the iteration position on the first added boundary.
1630     *  Return false if no boundaries could be added (if at the start of the text.)
1631     */
1632    boolean populatePreceding() {
1633        int textBegin = fText.getBeginIndex();
1634        int fromPosition = fBoundaries[fStartBufIdx];
1635        if (fromPosition == textBegin) {
1636            return false;
1637        }
1638
1639        int position = textBegin;
1640        int positionStatusIdx = 0;
1641
1642        if (fDictionaryCache.preceding(fromPosition)) {
1643            addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1644            return true;
1645        }
1646
1647        int backupPosition = fromPosition;
1648
1649        // Find a boundary somewhere preceding the first already-cached boundary
1650        do {
1651            backupPosition = backupPosition - 30;
1652            if (backupPosition <= textBegin) {
1653                backupPosition = textBegin;
1654            } else {
1655                backupPosition = handlePrevious(backupPosition);
1656            }
1657            if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
1658                position = textBegin;
1659                positionStatusIdx = 0;
1660            } else {
1661                fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
1662                position = handleNext();
1663                positionStatusIdx = fRuleStatusIndex;
1664
1665            }
1666        } while (position >= fromPosition);
1667
1668        // Find boundaries between the one we just located and the first already-cached boundary
1669        // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer..
1670
1671        fSideBuffer.removeAllElements();
1672        fSideBuffer.push(position);
1673        fSideBuffer.push(positionStatusIdx);
1674
1675        do {
1676            int prevPosition = fPosition = position;
1677            int prevStatusIdx = positionStatusIdx;
1678            position = handleNext();
1679            positionStatusIdx = fRuleStatusIndex;
1680            if (position == BreakIterator.DONE) {
1681                break;
1682            }
1683
1684            boolean segmentHandledByDictionary = false;
1685            if (fDictionaryCharCount != 0) {
1686                // Segment from the rules includes dictionary characters.
1687                // Subdivide it, with subdivided results going into the dictionary cache.
1688                int dictSegEndPosition = position;
1689                fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
1690                while (fDictionaryCache.following(prevPosition)) {
1691                    position = fDictionaryCache.fBoundary;
1692                    positionStatusIdx = fDictionaryCache.fStatusIndex;
1693                    segmentHandledByDictionary = true;
1694                    assert(position > prevPosition);
1695                    if (position >= fromPosition) {
1696                        break;
1697                    }
1698                    assert(position <= dictSegEndPosition);
1699                    fSideBuffer.push(position);
1700                    fSideBuffer.push(positionStatusIdx);
1701                    prevPosition = position;
1702                }
1703                assert(position==dictSegEndPosition || position>=fromPosition);
1704            }
1705
1706            if (!segmentHandledByDictionary && position < fromPosition) {
1707                fSideBuffer.push(position);
1708                fSideBuffer.push(positionStatusIdx);
1709            }
1710        } while (position < fromPosition);
1711
1712        // Move boundaries from the side buffer to the main circular buffer.
1713        boolean success = false;
1714        if (!fSideBuffer.isEmpty()) {
1715            positionStatusIdx = fSideBuffer.pop();
1716            position = fSideBuffer.pop();
1717            addPreceding(position, positionStatusIdx, UpdateCachePosition);
1718            success = true;
1719        }
1720
1721        while (!fSideBuffer.isEmpty()) {
1722            positionStatusIdx = fSideBuffer.pop();
1723            position = fSideBuffer.pop();
1724            if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
1725                // No space in circular buffer to hold a new preceding result while
1726                // also retaining the current cache (iteration) position.
1727                // Bailing out is safe; the cache will refill again if needed.
1728                break;
1729            }
1730        }
1731        return success;
1732    };
1733
1734
1735    static final boolean RetainCachePosition = false;
1736    static final boolean UpdateCachePosition = true;
1737
1738    /*
1739     * Add the boundary following the current position.
1740     * The current position can be left as it was, or changed to the newly added boundary,
1741     * as specified by the update parameter.
1742     */
1743    void addFollowing(int position, int ruleStatusIdx, boolean update) {
1744        assert(position > fBoundaries[fEndBufIdx]);
1745        assert(ruleStatusIdx <= Short.MAX_VALUE);
1746        int nextIdx = modChunkSize(fEndBufIdx + 1);
1747        if (nextIdx == fStartBufIdx) {
1748            fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
1749        }
1750        fBoundaries[nextIdx] = position;
1751        fStatuses[nextIdx] = (short)ruleStatusIdx;
1752        fEndBufIdx = nextIdx;
1753        if (update == UpdateCachePosition) {
1754            // Set current position to the newly added boundary.
1755            fBufIdx = nextIdx;
1756            fTextIdx = position;
1757        } else {
1758            // Retaining the original cache position.
1759            // Check if the added boundary wraps around the buffer, and would over-write the original position.
1760            // It's the responsibility of callers of this function to not add too many.
1761            assert(nextIdx != fBufIdx);
1762        }
1763
1764    };
1765
1766
1767    /*
1768     * Add the boundary preceding the current position.
1769     * The current position can be left as it was, or changed to the newly added boundary,
1770     * as specified by the update parameter.
1771     */
1772    boolean addPreceding(int position, int ruleStatusIdx, boolean update) {
1773        assert(position < fBoundaries[fStartBufIdx]);
1774        assert(ruleStatusIdx <= Short.MAX_VALUE);
1775        int nextIdx = modChunkSize(fStartBufIdx - 1);
1776        if (nextIdx == fEndBufIdx) {
1777            if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
1778                // Failure. The insertion of the new boundary would claim the buffer position that is the
1779                // current iteration position. And we also want to retain the current iteration position.
1780                // (The buffer is already completely full of entries that precede the iteration position.)
1781                return false;
1782            }
1783            fEndBufIdx = modChunkSize(fEndBufIdx - 1);
1784        }
1785        fBoundaries[nextIdx] = position;
1786        fStatuses[nextIdx] = (short)ruleStatusIdx;
1787        fStartBufIdx = nextIdx;
1788        if (update == UpdateCachePosition) {
1789            fBufIdx = nextIdx;
1790            fTextIdx = position;
1791        }
1792        return true;
1793    };
1794
1795    /**
1796     *  Set the cache position to the specified position, or, if the position
1797     *  falls between to cached boundaries, to the preceding boundary.
1798     *  Fails if the requested position is outside of the range of boundaries currently held by the cache.
1799     *  The startPosition must be on a code point boundary.
1800     *
1801     *  Return true if successful, false if the specified position is after
1802     *  the last cached boundary or before the first.
1803     */
1804    boolean seek(int pos) {
1805        if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
1806            return false;
1807        }
1808        if (pos == fBoundaries[fStartBufIdx]) {
1809            // Common case: seek(0), from BreakIterator::first()
1810            fBufIdx = fStartBufIdx;
1811            fTextIdx = fBoundaries[fBufIdx];
1812            return true;
1813        }
1814        if (pos == fBoundaries[fEndBufIdx]) {
1815            fBufIdx = fEndBufIdx;
1816            fTextIdx = fBoundaries[fBufIdx];
1817            return true;
1818        }
1819
1820        int min = fStartBufIdx;
1821        int max = fEndBufIdx;
1822        while (min != max) {
1823            int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
1824            probe = modChunkSize(probe);
1825            if (fBoundaries[probe] > pos) {
1826                max = probe;
1827            } else {
1828                min = modChunkSize(probe + 1);
1829            }
1830        }
1831        assert(fBoundaries[max] > pos);
1832        fBufIdx = modChunkSize(max - 1);
1833        fTextIdx = fBoundaries[fBufIdx];
1834        assert(fTextIdx <= pos);
1835        return true;
1836
1837    };
1838
1839
1840    /**
1841     * copy constructor, used from RuleBasedBreakIterator.clone().
1842     *
1843     * @param src
1844     */
1845    BreakCache(BreakCache src)  {
1846        fStartBufIdx = src.fStartBufIdx;
1847        fEndBufIdx = src.fEndBufIdx;
1848        fTextIdx = src.fTextIdx;
1849        fBufIdx = src.fBufIdx;
1850        fBoundaries = src.fBoundaries.clone();
1851        fStatuses = src.fStatuses.clone();
1852        fSideBuffer = new DictionaryBreakEngine.DequeI();  // Transient, no need to clone contents.
1853    }
1854
1855    void dumpCache() {
1856        System.out.printf("fTextIdx:%d   fBufIdx:%d%n", fTextIdx, fBufIdx);
1857        for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) {
1858            System.out.printf("%d  %d%n", i, fBoundaries[i]);
1859            if (i == fEndBufIdx) {
1860                break;
1861            }
1862        }
1863    };
1864
1865    private final int   modChunkSize(int index) { return index & (CACHE_SIZE - 1); };
1866
1867    static final int CACHE_SIZE = 128;
1868    // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
1869
1870    int                 fStartBufIdx;
1871    int                 fEndBufIdx;    // inclusive
1872
1873    int                 fTextIdx;
1874    int                 fBufIdx;
1875
1876    int[]               fBoundaries = new int[CACHE_SIZE];
1877    short[]             fStatuses = new short[CACHE_SIZE];
1878
1879    DictionaryBreakEngine.DequeI   fSideBuffer = new DictionaryBreakEngine.DequeI();
1880};
1881
1882
1883
1884
1885}
1886
1887