1ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert// Copyright (C) 2016 and later: Unicode, Inc. and others. 2ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html 3ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 4ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert// file: rbbi_cache.cpp 5ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 6ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert#include "unicode/utypes.h" 7ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 8ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert#if !UCONFIG_NO_BREAK_ITERATION 9ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 10ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert#include "unicode/ubrk.h" 11ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert#include "unicode/rbbi.h" 12ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 13ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert#include "rbbi_cache.h" 14ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 15ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert#include "brkeng.h" 16ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert#include "cmemory.h" 17ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert#include "rbbidata.h" 18ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert#include "rbbirb.h" 19ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert#include "uassert.h" 20ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert#include "uvectr32.h" 21ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 22ffdc27edd5503111189fc11165c5a11289a71f79Fredrik RoubertU_NAMESPACE_BEGIN 23ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 24ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert/* 25ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert * DictionaryCache implementation 26ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert */ 27ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 28ffdc27edd5503111189fc11165c5a11289a71f79Fredrik RoubertRuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) : 29ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI(bi), fBreaks(NULL), fPositionInCache(-1), 30ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) { 31ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBreaks = new UVector32(status); 32ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 33ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 34ffdc27edd5503111189fc11165c5a11289a71f79Fredrik RoubertRuleBasedBreakIterator::DictionaryCache::~DictionaryCache() { 35ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert delete fBreaks; 36ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBreaks = NULL; 37ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 38ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 39ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubertvoid RuleBasedBreakIterator::DictionaryCache::reset() { 40ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fPositionInCache = -1; 41ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fStart = 0; 42ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fLimit = 0; 43ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fFirstRuleStatusIndex = 0; 44ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fOtherRuleStatusIndex = 0; 45ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBreaks->removeAllElements(); 46ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 47ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 48ffdc27edd5503111189fc11165c5a11289a71f79Fredrik RoubertUBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) { 49ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fromPos >= fLimit || fromPos < fStart) { 50ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fPositionInCache = -1; 51ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return FALSE; 52ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 53ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 54ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Sequential iteration, move from previous boundary to the following 55ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 56ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t r = 0; 57ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fPositionInCache >= 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) { 58ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert ++fPositionInCache; 59ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fPositionInCache >= fBreaks->size()) { 60ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fPositionInCache = -1; 61ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return FALSE; 62ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 63ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert r = fBreaks->elementAti(fPositionInCache); 64ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(r > fromPos); 65ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert *result = r; 66ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert *statusIndex = fOtherRuleStatusIndex; 67ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return TRUE; 68ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 69ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 70ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Random indexing. Linear search for the boundary following the given position. 71ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 72ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert for (fPositionInCache = 0; fPositionInCache < fBreaks->size(); ++fPositionInCache) { 73ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert r= fBreaks->elementAti(fPositionInCache); 74ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (r > fromPos) { 75ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert *result = r; 76ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert *statusIndex = fOtherRuleStatusIndex; 77ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return TRUE; 78ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 79ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 80ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(FALSE); 81ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fPositionInCache = -1; 82ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return FALSE; 83ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 84ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 85ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 86ffdc27edd5503111189fc11165c5a11289a71f79Fredrik RoubertUBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) { 87ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fromPos <= fStart || fromPos > fLimit) { 88ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fPositionInCache = -1; 89ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return FALSE; 90ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 91ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 92ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fromPos == fLimit) { 93ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fPositionInCache = fBreaks->size() - 1; 94ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fPositionInCache >= 0) { 95ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(fBreaks->elementAti(fPositionInCache) == fromPos); 96ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 97ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 98ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 99ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t r; 100ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fPositionInCache > 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) { 101ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert --fPositionInCache; 102ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert r = fBreaks->elementAti(fPositionInCache); 103ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(r < fromPos); 104ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert *result = r; 105ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 106ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return TRUE; 107ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 108ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 109ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fPositionInCache == 0) { 110ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fPositionInCache = -1; 111ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return FALSE; 112ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 113ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 114ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert for (fPositionInCache = fBreaks->size()-1; fPositionInCache >= 0; --fPositionInCache) { 115ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert r = fBreaks->elementAti(fPositionInCache); 116ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (r < fromPos) { 117ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert *result = r; 118ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert *statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 119ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return TRUE; 120ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 121ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 122ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(FALSE); 123ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fPositionInCache = -1; 124ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return FALSE; 125ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 126ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 127ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubertvoid RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos, 128ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t firstRuleStatus, int32_t otherRuleStatus) { 129ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if ((endPos - startPos) <= 1) { 130ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return; 131ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 132ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 133ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert reset(); 134ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fFirstRuleStatusIndex = firstRuleStatus; 135ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fOtherRuleStatusIndex = otherRuleStatus; 136ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 137ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t rangeStart = startPos; 138ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t rangeEnd = endPos; 139ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 140ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert uint16_t category; 141ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t current; 142ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert UErrorCode status = U_ZERO_ERROR; 143ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t foundBreakCount = 0; 144ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert UText *text = fBI->fText; 145ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 146ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Loop through the text, looking for ranges of dictionary characters. 147ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // For each span, find the appropriate break engine, and ask it to find 148ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // any breaks within the span. 149ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 150ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert utext_setNativeIndex(text, rangeStart); 151ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert UChar32 c = utext_current32(text); 152ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert category = UTRIE2_GET16(fBI->fData->fTrie, c); 153ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 154ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert while(U_SUCCESS(status)) { 155ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & 0x4000) == 0) { 156ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert utext_next32(text); // TODO: cleaner loop structure. 157ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert c = utext_current32(text); 158ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert category = UTRIE2_GET16(fBI->fData->fTrie, c); 159ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 160ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (current >= rangeEnd) { 161ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert break; 162ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 163ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 164ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // We now have a dictionary character. Get the appropriate language object 165ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // to deal with it. 166ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c); 167ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 168ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Ask the language object if there are any breaks. It will add them to the cache and 169ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // leave the text pointer on the other side of its range, ready to search for the next one. 170ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (lbe != NULL) { 171ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, *fBreaks); 172ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 173ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 174ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Reload the loop variables for the next go-round 175ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert c = utext_current32(text); 176ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert category = UTRIE2_GET16(fBI->fData->fTrie, c); 177ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 178ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 179ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // If we found breaks, ensure that the first and last entries are 180ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // the original starting and ending position. And initialize the 181ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // cache iteration position to the first entry. 182ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 183ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // printf("foundBreakCount = %d\n", foundBreakCount); 184ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (foundBreakCount > 0) { 185ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(foundBreakCount == fBreaks->size()); 186ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (startPos < fBreaks->elementAti(0)) { 187ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // The dictionary did not place a boundary at the start of the segment of text. 188ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Add one now. This should not commonly happen, but it would be easy for interactions 189ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // of the rules for dictionary segments and the break engine implementations to 190ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // inadvertently cause it. Cover it here, just in case. 191ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBreaks->insertElementAt(startPos, 0, status); 192ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 193ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (endPos > fBreaks->peeki()) { 194ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBreaks->push(endPos, status); 195ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 196ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fPositionInCache = 0; 197ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Note: Dictionary matching may extend beyond the original limit. 198ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fStart = fBreaks->elementAti(0); 199ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fLimit = fBreaks->peeki(); 200ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } else { 201ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // there were no language-based breaks, even though the segment contained 202ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache 203ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // for this range will fail, and the calling code will fall back to the rule based boundaries. 204ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 205ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 206ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 207ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 208ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert/* 209ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert * BreakCache implemetation 210ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert */ 211ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 212ffdc27edd5503111189fc11165c5a11289a71f79Fredrik RoubertRuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) : 213ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI(bi), fSideBuffer(status) { 214ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert reset(); 215ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 216ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 217ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 218ffdc27edd5503111189fc11165c5a11289a71f79Fredrik RoubertRuleBasedBreakIterator::BreakCache::~BreakCache() { 219ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 220ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 221ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 222ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubertvoid RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) { 223ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fStartBufIdx = 0; 224ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fEndBufIdx = 0; 225ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fTextIdx = pos; 226ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBufIdx = 0; 227ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBoundaries[0] = pos; 228ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fStatuses[0] = (uint16_t)ruleStatus; 229ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 230ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 231ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 232ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubertint32_t RuleBasedBreakIterator::BreakCache::current() { 233ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI->fPosition = fTextIdx; 234ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI->fRuleStatusIndex = fStatuses[fBufIdx]; 235ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI->fDone = FALSE; 236ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return fTextIdx; 237ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 238ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 239ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 240ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubertvoid RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) { 241ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (U_FAILURE(status)) { 242ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return; 243ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 244ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { 245ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // startPos is in the cache. Do a next() from that position. 246ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // TODO: an awkward set of interactions with bi->fDone 247ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // seek() does not clear it; it can't because of interactions with populateNear(). 248ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // next() does not clear it in the fast-path case, where everything matters. Maybe it should. 249ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. 250ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI->fDone = false; 251ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert next(); 252ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 253ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return; 254ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 255ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 256ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 257ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubertvoid RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) { 258ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (U_FAILURE(status)) { 259ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return; 260ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 261ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { 262ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (startPos == fTextIdx) { 263ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert previous(status); 264ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } else { 265ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // seek() leaves the BreakCache positioned at the preceding boundary 266ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // if the requested position is between two bounaries. 267ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // current() pushes the BreakCache position out to the BreakIterator itself. 268ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(startPos > fTextIdx); 269ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert current(); 270ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 271ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 272ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return; 273ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 274ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 275ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 276ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert/* 277ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert * Out-of-line code for BreakCache::next(). 278ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert * Cache does not already contain the boundary 279ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert */ 280ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubertvoid RuleBasedBreakIterator::BreakCache::nextOL() { 281ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI->fDone = !populateFollowing(); 282ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI->fPosition = fTextIdx; 283ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI->fRuleStatusIndex = fStatuses[fBufIdx]; 284ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return; 285ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 286ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 287ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 288ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubertvoid RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) { 289ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (U_FAILURE(status)) { 290ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return; 291ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 292ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t initialBufIdx = fBufIdx; 293ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fBufIdx == fStartBufIdx) { 294ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // At start of cache. Prepend to it. 295ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert populatePreceding(status); 296ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } else { 297ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Cache already holds the next boundary 298ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBufIdx = modChunkSize(fBufIdx - 1); 299ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fTextIdx = fBoundaries[fBufIdx]; 300ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 301ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI->fDone = (fBufIdx == initialBufIdx); 302ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI->fPosition = fTextIdx; 303ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI->fRuleStatusIndex = fStatuses[fBufIdx]; 304ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return; 305ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 306ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 307ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 308ffdc27edd5503111189fc11165c5a11289a71f79Fredrik RoubertUBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) { 309ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { 310ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return FALSE; 311ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 312ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (pos == fBoundaries[fStartBufIdx]) { 313ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Common case: seek(0), from BreakIterator::first() 314ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBufIdx = fStartBufIdx; 315ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fTextIdx = fBoundaries[fBufIdx]; 316ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return TRUE; 317ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 318ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (pos == fBoundaries[fEndBufIdx]) { 319ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBufIdx = fEndBufIdx; 320ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fTextIdx = fBoundaries[fBufIdx]; 321ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return TRUE; 322ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 323ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 324ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t min = fStartBufIdx; 325ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t max = fEndBufIdx; 326ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert while (min != max) { 327ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; 328ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert probe = modChunkSize(probe); 329ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fBoundaries[probe] > pos) { 330ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert max = probe; 331ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } else { 332ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert min = modChunkSize(probe + 1); 333ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 334ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 335ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(fBoundaries[max] > pos); 336ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBufIdx = modChunkSize(max - 1); 337ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fTextIdx = fBoundaries[fBufIdx]; 338ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(fTextIdx <= pos); 339ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return TRUE; 340ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 341ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 342ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 343ffdc27edd5503111189fc11165c5a11289a71f79Fredrik RoubertUBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) { 344ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (U_FAILURE(status)) { 345ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return FALSE; 346ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 347ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); 348ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 349ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Find a boundary somewhere in the vicinity of the requested position. 350ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Depending on the safe rules and the text data, it could be either before, at, or after 351ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // the requested position. 352ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 353ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 354ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // If the requested position is not near already cached positions, clear the existing cache, 355ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // find a near-by boundary and begin new cache contents there. 356ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 357ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { 358ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t aBoundary = 0; 359ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t ruleStatusIndex = 0; 360ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // TODO: check for position == length of text. Although may still need to back up to get rule status. 361ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (position > 20) { 362ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t backupPos = fBI->handlePrevious(position); 363ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI->fPosition = backupPos; 364ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert aBoundary = fBI->handleNext(); // Ignore dictionary, just finding a rule based boundary. 365ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert ruleStatusIndex = fBI->fRuleStatusIndex; 366ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 367ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. 368ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 369ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 370ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Fill in boundaries between existing cache content and the new requested position. 371ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 372ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fBoundaries[fEndBufIdx] < position) { 373ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // The last position in the cache precedes the requested position. 374ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Add following position(s) to the cache. 375ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert while (fBoundaries[fEndBufIdx] < position) { 376ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (!populateFollowing()) { 377ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(false); 378ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return false; 379ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 380ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 381ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. 382ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. 383ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. 384ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert previous(status); 385ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 386ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return true; 387ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 388ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 389ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fBoundaries[fStartBufIdx] > position) { 390ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // The first position in the cache is beyond the requested position. 391ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // back up more until we get a boundary <= the requested position. 392ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert while (fBoundaries[fStartBufIdx] > position) { 393ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert populatePreceding(status); 394ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 395ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. 396ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. 397ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. 398ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert next(); 399ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 400ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fTextIdx > position) { 401ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // If position is not itself a boundary, the next() loop above will overshoot. 402ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Back up one, leaving cache position at the boundary preceding the requested position. 403ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert previous(status); 404ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 405ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return true; 406ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 407ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 408ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(fTextIdx == position); 409ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return true; 410ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 411ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 412ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 413ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 414ffdc27edd5503111189fc11165c5a11289a71f79Fredrik RoubertUBool RuleBasedBreakIterator::BreakCache::populateFollowing() { 415ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t fromPosition = fBoundaries[fEndBufIdx]; 416ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx]; 417ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t pos = 0; 418ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t ruleStatusIdx = 0; 419ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 420ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { 421ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert addFollowing(pos, ruleStatusIdx, UpdateCachePosition); 422ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return TRUE; 423ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 424ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 425ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI->fPosition = fromPosition; 426ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert pos = fBI->handleNext(); 427ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (pos == UBRK_DONE) { 428ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return FALSE; 429ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 430ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 431ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert ruleStatusIdx = fBI->fRuleStatusIndex; 432ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fBI->fDictionaryCharCount > 0) { 433ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // The text segment obtained from the rules includes dictionary characters. 434ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Subdivide it, with subdivided results going into the dictionary cache. 435ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); 436ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { 437ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert addFollowing(pos, ruleStatusIdx, UpdateCachePosition); 438ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return TRUE; 439ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point. 440ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // But be careful with interactions with populateNear(). 441ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 442ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 443ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 444ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Rule based segment did not include dictionary characters. 445ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, 446ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // meaning that we didn't take the return, above. 447ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Add its end point to the cache. 448ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert addFollowing(pos, ruleStatusIdx, UpdateCachePosition); 449ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 450ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. 451ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. 452ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // 453ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert for (int count=0; count<6; ++count) { 454ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert pos = fBI->handleNext(); 455ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) { 456ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert break; 457ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 458ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition); 459ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 460ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 461ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return TRUE; 462ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 463ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 464ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 465ffdc27edd5503111189fc11165c5a11289a71f79Fredrik RoubertUBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) { 466ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (U_FAILURE(status)) { 467ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return FALSE; 468ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 469ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 470ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t fromPosition = fBoundaries[fStartBufIdx]; 471ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fromPosition == 0) { 472ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return FALSE; 473ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 474ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 475ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t position = 0; 476ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t positionStatusIdx = 0; 477ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 478ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) { 479ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert addPreceding(position, positionStatusIdx, UpdateCachePosition); 480ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return TRUE; 481ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 482ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 483ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t backupPosition = fromPosition; 484ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 485ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Find a boundary somewhere preceding the first already-cached boundary 486ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert do { 487ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert backupPosition = backupPosition - 30; 488ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (backupPosition <= 0) { 489ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert backupPosition = 0; 490ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } else { 491ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert backupPosition = fBI->handlePrevious(backupPosition); 492ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 493ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (backupPosition == UBRK_DONE || backupPosition == 0) { 494ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert position = 0; 495ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert positionStatusIdx = 0; 496ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } else { 497ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI->fPosition = backupPosition; // TODO: pass starting position in a clearer way. 498ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert position = fBI->handleNext(); 499ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert positionStatusIdx = fBI->fRuleStatusIndex; 500ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 501ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 502ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } while (position >= fromPosition); 503ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 504ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Find boundaries between the one we just located and the first already-cached boundary 505ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.. 506ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 507ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fSideBuffer.removeAllElements(); 508ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fSideBuffer.addElement(position, status); 509ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fSideBuffer.addElement(positionStatusIdx, status); 510ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 511ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert do { 512ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t prevPosition = fBI->fPosition = position; 513ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t prevStatusIdx = positionStatusIdx; 514ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert position = fBI->handleNext(); 515ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert positionStatusIdx = fBI->fRuleStatusIndex; 516ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (position == UBRK_DONE) { 517ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert break; 518ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 519ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 520ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert UBool segmentHandledByDictionary = FALSE; 521ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fBI->fDictionaryCharCount != 0) { 522ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Segment from the rules includes dictionary characters. 523ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Subdivide it, with subdivided results going into the dictionary cache. 524ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t dictSegEndPosition = position; 525ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); 526ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) { 527ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert segmentHandledByDictionary = true; 528ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(position > prevPosition); 529ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (position >= fromPosition) { 530ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert break; 531ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 532ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(position <= dictSegEndPosition); 533ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fSideBuffer.addElement(position, status); 534ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fSideBuffer.addElement(positionStatusIdx, status); 535ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert prevPosition = position; 536ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 537ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(position==dictSegEndPosition || position>=fromPosition); 538ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 539ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 540ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (!segmentHandledByDictionary && position < fromPosition) { 541ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fSideBuffer.addElement(position, status); 542ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fSideBuffer.addElement(positionStatusIdx, status); 543ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 544ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } while (position < fromPosition); 545ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 546ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Move boundaries from the side buffer to the main circular buffer. 547ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert UBool success = FALSE; 548ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (!fSideBuffer.isEmpty()) { 549ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert positionStatusIdx = fSideBuffer.popi(); 550ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert position = fSideBuffer.popi(); 551ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert addPreceding(position, positionStatusIdx, UpdateCachePosition); 552ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert success = TRUE; 553ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 554ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 555ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert while (!fSideBuffer.isEmpty()) { 556ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert positionStatusIdx = fSideBuffer.popi(); 557ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert position = fSideBuffer.popi(); 558ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { 559ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // No space in circular buffer to hold a new preceding result while 560ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // also retaining the current cache (iteration) position. 561ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Bailing out is safe; the cache will refill again if needed. 562ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert break; 563ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 564ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 565ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 566ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return success; 567ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 568ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 569ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 570ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubertvoid RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { 571ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(position > fBoundaries[fEndBufIdx]); 572ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(ruleStatusIdx <= UINT16_MAX); 573ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t nextIdx = modChunkSize(fEndBufIdx + 1); 574ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (nextIdx == fStartBufIdx) { 575ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. 576ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 577ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBoundaries[nextIdx] = position; 578ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fStatuses[nextIdx] = ruleStatusIdx; 579ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fEndBufIdx = nextIdx; 580ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (update == UpdateCachePosition) { 581ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Set current position to the newly added boundary. 582ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBufIdx = nextIdx; 583ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fTextIdx = position; 584ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } else { 585ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Retaining the original cache position. 586ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Check if the added boundary wraps around the buffer, and would over-write the original position. 587ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // It's the responsibility of callers of this function to not add too many. 588ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(nextIdx != fBufIdx); 589ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 590ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 591ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 592ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubertbool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { 593ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(position < fBoundaries[fStartBufIdx]); 594ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U_ASSERT(ruleStatusIdx <= UINT16_MAX); 595ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert int32_t nextIdx = modChunkSize(fStartBufIdx - 1); 596ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (nextIdx == fEndBufIdx) { 597ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { 598ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // Failure. The insertion of the new boundary would claim the buffer position that is the 599ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // current iteration position. And we also want to retain the current iteration position. 600ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // (The buffer is already completely full of entries that precede the iteration position.) 601ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return false; 602ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 603ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fEndBufIdx = modChunkSize(fEndBufIdx - 1); 604ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 605ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBoundaries[nextIdx] = position; 606ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fStatuses[nextIdx] = ruleStatusIdx; 607ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fStartBufIdx = nextIdx; 608ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (update == UpdateCachePosition) { 609ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fBufIdx = nextIdx; 610ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert fTextIdx = position; 611ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 612ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return true; 613ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 614ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 615ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 616ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubertvoid RuleBasedBreakIterator::BreakCache::dumpCache() { 617ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert#ifdef RBBI_DEBUG 618ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert RBBIDebugPrintf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx); 619ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert for (int32_t i=fStartBufIdx; ; i=modChunkSize(i+1)) { 620ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert RBBIDebugPrintf("%d %d\n", i, fBoundaries[i]); 621ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if (i == fEndBufIdx) { 622ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert break; 623ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 624ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 625ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert#endif 626ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert} 627ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 628ffdc27edd5503111189fc11165c5a11289a71f79Fredrik RoubertU_NAMESPACE_END 629ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert 630ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert#endif // #if !UCONFIG_NO_BREAK_ITERATION 631