1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4***************************************************************************
5*   Copyright (C) 1999-2016 International Business Machines Corporation
6*   and others. All rights reserved.
7***************************************************************************
8*/
9//
10//  file:  rbbi.cpp  Contains the implementation of the rule based break iterator
11//                   runtime engine and the API implementation for
12//                   class RuleBasedBreakIterator
13//
14
15#include "utypeinfo.h"  // for 'typeid' to work
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_BREAK_ITERATION
20
21#include "unicode/rbbi.h"
22#include "unicode/schriter.h"
23#include "unicode/uchriter.h"
24#include "unicode/uclean.h"
25#include "unicode/udata.h"
26
27#include "brkeng.h"
28#include "ucln_cmn.h"
29#include "cmemory.h"
30#include "cstring.h"
31#include "rbbidata.h"
32#include "rbbi_cache.h"
33#include "rbbirb.h"
34#include "uassert.h"
35#include "umutex.h"
36#include "uvectr32.h"
37
38// if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
39#if U_LOCAL_SERVICE_HOOK
40#include "localsvc.h"
41#endif
42
43#ifdef RBBI_DEBUG
44static UBool gTrace = FALSE;
45#endif
46
47U_NAMESPACE_BEGIN
48
49// The state number of the starting state
50constexpr int32_t START_STATE = 1;
51
52// The state-transition value indicating "stop"
53constexpr int32_t STOP_STATE = 0;
54
55
56UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
57
58
59//=======================================================================
60// constructors
61//=======================================================================
62
63/**
64 * Constructs a RuleBasedBreakIterator that uses the already-created
65 * tables object that is passed in as a parameter.
66 */
67RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) {
68    init(status);
69    fData = new RBBIDataWrapper(data, status); // status checked in constructor
70    if (U_FAILURE(status)) {return;}
71    if(fData == 0) {
72        status = U_MEMORY_ALLOCATION_ERROR;
73        return;
74    }
75}
76
77//
78//  Construct from precompiled binary rules (tables).  This constructor is public API,
79//  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
80//
81RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
82                       uint32_t       ruleLength,
83                       UErrorCode     &status) {
84    init(status);
85    if (U_FAILURE(status)) {
86        return;
87    }
88    if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
89        status = U_ILLEGAL_ARGUMENT_ERROR;
90        return;
91    }
92    const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
93    if (data->fLength > ruleLength) {
94        status = U_ILLEGAL_ARGUMENT_ERROR;
95        return;
96    }
97    fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
98    if (U_FAILURE(status)) {return;}
99    if(fData == 0) {
100        status = U_MEMORY_ALLOCATION_ERROR;
101        return;
102    }
103}
104
105
106//-------------------------------------------------------------------------------
107//
108//   Constructor   from a UDataMemory handle to precompiled break rules
109//                 stored in an ICU data file.
110//
111//-------------------------------------------------------------------------------
112RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
113{
114    init(status);
115    fData = new RBBIDataWrapper(udm, status); // status checked in constructor
116    if (U_FAILURE(status)) {return;}
117    if(fData == 0) {
118        status = U_MEMORY_ALLOCATION_ERROR;
119        return;
120    }
121}
122
123
124
125//-------------------------------------------------------------------------------
126//
127//   Constructor       from a set of rules supplied as a string.
128//
129//-------------------------------------------------------------------------------
130RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
131                                                UParseError          &parseError,
132                                                UErrorCode           &status)
133{
134    init(status);
135    if (U_FAILURE(status)) {return;}
136    RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
137        RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
138    // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
139    //        creates and returns a complete RBBI.  From here, in a constructor, we
140    //        can't just return the object created by the builder factory, hence
141    //        the assignment of the factory created object to "this".
142    if (U_SUCCESS(status)) {
143        *this = *bi;
144        delete bi;
145    }
146}
147
148
149//-------------------------------------------------------------------------------
150//
151// Default Constructor.      Create an empty shell that can be set up later.
152//                           Used when creating a RuleBasedBreakIterator from a set
153//                           of rules.
154//-------------------------------------------------------------------------------
155RuleBasedBreakIterator::RuleBasedBreakIterator() {
156    UErrorCode status = U_ZERO_ERROR;
157    init(status);
158}
159
160
161//-------------------------------------------------------------------------------
162//
163//   Copy constructor.  Will produce a break iterator with the same behavior,
164//                      and which iterates over the same text, as the one passed in.
165//
166//-------------------------------------------------------------------------------
167RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
168: BreakIterator(other)
169{
170    UErrorCode status = U_ZERO_ERROR;
171    this->init(status);
172    *this = other;
173}
174
175
176/**
177 * Destructor
178 */
179RuleBasedBreakIterator::~RuleBasedBreakIterator() {
180    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
181        // fCharIter was adopted from the outside.
182        delete fCharIter;
183    }
184    fCharIter = NULL;
185    delete fSCharIter;
186    fSCharIter = NULL;
187    delete fDCharIter;
188    fDCharIter = NULL;
189
190    utext_close(fText);
191
192    if (fData != NULL) {
193        fData->removeReference();
194        fData = NULL;
195    }
196    delete fBreakCache;
197    fBreakCache = NULL;
198
199    delete fDictionaryCache;
200    fDictionaryCache = NULL;
201
202    delete fLanguageBreakEngines;
203    fLanguageBreakEngines = NULL;
204
205    delete fUnhandledBreakEngine;
206    fUnhandledBreakEngine = NULL;
207}
208
209/**
210 * Assignment operator.  Sets this iterator to have the same behavior,
211 * and iterate over the same text, as the one passed in.
212 */
213RuleBasedBreakIterator&
214RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
215    if (this == &that) {
216        return *this;
217    }
218    BreakIterator::operator=(that);
219
220    fBreakType = that.fBreakType;
221    if (fLanguageBreakEngines != NULL) {
222        delete fLanguageBreakEngines;
223        fLanguageBreakEngines = NULL;   // Just rebuild for now
224    }
225    // TODO: clone fLanguageBreakEngines from "that"
226    UErrorCode status = U_ZERO_ERROR;
227    fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
228
229    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
230        delete fCharIter;
231    }
232    fCharIter = NULL;
233
234    if (that.fCharIter != NULL ) {
235        // This is a little bit tricky - it will intially appear that
236        //  this->fCharIter is adopted, even if that->fCharIter was
237        //  not adopted.  That's ok.
238        fCharIter = that.fCharIter->clone();
239    }
240
241    if (fData != NULL) {
242        fData->removeReference();
243        fData = NULL;
244    }
245    if (that.fData != NULL) {
246        fData = that.fData->addReference();
247    }
248
249    fPosition = that.fPosition;
250    fRuleStatusIndex = that.fRuleStatusIndex;
251    fDone = that.fDone;
252
253    // TODO: both the dictionary and the main cache need to be copied.
254    //       Current position could be within a dictionary range. Trying to continue
255    //       the iteration without the caches present would go to the rules, with
256    //       the assumption that the current position is on a rule boundary.
257    fBreakCache->reset(fPosition, fRuleStatusIndex);
258    fDictionaryCache->reset();
259
260    return *this;
261}
262
263
264
265//-----------------------------------------------------------------------------
266//
267//    init()      Shared initialization routine.   Used by all the constructors.
268//                Initializes all fields, leaving the object in a consistent state.
269//
270//-----------------------------------------------------------------------------
271void RuleBasedBreakIterator::init(UErrorCode &status) {
272    fText                 = NULL;
273    fCharIter             = NULL;
274    fSCharIter            = NULL;
275    fDCharIter            = NULL;
276    fData                 = NULL;
277    fPosition             = 0;
278    fRuleStatusIndex      = 0;
279    fDone                 = false;
280    fDictionaryCharCount  = 0;
281    fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
282                                        //   dictionary behavior for Break Iterators that are
283                                        //   built from rules.  Even better would be the ability to
284                                        //   declare the type in the rules.
285
286    fLanguageBreakEngines = NULL;
287    fUnhandledBreakEngine = NULL;
288    fBreakCache           = NULL;
289    fDictionaryCache      = NULL;
290
291    if (U_FAILURE(status)) {
292        return;
293    }
294
295    fText            = utext_openUChars(NULL, NULL, 0, &status);
296    fDictionaryCache = new DictionaryCache(this, status);
297    fBreakCache      = new BreakCache(this, status);
298    if (U_SUCCESS(status) && (fText == NULL || fDictionaryCache == NULL || fBreakCache == NULL)) {
299        status = U_MEMORY_ALLOCATION_ERROR;
300    }
301
302#ifdef RBBI_DEBUG
303    static UBool debugInitDone = FALSE;
304    if (debugInitDone == FALSE) {
305        char *debugEnv = getenv("U_RBBIDEBUG");
306        if (debugEnv && uprv_strstr(debugEnv, "trace")) {
307            gTrace = TRUE;
308        }
309        debugInitDone = TRUE;
310    }
311#endif
312}
313
314
315
316//-----------------------------------------------------------------------------
317//
318//    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
319//            behavior, and iterating over the same text, as this one.
320//            Virtual function: does the right thing with subclasses.
321//
322//-----------------------------------------------------------------------------
323BreakIterator*
324RuleBasedBreakIterator::clone(void) const {
325    return new RuleBasedBreakIterator(*this);
326}
327
328/**
329 * Equality operator.  Returns TRUE if both BreakIterators are of the
330 * same class, have the same behavior, and iterate over the same text.
331 */
332UBool
333RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
334    if (typeid(*this) != typeid(that)) {
335        return FALSE;
336    }
337    if (this == &that) {
338        return TRUE;
339    }
340
341    // The base class BreakIterator carries no state that participates in equality,
342    // and does not implement an equality function that would otherwise be
343    // checked at this point.
344
345    const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
346
347    if (!utext_equals(fText, that2.fText)) {
348        // The two break iterators are operating on different text,
349        //   or have a different iteration position.
350        //   Note that fText's position is always the same as the break iterator's position.
351        return FALSE;
352    };
353
354    if (!(fPosition == that2.fPosition &&
355            fRuleStatusIndex == that2.fRuleStatusIndex &&
356            fDone == that2.fDone)) {
357        return FALSE;
358    }
359
360    if (that2.fData == fData ||
361        (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
362            // The two break iterators are using the same rules.
363            return TRUE;
364        }
365    return FALSE;
366}
367
368/**
369 * Compute a hash code for this BreakIterator
370 * @return A hash code
371 */
372int32_t
373RuleBasedBreakIterator::hashCode(void) const {
374    int32_t   hash = 0;
375    if (fData != NULL) {
376        hash = fData->hashCode();
377    }
378    return hash;
379}
380
381
382void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
383    if (U_FAILURE(status)) {
384        return;
385    }
386    fBreakCache->reset();
387    fDictionaryCache->reset();
388    fText = utext_clone(fText, ut, FALSE, TRUE, &status);
389
390    // Set up a dummy CharacterIterator to be returned if anyone
391    //   calls getText().  With input from UText, there is no reasonable
392    //   way to return a characterIterator over the actual input text.
393    //   Return one over an empty string instead - this is the closest
394    //   we can come to signaling a failure.
395    //   (GetText() is obsolete, this failure is sort of OK)
396    if (fDCharIter == NULL) {
397        static const UChar c = 0;
398        fDCharIter = new UCharCharacterIterator(&c, 0);
399        if (fDCharIter == NULL) {
400            status = U_MEMORY_ALLOCATION_ERROR;
401            return;
402        }
403    }
404
405    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
406        // existing fCharIter was adopted from the outside.  Delete it now.
407        delete fCharIter;
408    }
409    fCharIter = fDCharIter;
410
411    this->first();
412}
413
414
415UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
416    UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
417    return result;
418}
419
420
421//=======================================================================
422// BreakIterator overrides
423//=======================================================================
424
425/**
426 * Return a CharacterIterator over the text being analyzed.
427 */
428CharacterIterator&
429RuleBasedBreakIterator::getText() const {
430    return *fCharIter;
431}
432
433/**
434 * Set the iterator to analyze a new piece of text.  This function resets
435 * the current iteration position to the beginning of the text.
436 * @param newText An iterator over the text to analyze.
437 */
438void
439RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
440    // If we are holding a CharacterIterator adopted from a
441    //   previous call to this function, delete it now.
442    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
443        delete fCharIter;
444    }
445
446    fCharIter = newText;
447    UErrorCode status = U_ZERO_ERROR;
448    fBreakCache->reset();
449    fDictionaryCache->reset();
450    if (newText==NULL || newText->startIndex() != 0) {
451        // startIndex !=0 wants to be an error, but there's no way to report it.
452        // Make the iterator text be an empty string.
453        fText = utext_openUChars(fText, NULL, 0, &status);
454    } else {
455        fText = utext_openCharacterIterator(fText, newText, &status);
456    }
457    this->first();
458}
459
460/**
461 * Set the iterator to analyze a new piece of text.  This function resets
462 * the current iteration position to the beginning of the text.
463 * @param newText An iterator over the text to analyze.
464 */
465void
466RuleBasedBreakIterator::setText(const UnicodeString& newText) {
467    UErrorCode status = U_ZERO_ERROR;
468    fBreakCache->reset();
469    fDictionaryCache->reset();
470    fText = utext_openConstUnicodeString(fText, &newText, &status);
471
472    // Set up a character iterator on the string.
473    //   Needed in case someone calls getText().
474    //  Can not, unfortunately, do this lazily on the (probably never)
475    //  call to getText(), because getText is const.
476    if (fSCharIter == NULL) {
477        fSCharIter = new StringCharacterIterator(newText);
478    } else {
479        fSCharIter->setText(newText);
480    }
481
482    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
483        // old fCharIter was adopted from the outside.  Delete it.
484        delete fCharIter;
485    }
486    fCharIter = fSCharIter;
487
488    this->first();
489}
490
491
492/**
493 *  Provide a new UText for the input text.  Must reference text with contents identical
494 *  to the original.
495 *  Intended for use with text data originating in Java (garbage collected) environments
496 *  where the data may be moved in memory at arbitrary times.
497 */
498RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
499    if (U_FAILURE(status)) {
500        return *this;
501    }
502    if (input == NULL) {
503        status = U_ILLEGAL_ARGUMENT_ERROR;
504        return *this;
505    }
506    int64_t pos = utext_getNativeIndex(fText);
507    //  Shallow read-only clone of the new UText into the existing input UText
508    fText = utext_clone(fText, input, FALSE, TRUE, &status);
509    if (U_FAILURE(status)) {
510        return *this;
511    }
512    utext_setNativeIndex(fText, pos);
513    if (utext_getNativeIndex(fText) != pos) {
514        // Sanity check.  The new input utext is supposed to have the exact same
515        // contents as the old.  If we can't set to the same position, it doesn't.
516        // The contents underlying the old utext might be invalid at this point,
517        // so it's not safe to check directly.
518        status = U_ILLEGAL_ARGUMENT_ERROR;
519    }
520    return *this;
521}
522
523
524/**
525 * Sets the current iteration position to the beginning of the text, position zero.
526 * @return The new iterator position, which is zero.
527 */
528int32_t RuleBasedBreakIterator::first(void) {
529    UErrorCode status = U_ZERO_ERROR;
530    if (!fBreakCache->seek(0)) {
531        fBreakCache->populateNear(0, status);
532    }
533    fBreakCache->current();
534    U_ASSERT(fPosition == 0);
535    return 0;
536}
537
538/**
539 * Sets the current iteration position to the end of the text.
540 * @return The text's past-the-end offset.
541 */
542int32_t RuleBasedBreakIterator::last(void) {
543    int32_t endPos = (int32_t)utext_nativeLength(fText);
544    UBool endShouldBeBoundary = isBoundary(endPos);      // Has side effect of setting iterator position.
545    (void)endShouldBeBoundary;
546    U_ASSERT(endShouldBeBoundary);
547    U_ASSERT(fPosition == endPos);
548    return endPos;
549}
550
551/**
552 * Advances the iterator either forward or backward the specified number of steps.
553 * Negative values move backward, and positive values move forward.  This is
554 * equivalent to repeatedly calling next() or previous().
555 * @param n The number of steps to move.  The sign indicates the direction
556 * (negative is backwards, and positive is forwards).
557 * @return The character offset of the boundary position n boundaries away from
558 * the current one.
559 */
560int32_t RuleBasedBreakIterator::next(int32_t n) {
561    int32_t result = 0;
562    if (n > 0) {
563        for (; n > 0 && result != UBRK_DONE; --n) {
564            result = next();
565        }
566    } else if (n < 0) {
567        for (; n < 0 && result != UBRK_DONE; ++n) {
568            result = previous();
569        }
570    } else {
571        result = current();
572    }
573    return result;
574}
575
576/**
577 * Advances the iterator to the next boundary position.
578 * @return The position of the first boundary after this one.
579 */
580int32_t RuleBasedBreakIterator::next(void) {
581    fBreakCache->next();
582    return fDone ? UBRK_DONE : fPosition;
583}
584
585/**
586 * Move the iterator backwards, to the boundary preceding the current one.
587 *
588 *         Starts from the current position within fText.
589 *         Starting position need not be on a boundary.
590 *
591 * @return The position of the boundary position immediately preceding the starting position.
592 */
593int32_t RuleBasedBreakIterator::previous(void) {
594    UErrorCode status = U_ZERO_ERROR;
595    fBreakCache->previous(status);
596    return fDone ? UBRK_DONE : fPosition;
597}
598
599/**
600 * Sets the iterator to refer to the first boundary position following
601 * the specified position.
602 * @param startPos The position from which to begin searching for a break position.
603 * @return The position of the first break after the current position.
604 */
605int32_t RuleBasedBreakIterator::following(int32_t startPos) {
606    // if the supplied position is before the beginning, return the
607    // text's starting offset
608    if (startPos < 0) {
609        return first();
610    }
611
612    // Move requested offset to a code point start. It might be on a trail surrogate,
613    // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text.
614    utext_setNativeIndex(fText, startPos);
615    startPos = (int32_t)utext_getNativeIndex(fText);
616
617    UErrorCode status = U_ZERO_ERROR;
618    fBreakCache->following(startPos, status);
619    return fDone ? UBRK_DONE : fPosition;
620}
621
622/**
623 * Sets the iterator to refer to the last boundary position before the
624 * specified position.
625 * @param offset The position to begin searching for a break from.
626 * @return The position of the last boundary before the starting position.
627 */
628int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
629    if (fText == NULL || offset > utext_nativeLength(fText)) {
630        return last();
631    }
632
633    // Move requested offset to a code point start. It might be on a trail surrogate,
634    // or on a trail byte if the input is UTF-8.
635
636    utext_setNativeIndex(fText, offset);
637    int32_t adjustedOffset = utext_getNativeIndex(fText);
638
639    UErrorCode status = U_ZERO_ERROR;
640    fBreakCache->preceding(adjustedOffset, status);
641    return fDone ? UBRK_DONE : fPosition;
642}
643
644/**
645 * Returns true if the specfied position is a boundary position.  As a side
646 * effect, leaves the iterator pointing to the first boundary position at
647 * or after "offset".
648 *
649 * @param offset the offset to check.
650 * @return True if "offset" is a boundary position.
651 */
652UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
653    // out-of-range indexes are never boundary positions
654    if (offset < 0) {
655        first();       // For side effects on current position, tag values.
656        return FALSE;
657    }
658
659    // Adjust offset to be on a code point boundary and not beyond the end of the text.
660    // Note that isBoundary() is always be false for offsets that are not on code point boundaries.
661    // But we still need the side effect of leaving iteration at the following boundary.
662
663    utext_setNativeIndex(fText, offset);
664    int32_t adjustedOffset = utext_getNativeIndex(fText);
665
666    bool result = false;
667    UErrorCode status = U_ZERO_ERROR;
668    if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) {
669        result = (fBreakCache->current() == offset);
670    }
671
672    if (result && adjustedOffset < offset && utext_char32At(fText, offset) == U_SENTINEL) {
673        // Original offset is beyond the end of the text. Return FALSE, it's not a boundary,
674        // but the iteration position remains set to the end of the text, which is a boundary.
675        return FALSE;
676    }
677    if (!result) {
678        // Not on a boundary. isBoundary() must leave iterator on the following boundary.
679        // Cache->seek(), above, left us on the preceding boundary, so advance one.
680        next();
681    }
682    return result;
683}
684
685
686/**
687 * Returns the current iteration position.
688 * @return The current iteration position.
689 */
690int32_t RuleBasedBreakIterator::current(void) const {
691    return fPosition;
692}
693
694
695//=======================================================================
696// implementation
697//=======================================================================
698
699//
700// RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
701//                 of user text.  A variable with this enum type keeps track of where we
702//                 are.  The state machine only fetches user input while in the RUN mode.
703//
704enum RBBIRunMode {
705    RBBI_START,     // state machine processing is before first char of input
706    RBBI_RUN,       // state machine processing is in the user text
707    RBBI_END        // state machine processing is after end of user text.
708};
709
710
711// Map from look-ahead break states (corresponds to rules) to boundary positions.
712// Allows multiple lookahead break rules to be in flight at the same time.
713//
714// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
715// in the state table be sequential, then we can just index an array. And the
716// table could also tell us in advance how big that array needs to be.
717//
718// Before ICU 57 there was just a single simple variable for a look-ahead match that
719// was in progress. Two rules at once did not work.
720
721static const int32_t kMaxLookaheads = 8;
722struct LookAheadResults {
723    int32_t    fUsedSlotLimit;
724    int32_t    fPositions[8];
725    int16_t    fKeys[8];
726
727    LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {};
728
729    int32_t getPosition(int16_t key) {
730        for (int32_t i=0; i<fUsedSlotLimit; ++i) {
731            if (fKeys[i] == key) {
732                return fPositions[i];
733            }
734        }
735        U_ASSERT(FALSE);
736        return -1;
737    }
738
739    void setPosition(int16_t key, int32_t position) {
740        int32_t i;
741        for (i=0; i<fUsedSlotLimit; ++i) {
742            if (fKeys[i] == key) {
743                fPositions[i] = position;
744                return;
745            }
746        }
747        if (i >= kMaxLookaheads) {
748            U_ASSERT(FALSE);
749            i = kMaxLookaheads - 1;
750        }
751        fKeys[i] = key;
752        fPositions[i] = position;
753        U_ASSERT(fUsedSlotLimit == i);
754        fUsedSlotLimit = i + 1;
755    }
756};
757
758
759//-----------------------------------------------------------------------------------
760//
761//  handleNext()
762//     Run the state machine to find a boundary
763//
764//-----------------------------------------------------------------------------------
765int32_t RuleBasedBreakIterator::handleNext() {
766    int32_t             state;
767    uint16_t            category        = 0;
768    RBBIRunMode         mode;
769
770    RBBIStateTableRow  *row;
771    UChar32             c;
772    LookAheadResults    lookAheadMatches;
773    int32_t             result             = 0;
774    int32_t             initialPosition    = 0;
775    const RBBIStateTable *statetable       = fData->fForwardTable;
776    const char         *tableData          = statetable->fTableData;
777    uint32_t            tableRowLen        = statetable->fRowLen;
778    #ifdef RBBI_DEBUG
779        if (gTrace) {
780            RBBIDebugPuts("Handle Next   pos   char  state category");
781        }
782    #endif
783
784    // handleNext alway sets the break tag value.
785    // Set the default for it.
786    fRuleStatusIndex = 0;
787
788    fDictionaryCharCount = 0;
789
790    // if we're already at the end of the text, return DONE.
791    initialPosition = fPosition;
792    UTEXT_SETNATIVEINDEX(fText, initialPosition);
793    result          = initialPosition;
794    c               = UTEXT_NEXT32(fText);
795    if (c==U_SENTINEL) {
796        fDone = TRUE;
797        return UBRK_DONE;
798    }
799
800    //  Set the initial state for the state machine
801    state = START_STATE;
802    row = (RBBIStateTableRow *)
803            //(statetable->fTableData + (statetable->fRowLen * state));
804            (tableData + tableRowLen * state);
805
806
807    mode     = RBBI_RUN;
808    if (statetable->fFlags & RBBI_BOF_REQUIRED) {
809        category = 2;
810        mode     = RBBI_START;
811    }
812
813
814    // loop until we reach the end of the text or transition to state 0
815    //
816    for (;;) {
817        if (c == U_SENTINEL) {
818            // Reached end of input string.
819            if (mode == RBBI_END) {
820                // We have already run the loop one last time with the
821                //   character set to the psueudo {eof} value.  Now it is time
822                //   to unconditionally bail out.
823                break;
824            }
825            // Run the loop one last time with the fake end-of-input character category.
826            mode = RBBI_END;
827            category = 1;
828        }
829
830        //
831        // Get the char category.  An incoming category of 1 or 2 means that
832        //      we are preset for doing the beginning or end of input, and
833        //      that we shouldn't get a category from an actual text input character.
834        //
835        if (mode == RBBI_RUN) {
836            // look up the current character's character category, which tells us
837            // which column in the state table to look at.
838            // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
839            //        not the size of the character going in, which is a UChar32.
840            //
841            category = UTRIE2_GET16(fData->fTrie, c);
842
843            // Check the dictionary bit in the character's category.
844            //    Counter is only used by dictionary based iteration.
845            //    Chars that need to be handled by a dictionary have a flag bit set
846            //    in their category values.
847            //
848            if ((category & 0x4000) != 0)  {
849                fDictionaryCharCount++;
850                //  And off the dictionary flag bit.
851                category &= ~0x4000;
852            }
853        }
854
855       #ifdef RBBI_DEBUG
856            if (gTrace) {
857                RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));
858                if (0x20<=c && c<0x7f) {
859                    RBBIDebugPrintf("\"%c\"  ", c);
860                } else {
861                    RBBIDebugPrintf("%5x  ", c);
862                }
863                RBBIDebugPrintf("%3d  %3d\n", state, category);
864            }
865        #endif
866
867        // State Transition - move machine to its next state
868        //
869
870        // Note: fNextState is defined as uint16_t[2], but we are casting
871        // a generated RBBI table to RBBIStateTableRow and some tables
872        // actually have more than 2 categories.
873        U_ASSERT(category<fData->fHeader->fCatCount);
874        state = row->fNextState[category];  /*Not accessing beyond memory*/
875        row = (RBBIStateTableRow *)
876            // (statetable->fTableData + (statetable->fRowLen * state));
877            (tableData + tableRowLen * state);
878
879
880        if (row->fAccepting == -1) {
881            // Match found, common case.
882            if (mode != RBBI_START) {
883                result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
884            }
885            fRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
886        }
887
888        int16_t completedRule = row->fAccepting;
889        if (completedRule > 0) {
890            // Lookahead match is completed.
891            int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
892            if (lookaheadResult >= 0) {
893                fRuleStatusIndex = row->fTagIdx;
894                fPosition = lookaheadResult;
895                return lookaheadResult;
896            }
897        }
898        int16_t rule = row->fLookAhead;
899        if (rule != 0) {
900            // At the position of a '/' in a look-ahead match. Record it.
901            int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
902            lookAheadMatches.setPosition(rule, pos);
903        }
904
905        if (state == STOP_STATE) {
906            // This is the normal exit from the lookup state machine.
907            // We have advanced through the string until it is certain that no
908            //   longer match is possible, no matter what characters follow.
909            break;
910        }
911
912        // Advance to the next character.
913        // If this is a beginning-of-input loop iteration, don't advance
914        //    the input position.  The next iteration will be processing the
915        //    first real input character.
916        if (mode == RBBI_RUN) {
917            c = UTEXT_NEXT32(fText);
918        } else {
919            if (mode == RBBI_START) {
920                mode = RBBI_RUN;
921            }
922        }
923    }
924
925    // The state machine is done.  Check whether it found a match...
926
927    // If the iterator failed to advance in the match engine, force it ahead by one.
928    //   (This really indicates a defect in the break rules.  They should always match
929    //    at least one character.)
930    if (result == initialPosition) {
931        utext_setNativeIndex(fText, initialPosition);
932        utext_next32(fText);
933        result = (int32_t)utext_getNativeIndex(fText);
934        fRuleStatusIndex = 0;
935    }
936
937    // Leave the iterator at our result position.
938    fPosition = result;
939    #ifdef RBBI_DEBUG
940        if (gTrace) {
941            RBBIDebugPrintf("result = %d\n\n", result);
942        }
943    #endif
944    return result;
945}
946
947
948
949//-----------------------------------------------------------------------------------
950//
951//  handlePrevious()
952//
953//      Iterate backwards using the safe reverse rules.
954//      The logic of this function is very similar to handleNext(), above.
955//
956//-----------------------------------------------------------------------------------
957int32_t RuleBasedBreakIterator::handlePrevious(int32_t fromPosition) {
958    int32_t             state;
959    uint16_t            category        = 0;
960    RBBIRunMode         mode;
961    RBBIStateTableRow  *row;
962    UChar32             c;
963    LookAheadResults    lookAheadMatches;
964    int32_t             result          = 0;
965    int32_t             initialPosition = 0;
966
967    const RBBIStateTable *stateTable = fData->fSafeRevTable;
968    UTEXT_SETNATIVEINDEX(fText, fromPosition);
969    #ifdef RBBI_DEBUG
970        if (gTrace) {
971            RBBIDebugPuts("Handle Previous   pos   char  state category");
972        }
973    #endif
974
975    // if we're already at the start of the text, return DONE.
976    if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
977        return BreakIterator::DONE;
978    }
979
980    //  Set up the starting char.
981    initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
982    result          = initialPosition;
983    c               = UTEXT_PREVIOUS32(fText);
984
985    //  Set the initial state for the state machine
986    state = START_STATE;
987    row = (RBBIStateTableRow *)
988            (stateTable->fTableData + (stateTable->fRowLen * state));
989    category = 3;
990    mode     = RBBI_RUN;
991    if (stateTable->fFlags & RBBI_BOF_REQUIRED) {
992        category = 2;
993        mode     = RBBI_START;
994    }
995
996
997    // loop until we reach the start of the text or transition to state 0
998    //
999    for (;;) {
1000        if (c == U_SENTINEL) {
1001            // Reached end of input string.
1002            if (mode == RBBI_END) {
1003                // We have already run the loop one last time with the
1004                //   character set to the psueudo {eof} value.  Now it is time
1005                //   to unconditionally bail out.
1006                break;
1007            }
1008            // Run the loop one last time with the fake end-of-input character category.
1009            mode = RBBI_END;
1010            category = 1;
1011        }
1012
1013        //
1014        // Get the char category.  An incoming category of 1 or 2 means that
1015        //      we are preset for doing the beginning or end of input, and
1016        //      that we shouldn't get a category from an actual text input character.
1017        //
1018        if (mode == RBBI_RUN) {
1019            // look up the current character's character category, which tells us
1020            // which column in the state table to look at.
1021            // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1022            //        not the size of the character going in, which is a UChar32.
1023            //
1024            //  And off the dictionary flag bit. For reverse iteration it is not used.
1025            category = UTRIE2_GET16(fData->fTrie, c);
1026            category &= ~0x4000;
1027        }
1028
1029        #ifdef RBBI_DEBUG
1030            if (gTrace) {
1031                RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
1032                if (0x20<=c && c<0x7f) {
1033                    RBBIDebugPrintf("\"%c\"  ", c);
1034                } else {
1035                    RBBIDebugPrintf("%5x  ", c);
1036                }
1037                RBBIDebugPrintf("%3d  %3d\n", state, category);
1038            }
1039        #endif
1040
1041        // State Transition - move machine to its next state
1042        //
1043
1044        // Note: fNextState is defined as uint16_t[2], but we are casting
1045        // a generated RBBI table to RBBIStateTableRow and some tables
1046        // actually have more than 2 categories.
1047        U_ASSERT(category<fData->fHeader->fCatCount);
1048        state = row->fNextState[category];  /*Not accessing beyond memory*/
1049        row = (RBBIStateTableRow *)
1050            (stateTable->fTableData + (stateTable->fRowLen * state));
1051
1052        if (row->fAccepting == -1) {
1053            // Match found, common case.
1054            result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1055        }
1056
1057        int16_t completedRule = row->fAccepting;
1058        if (completedRule > 0) {
1059            // Lookahead match is completed.
1060            int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
1061            if (lookaheadResult >= 0) {
1062                UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
1063                return lookaheadResult;
1064            }
1065        }
1066        int16_t rule = row->fLookAhead;
1067        if (rule != 0) {
1068            // At the position of a '/' in a look-ahead match. Record it.
1069            int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1070            lookAheadMatches.setPosition(rule, pos);
1071        }
1072
1073        if (state == STOP_STATE) {
1074            // This is the normal exit from the lookup state machine.
1075            // We have advanced through the string until it is certain that no
1076            //   longer match is possible, no matter what characters follow.
1077            break;
1078        }
1079
1080        // Move (backwards) to the next character to process.
1081        // If this is a beginning-of-input loop iteration, don't advance
1082        //    the input position.  The next iteration will be processing the
1083        //    first real input character.
1084        if (mode == RBBI_RUN) {
1085            c = UTEXT_PREVIOUS32(fText);
1086        } else {
1087            if (mode == RBBI_START) {
1088                mode = RBBI_RUN;
1089            }
1090        }
1091    }
1092
1093    // The state machine is done.  Check whether it found a match...
1094
1095    // If the iterator failed to advance in the match engine, force it ahead by one.
1096    //   (This really indicates a defect in the break rules.  They should always match
1097    //    at least one character.)
1098    if (result == initialPosition) {
1099        UTEXT_SETNATIVEINDEX(fText, initialPosition);
1100        UTEXT_PREVIOUS32(fText);
1101        result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1102    }
1103
1104    #ifdef RBBI_DEBUG
1105        if (gTrace) {
1106            RBBIDebugPrintf("result = %d\n\n", result);
1107        }
1108    #endif
1109    return result;
1110}
1111
1112
1113//-------------------------------------------------------------------------------
1114//
1115//   getRuleStatus()   Return the break rule tag associated with the current
1116//                     iterator position.  If the iterator arrived at its current
1117//                     position by iterating forwards, the value will have been
1118//                     cached by the handleNext() function.
1119//
1120//-------------------------------------------------------------------------------
1121
1122int32_t  RuleBasedBreakIterator::getRuleStatus() const {
1123
1124    // fLastRuleStatusIndex indexes to the start of the appropriate status record
1125    //                                                 (the number of status values.)
1126    //   This function returns the last (largest) of the array of status values.
1127    int32_t  idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex];
1128    int32_t  tagVal = fData->fRuleStatusTable[idx];
1129
1130    return tagVal;
1131}
1132
1133
1134int32_t RuleBasedBreakIterator::getRuleStatusVec(
1135             int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
1136    if (U_FAILURE(status)) {
1137        return 0;
1138    }
1139
1140    int32_t  numVals = fData->fRuleStatusTable[fRuleStatusIndex];
1141    int32_t  numValsToCopy = numVals;
1142    if (numVals > capacity) {
1143        status = U_BUFFER_OVERFLOW_ERROR;
1144        numValsToCopy = capacity;
1145    }
1146    int i;
1147    for (i=0; i<numValsToCopy; i++) {
1148        fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1];
1149    }
1150    return numVals;
1151}
1152
1153
1154
1155//-------------------------------------------------------------------------------
1156//
1157//   getBinaryRules        Access to the compiled form of the rules,
1158//                         for use by build system tools that save the data
1159//                         for standard iterator types.
1160//
1161//-------------------------------------------------------------------------------
1162const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1163    const uint8_t  *retPtr = NULL;
1164    length = 0;
1165
1166    if (fData != NULL) {
1167        retPtr = (const uint8_t *)fData->fHeader;
1168        length = fData->fHeader->fLength;
1169    }
1170    return retPtr;
1171}
1172
1173
1174BreakIterator *  RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,
1175                                   int32_t &bufferSize,
1176                                   UErrorCode &status)
1177{
1178    if (U_FAILURE(status)){
1179        return NULL;
1180    }
1181
1182    if (bufferSize == 0) {
1183        bufferSize = 1;  // preflighting for deprecated functionality
1184        return NULL;
1185    }
1186
1187    BreakIterator *clonedBI = clone();
1188    if (clonedBI == NULL) {
1189        status = U_MEMORY_ALLOCATION_ERROR;
1190    } else {
1191        status = U_SAFECLONE_ALLOCATED_WARNING;
1192    }
1193    return (RuleBasedBreakIterator *)clonedBI;
1194}
1195
1196U_NAMESPACE_END
1197
1198
1199static icu::UStack *gLanguageBreakFactories = nullptr;
1200static const icu::UnicodeString *gEmptyString = nullptr;
1201static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
1202static icu::UInitOnce gRBBIInitOnce = U_INITONCE_INITIALIZER;
1203
1204/**
1205 * Release all static memory held by breakiterator.
1206 */
1207U_CDECL_BEGIN
1208static UBool U_CALLCONV rbbi_cleanup(void) {
1209    delete gLanguageBreakFactories;
1210    gLanguageBreakFactories = nullptr;
1211    delete gEmptyString;
1212    gEmptyString = nullptr;
1213    gLanguageBreakFactoriesInitOnce.reset();
1214    gRBBIInitOnce.reset();
1215    return TRUE;
1216}
1217U_CDECL_END
1218
1219U_CDECL_BEGIN
1220static void U_CALLCONV _deleteFactory(void *obj) {
1221    delete (icu::LanguageBreakFactory *) obj;
1222}
1223U_CDECL_END
1224U_NAMESPACE_BEGIN
1225
1226static void U_CALLCONV rbbiInit() {
1227    gEmptyString = new UnicodeString();
1228    ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
1229}
1230
1231static void U_CALLCONV initLanguageFactories() {
1232    UErrorCode status = U_ZERO_ERROR;
1233    U_ASSERT(gLanguageBreakFactories == NULL);
1234    gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
1235    if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
1236        ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1237        gLanguageBreakFactories->push(builtIn, status);
1238#ifdef U_LOCAL_SERVICE_HOOK
1239        LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1240        if (extra != NULL) {
1241            gLanguageBreakFactories->push(extra, status);
1242        }
1243#endif
1244    }
1245    ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
1246}
1247
1248
1249static const LanguageBreakEngine*
1250getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
1251{
1252    umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
1253    if (gLanguageBreakFactories == NULL) {
1254        return NULL;
1255    }
1256
1257    int32_t i = gLanguageBreakFactories->size();
1258    const LanguageBreakEngine *lbe = NULL;
1259    while (--i >= 0) {
1260        LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1261        lbe = factory->getEngineFor(c, breakType);
1262        if (lbe != NULL) {
1263            break;
1264        }
1265    }
1266    return lbe;
1267}
1268
1269
1270//-------------------------------------------------------------------------------
1271//
1272//  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
1273//                          the character c.
1274//
1275//-------------------------------------------------------------------------------
1276const LanguageBreakEngine *
1277RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1278    const LanguageBreakEngine *lbe = NULL;
1279    UErrorCode status = U_ZERO_ERROR;
1280
1281    if (fLanguageBreakEngines == NULL) {
1282        fLanguageBreakEngines = new UStack(status);
1283        if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
1284            delete fLanguageBreakEngines;
1285            fLanguageBreakEngines = 0;
1286            return NULL;
1287        }
1288    }
1289
1290    int32_t i = fLanguageBreakEngines->size();
1291    while (--i >= 0) {
1292        lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1293        if (lbe->handles(c, fBreakType)) {
1294            return lbe;
1295        }
1296    }
1297
1298    // No existing dictionary took the character. See if a factory wants to
1299    // give us a new LanguageBreakEngine for this character.
1300    lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
1301
1302    // If we got one, use it and push it on our stack.
1303    if (lbe != NULL) {
1304        fLanguageBreakEngines->push((void *)lbe, status);
1305        // Even if we can't remember it, we can keep looking it up, so
1306        // return it even if the push fails.
1307        return lbe;
1308    }
1309
1310    // No engine is forthcoming for this character. Add it to the
1311    // reject set. Create the reject break engine if needed.
1312    if (fUnhandledBreakEngine == NULL) {
1313        fUnhandledBreakEngine = new UnhandledEngine(status);
1314        if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1315            status = U_MEMORY_ALLOCATION_ERROR;
1316        }
1317        // Put it last so that scripts for which we have an engine get tried
1318        // first.
1319        fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1320        // If we can't insert it, or creation failed, get rid of it
1321        if (U_FAILURE(status)) {
1322            delete fUnhandledBreakEngine;
1323            fUnhandledBreakEngine = 0;
1324            return NULL;
1325        }
1326    }
1327
1328    // Tell the reject engine about the character; at its discretion, it may
1329    // add more than just the one character.
1330    fUnhandledBreakEngine->handleCharacter(c, fBreakType);
1331
1332    return fUnhandledBreakEngine;
1333}
1334
1335
1336
1337/*int32_t RuleBasedBreakIterator::getBreakType() const {
1338    return fBreakType;
1339}*/
1340
1341void RuleBasedBreakIterator::setBreakType(int32_t type) {
1342    fBreakType = type;
1343}
1344
1345void RuleBasedBreakIterator::dumpCache() {
1346    fBreakCache->dumpCache();
1347}
1348
1349/**
1350 * Returns the description used to create this iterator
1351 */
1352
1353const UnicodeString&
1354RuleBasedBreakIterator::getRules() const {
1355    if (fData != NULL) {
1356        return fData->getRuleSourceString();
1357    } else {
1358        umtx_initOnce(gRBBIInitOnce, &rbbiInit);
1359        return *gEmptyString;
1360    }
1361}
1362
1363U_NAMESPACE_END
1364
1365#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1366