1/*
2***************************************************************************
3*   Copyright (C) 1999-2008 International Business Machines Corporation   *
4*   and others. All rights reserved.                                      *
5***************************************************************************
6*/
7//
8//  file:  rbbi.c    Contains the implementation of the rule based break iterator
9//                   runtime engine and the API implementation for
10//                   class RuleBasedBreakIterator
11//
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_BREAK_ITERATION
16
17#include "unicode/rbbi.h"
18#include "unicode/schriter.h"
19#include "unicode/uchriter.h"
20#include "unicode/udata.h"
21#include "unicode/uclean.h"
22#include "rbbidata.h"
23#include "rbbirb.h"
24#include "cmemory.h"
25#include "cstring.h"
26#include "umutex.h"
27#include "ucln_cmn.h"
28#include "brkeng.h"
29
30#include "uassert.h"
31#include "uvector.h"
32
33// if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
34#if U_LOCAL_SERVICE_HOOK
35#include "localsvc.h"
36#endif
37
38#ifdef RBBI_DEBUG
39static UBool fTrace = FALSE;
40#endif
41
42U_NAMESPACE_BEGIN
43
44// The state number of the starting state
45#define START_STATE 1
46
47// The state-transition value indicating "stop"
48#define STOP_STATE  0
49
50
51UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
52
53
54//=======================================================================
55// constructors
56//=======================================================================
57
58/**
59 * Constructs a RuleBasedBreakIterator that uses the already-created
60 * tables object that is passed in as a parameter.
61 */
62RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
63{
64    init();
65    fData = new RBBIDataWrapper(data, status); // status checked in constructor
66    if (U_FAILURE(status)) {return;}
67    if(fData == 0) {
68        status = U_MEMORY_ALLOCATION_ERROR;
69        return;
70    }
71}
72
73/**
74 * Same as above but does not adopt memory
75 */
76RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
77{
78    init();
79    fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
80    if (U_FAILURE(status)) {return;}
81    if(fData == 0) {
82        status = U_MEMORY_ALLOCATION_ERROR;
83        return;
84    }
85}
86
87//-------------------------------------------------------------------------------
88//
89//   Constructor   from a UDataMemory handle to precompiled break rules
90//                 stored in an ICU data file.
91//
92//-------------------------------------------------------------------------------
93RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
94{
95    init();
96    fData = new RBBIDataWrapper(udm, status); // status checked in constructor
97    if (U_FAILURE(status)) {return;}
98    if(fData == 0) {
99        status = U_MEMORY_ALLOCATION_ERROR;
100        return;
101    }
102}
103
104
105
106//-------------------------------------------------------------------------------
107//
108//   Constructor       from a set of rules supplied as a string.
109//
110//-------------------------------------------------------------------------------
111RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
112                                                UParseError          &parseError,
113                                                UErrorCode           &status)
114{
115    init();
116    if (U_FAILURE(status)) {return;}
117    RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
118        RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
119    // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
120    //        creates and returns a complete RBBI.  From here, in a constructor, we
121    //        can't just return the object created by the builder factory, hence
122    //        the assignment of the factory created object to "this".
123    if (U_SUCCESS(status)) {
124        *this = *bi;
125        delete bi;
126    }
127}
128
129
130//-------------------------------------------------------------------------------
131//
132// Default Constructor.      Create an empty shell that can be set up later.
133//                           Used when creating a RuleBasedBreakIterator from a set
134//                           of rules.
135//-------------------------------------------------------------------------------
136RuleBasedBreakIterator::RuleBasedBreakIterator() {
137    init();
138}
139
140
141//-------------------------------------------------------------------------------
142//
143//   Copy constructor.  Will produce a break iterator with the same behavior,
144//                      and which iterates over the same text, as the one passed in.
145//
146//-------------------------------------------------------------------------------
147RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
148: BreakIterator(other)
149{
150    this->init();
151    *this = other;
152}
153
154
155/**
156 * Destructor
157 */
158RuleBasedBreakIterator::~RuleBasedBreakIterator() {
159    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
160        // fCharIter was adopted from the outside.
161        delete fCharIter;
162    }
163    fCharIter = NULL;
164    delete fSCharIter;
165    fCharIter = NULL;
166    delete fDCharIter;
167    fDCharIter = NULL;
168
169    utext_close(fText);
170
171    if (fData != NULL) {
172        fData->removeReference();
173        fData = NULL;
174    }
175    if (fCachedBreakPositions) {
176        uprv_free(fCachedBreakPositions);
177        fCachedBreakPositions = NULL;
178    }
179    if (fLanguageBreakEngines) {
180        delete fLanguageBreakEngines;
181        fLanguageBreakEngines = NULL;
182    }
183    if (fUnhandledBreakEngine) {
184        delete fUnhandledBreakEngine;
185        fUnhandledBreakEngine = NULL;
186    }
187}
188
189/**
190 * Assignment operator.  Sets this iterator to have the same behavior,
191 * and iterate over the same text, as the one passed in.
192 */
193RuleBasedBreakIterator&
194RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
195    if (this == &that) {
196        return *this;
197    }
198    reset();    // Delete break cache information
199    fBreakType = that.fBreakType;
200    if (fLanguageBreakEngines != NULL) {
201        delete fLanguageBreakEngines;
202        fLanguageBreakEngines = NULL;   // Just rebuild for now
203    }
204    // TODO: clone fLanguageBreakEngines from "that"
205    UErrorCode status = U_ZERO_ERROR;
206    fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
207
208    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
209        delete fCharIter;
210    }
211    fCharIter = NULL;
212
213    if (that.fCharIter != NULL ) {
214        // This is a little bit tricky - it will intially appear that
215        //  this->fCharIter is adopted, even if that->fCharIter was
216        //  not adopted.  That's ok.
217        fCharIter = that.fCharIter->clone();
218    }
219
220    if (fData != NULL) {
221        fData->removeReference();
222        fData = NULL;
223    }
224    if (that.fData != NULL) {
225        fData = that.fData->addReference();
226    }
227
228    return *this;
229}
230
231
232
233//-----------------------------------------------------------------------------
234//
235//    init()      Shared initialization routine.   Used by all the constructors.
236//                Initializes all fields, leaving the object in a consistent state.
237//
238//-----------------------------------------------------------------------------
239void RuleBasedBreakIterator::init() {
240    UErrorCode  status    = U_ZERO_ERROR;
241    fBufferClone          = FALSE;
242    fText                 = utext_openUChars(NULL, NULL, 0, &status);
243    fCharIter             = NULL;
244    fSCharIter            = NULL;
245    fDCharIter            = NULL;
246    fData                 = NULL;
247    fLastRuleStatusIndex  = 0;
248    fLastStatusIndexValid = TRUE;
249    fDictionaryCharCount  = 0;
250    fBreakType            = -1;
251
252    fCachedBreakPositions    = NULL;
253    fLanguageBreakEngines    = NULL;
254    fUnhandledBreakEngine    = NULL;
255    fNumCachedBreakPositions = 0;
256    fPositionInCache         = 0;
257
258#ifdef RBBI_DEBUG
259    static UBool debugInitDone = FALSE;
260    if (debugInitDone == FALSE) {
261        char *debugEnv = getenv("U_RBBIDEBUG");
262        if (debugEnv && uprv_strstr(debugEnv, "trace")) {
263            fTrace = TRUE;
264        }
265        debugInitDone = TRUE;
266    }
267#endif
268}
269
270
271
272//-----------------------------------------------------------------------------
273//
274//    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
275//            behavior, and iterating over the same text, as this one.
276//            Virtual function: does the right thing with subclasses.
277//
278//-----------------------------------------------------------------------------
279BreakIterator*
280RuleBasedBreakIterator::clone(void) const {
281    return new RuleBasedBreakIterator(*this);
282}
283
284/**
285 * Equality operator.  Returns TRUE if both BreakIterators are of the
286 * same class, have the same behavior, and iterate over the same text.
287 */
288UBool
289RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
290    if (that.getDynamicClassID() != getDynamicClassID()) {
291        return FALSE;
292    }
293
294    const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
295
296    if (!utext_equals(fText, that2.fText)) {
297        // The two break iterators are operating on different text,
298        //   or have a different interation position.
299        return FALSE;
300    };
301
302    // TODO:  need a check for when in a dictionary region at different offsets.
303
304    if (that2.fData == fData ||
305        (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
306            // The two break iterators are using the same rules.
307            return TRUE;
308        }
309    return FALSE;
310}
311
312/**
313 * Compute a hash code for this BreakIterator
314 * @return A hash code
315 */
316int32_t
317RuleBasedBreakIterator::hashCode(void) const {
318    int32_t   hash = 0;
319    if (fData != NULL) {
320        hash = fData->hashCode();
321    }
322    return hash;
323}
324
325
326void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
327    if (U_FAILURE(status)) {
328        return;
329    }
330    reset();
331    fText = utext_clone(fText, ut, FALSE, TRUE, &status);
332
333    // Set up a dummy CharacterIterator to be returned if anyone
334    //   calls getText().  With input from UText, there is no reasonable
335    //   way to return a characterIterator over the actual input text.
336    //   Return one over an empty string instead - this is the closest
337    //   we can come to signaling a failure.
338    //   (GetText() is obsolete, this failure is sort of OK)
339    if (fDCharIter == NULL) {
340        static const UChar c = 0;
341        fDCharIter = new UCharCharacterIterator(&c, 0);
342        if (fDCharIter == NULL) {
343            status = U_MEMORY_ALLOCATION_ERROR;
344            return;
345        }
346    }
347
348    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
349        // existing fCharIter was adopted from the outside.  Delete it now.
350        delete fCharIter;
351    }
352    fCharIter = fDCharIter;
353
354    this->first();
355}
356
357
358UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
359    UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
360    return result;
361}
362
363
364
365/**
366 * Returns the description used to create this iterator
367 */
368const UnicodeString&
369RuleBasedBreakIterator::getRules() const {
370    if (fData != NULL) {
371        return fData->getRuleSourceString();
372    } else {
373        static const UnicodeString *s;
374        if (s == NULL) {
375            // TODO:  something more elegant here.
376            //        perhaps API should return the string by value.
377            //        Note:  thread unsafe init & leak are semi-ok, better than
378            //               what was before.  Sould be cleaned up, though.
379            s = new UnicodeString;
380        }
381        return *s;
382    }
383}
384
385//=======================================================================
386// BreakIterator overrides
387//=======================================================================
388
389/**
390 * Return a CharacterIterator over the text being analyzed.
391 */
392CharacterIterator&
393RuleBasedBreakIterator::getText() const {
394    return *fCharIter;
395}
396
397/**
398 * Set the iterator to analyze a new piece of text.  This function resets
399 * the current iteration position to the beginning of the text.
400 * @param newText An iterator over the text to analyze.
401 */
402void
403RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
404    // If we are holding a CharacterIterator adopted from a
405    //   previous call to this function, delete it now.
406    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
407        delete fCharIter;
408    }
409
410    fCharIter = newText;
411    UErrorCode status = U_ZERO_ERROR;
412    reset();
413    if (newText==NULL || newText->startIndex() != 0) {
414        // startIndex !=0 wants to be an error, but there's no way to report it.
415        // Make the iterator text be an empty string.
416        fText = utext_openUChars(fText, NULL, 0, &status);
417    } else {
418        fText = utext_openCharacterIterator(fText, newText, &status);
419    }
420    this->first();
421}
422
423/**
424 * Set the iterator to analyze a new piece of text.  This function resets
425 * the current iteration position to the beginning of the text.
426 * @param newText An iterator over the text to analyze.
427 */
428void
429RuleBasedBreakIterator::setText(const UnicodeString& newText) {
430    UErrorCode status = U_ZERO_ERROR;
431    reset();
432    fText = utext_openConstUnicodeString(fText, &newText, &status);
433
434    // Set up a character iterator on the string.
435    //   Needed in case someone calls getText().
436    //  Can not, unfortunately, do this lazily on the (probably never)
437    //  call to getText(), because getText is const.
438    if (fSCharIter == NULL) {
439        fSCharIter = new StringCharacterIterator(newText);
440    } else {
441        fSCharIter->setText(newText);
442    }
443
444    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
445        // old fCharIter was adopted from the outside.  Delete it.
446        delete fCharIter;
447    }
448    fCharIter = fSCharIter;
449
450    this->first();
451}
452
453
454
455/**
456 * Sets the current iteration position to the beginning of the text.
457 * @return The offset of the beginning of the text.
458 */
459int32_t RuleBasedBreakIterator::first(void) {
460    reset();
461    fLastRuleStatusIndex  = 0;
462    fLastStatusIndexValid = TRUE;
463    //if (fText == NULL)
464    //    return BreakIterator::DONE;
465
466    utext_setNativeIndex(fText, 0);
467    return 0;
468}
469
470/**
471 * Sets the current iteration position to the end of the text.
472 * @return The text's past-the-end offset.
473 */
474int32_t RuleBasedBreakIterator::last(void) {
475    reset();
476    if (fText == NULL) {
477        fLastRuleStatusIndex  = 0;
478        fLastStatusIndexValid = TRUE;
479        return BreakIterator::DONE;
480    }
481
482    fLastStatusIndexValid = FALSE;
483    int32_t pos = (int32_t)utext_nativeLength(fText);
484    utext_setNativeIndex(fText, pos);
485    return pos;
486}
487
488/**
489 * Advances the iterator either forward or backward the specified number of steps.
490 * Negative values move backward, and positive values move forward.  This is
491 * equivalent to repeatedly calling next() or previous().
492 * @param n The number of steps to move.  The sign indicates the direction
493 * (negative is backwards, and positive is forwards).
494 * @return The character offset of the boundary position n boundaries away from
495 * the current one.
496 */
497int32_t RuleBasedBreakIterator::next(int32_t n) {
498    int32_t result = current();
499    while (n > 0) {
500        result = next();
501        --n;
502    }
503    while (n < 0) {
504        result = previous();
505        ++n;
506    }
507    return result;
508}
509
510/**
511 * Advances the iterator to the next boundary position.
512 * @return The position of the first boundary after this one.
513 */
514int32_t RuleBasedBreakIterator::next(void) {
515    // if we have cached break positions and we're still in the range
516    // covered by them, just move one step forward in the cache
517    if (fCachedBreakPositions != NULL) {
518        if (fPositionInCache < fNumCachedBreakPositions - 1) {
519            ++fPositionInCache;
520            int32_t pos = fCachedBreakPositions[fPositionInCache];
521            utext_setNativeIndex(fText, pos);
522            return pos;
523        }
524        else {
525            reset();
526        }
527    }
528
529    int32_t startPos = current();
530    int32_t result = handleNext(fData->fForwardTable);
531    if (fDictionaryCharCount > 0) {
532        result = checkDictionary(startPos, result, FALSE);
533    }
534    return result;
535}
536
537/**
538 * Advances the iterator backwards, to the last boundary preceding this one.
539 * @return The position of the last boundary position preceding this one.
540 */
541int32_t RuleBasedBreakIterator::previous(void) {
542    int32_t result;
543    int32_t startPos;
544
545    // if we have cached break positions and we're still in the range
546    // covered by them, just move one step backward in the cache
547    if (fCachedBreakPositions != NULL) {
548        if (fPositionInCache > 0) {
549            --fPositionInCache;
550            // If we're at the beginning of the cache, need to reevaluate the
551            // rule status
552            if (fPositionInCache <= 0) {
553                fLastStatusIndexValid = FALSE;
554            }
555            int32_t pos = fCachedBreakPositions[fPositionInCache];
556            utext_setNativeIndex(fText, pos);
557            return pos;
558        }
559        else {
560            reset();
561        }
562    }
563
564    // if we're already sitting at the beginning of the text, return DONE
565    if (fText == NULL || (startPos = current()) == 0) {
566        fLastRuleStatusIndex  = 0;
567        fLastStatusIndexValid = TRUE;
568        return BreakIterator::DONE;
569    }
570
571    if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
572        result = handlePrevious(fData->fReverseTable);
573        if (fDictionaryCharCount > 0) {
574            result = checkDictionary(result, startPos, TRUE);
575        }
576        return result;
577    }
578
579    // old rule syntax
580    // set things up.  handlePrevious() will back us up to some valid
581    // break position before the current position (we back our internal
582    // iterator up one step to prevent handlePrevious() from returning
583    // the current position), but not necessarily the last one before
584
585    // where we started
586
587    int32_t start = current();
588
589    UTEXT_PREVIOUS32(fText);
590    int32_t lastResult    = handlePrevious(fData->fReverseTable);
591    if (lastResult == UBRK_DONE) {
592        lastResult = 0;
593        utext_setNativeIndex(fText, 0);
594    }
595    result = lastResult;
596    int32_t lastTag       = 0;
597    UBool   breakTagValid = FALSE;
598
599    // iterate forward from the known break position until we pass our
600    // starting point.  The last break position before the starting
601    // point is our return value
602
603    for (;;) {
604        result         = next();
605        if (result == BreakIterator::DONE || result >= start) {
606            break;
607        }
608        lastResult     = result;
609        lastTag        = fLastRuleStatusIndex;
610        breakTagValid  = TRUE;
611    }
612
613    // fLastBreakTag wants to have the value for section of text preceding
614    // the result position that we are to return (in lastResult.)  If
615    // the backwards rules overshot and the above loop had to do two or more
616    // next()s to move up to the desired return position, we will have a valid
617    // tag value. But, if handlePrevious() took us to exactly the correct result positon,
618    // we wont have a tag value for that position, which is only set by handleNext().
619
620    // set the current iteration position to be the last break position
621    // before where we started, and then return that value
622    utext_setNativeIndex(fText, lastResult);
623    fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
624    fLastStatusIndexValid = breakTagValid;
625
626    // No need to check the dictionary; it will have been handled by
627    // next()
628
629    return lastResult;
630}
631
632/**
633 * Sets the iterator to refer to the first boundary position following
634 * the specified position.
635 * @offset The position from which to begin searching for a break position.
636 * @return The position of the first break after the current position.
637 */
638int32_t RuleBasedBreakIterator::following(int32_t offset) {
639    // if we have cached break positions and offset is in the range
640    // covered by them, use them
641    // TODO: could use binary search
642    // TODO: what if offset is outside range, but break is not?
643    if (fCachedBreakPositions != NULL) {
644        if (offset >= fCachedBreakPositions[0]
645                && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
646            fPositionInCache = 0;
647            // We are guaranteed not to leave the array due to range test above
648            while (offset >= fCachedBreakPositions[fPositionInCache]) {
649                ++fPositionInCache;
650            }
651            int32_t pos = fCachedBreakPositions[fPositionInCache];
652            utext_setNativeIndex(fText, pos);
653            return pos;
654        }
655        else {
656            reset();
657        }
658    }
659
660    // if the offset passed in is already past the end of the text,
661    // just return DONE; if it's before the beginning, return the
662    // text's starting offset
663    fLastRuleStatusIndex  = 0;
664    fLastStatusIndexValid = TRUE;
665    if (fText == NULL || offset >= utext_nativeLength(fText)) {
666        last();
667        return next();
668    }
669    else if (offset < 0) {
670        return first();
671    }
672
673    // otherwise, set our internal iteration position (temporarily)
674    // to the position passed in.  If this is the _beginning_ position,
675    // then we can just use next() to get our return value
676
677    int32_t result = 0;
678
679    if (fData->fSafeRevTable != NULL) {
680        // new rule syntax
681        utext_setNativeIndex(fText, offset);
682        // move forward one codepoint to prepare for moving back to a
683        // safe point.
684        // this handles offset being between a supplementary character
685        UTEXT_NEXT32(fText);
686        // handlePrevious will move most of the time to < 1 boundary away
687        handlePrevious(fData->fSafeRevTable);
688        int32_t result = next();
689        while (result <= offset) {
690            result = next();
691        }
692        return result;
693    }
694    if (fData->fSafeFwdTable != NULL) {
695        // backup plan if forward safe table is not available
696        utext_setNativeIndex(fText, offset);
697        UTEXT_PREVIOUS32(fText);
698        // handle next will give result >= offset
699        handleNext(fData->fSafeFwdTable);
700        // previous will give result 0 or 1 boundary away from offset,
701        // most of the time
702        // we have to
703        int32_t oldresult = previous();
704        while (oldresult > offset) {
705            int32_t result = previous();
706            if (result <= offset) {
707                return oldresult;
708            }
709            oldresult = result;
710        }
711        int32_t result = next();
712        if (result <= offset) {
713            return next();
714        }
715        return result;
716    }
717    // otherwise, we have to sync up first.  Use handlePrevious() to back
718    // up to a known break position before the specified position (if
719    // we can determine that the specified position is a break position,
720    // we don't back up at all).  This may or may not be the last break
721    // position at or before our starting position.  Advance forward
722    // from here until we've passed the starting position.  The position
723    // we stop on will be the first break position after the specified one.
724    // old rule syntax
725
726    utext_setNativeIndex(fText, offset);
727    if (offset==0 ||
728        offset==1  && utext_getNativeIndex(fText)==0) {
729        return next();
730    }
731    result = previous();
732
733    while (result != BreakIterator::DONE && result <= offset) {
734        result = next();
735    }
736
737    return result;
738}
739
740/**
741 * Sets the iterator to refer to the last boundary position before the
742 * specified position.
743 * @offset The position to begin searching for a break from.
744 * @return The position of the last boundary before the starting position.
745 */
746int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
747    // if we have cached break positions and offset is in the range
748    // covered by them, use them
749    if (fCachedBreakPositions != NULL) {
750        // TODO: binary search?
751        // TODO: What if offset is outside range, but break is not?
752        if (offset > fCachedBreakPositions[0]
753                && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
754            fPositionInCache = 0;
755            while (fPositionInCache < fNumCachedBreakPositions
756                   && offset > fCachedBreakPositions[fPositionInCache])
757                ++fPositionInCache;
758            --fPositionInCache;
759            // If we're at the beginning of the cache, need to reevaluate the
760            // rule status
761            if (fPositionInCache <= 0) {
762                fLastStatusIndexValid = FALSE;
763            }
764            utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
765            return fCachedBreakPositions[fPositionInCache];
766        }
767        else {
768            reset();
769        }
770    }
771
772    // if the offset passed in is already past the end of the text,
773    // just return DONE; if it's before the beginning, return the
774    // text's starting offset
775    if (fText == NULL || offset > utext_nativeLength(fText)) {
776        // return BreakIterator::DONE;
777        return last();
778    }
779    else if (offset < 0) {
780        return first();
781    }
782
783    // if we start by updating the current iteration position to the
784    // position specified by the caller, we can just use previous()
785    // to carry out this operation
786
787    if (fData->fSafeFwdTable != NULL) {
788        // new rule syntax
789        utext_setNativeIndex(fText, offset);
790        int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
791        if (newOffset != offset) {
792            // Will come here if specified offset was not a code point boundary AND
793            //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
794            //   indices to the containing code point.
795            // For breakitereator::preceding only, these non-code-point indices need to be moved
796            //   up to refer to the following codepoint.
797            UTEXT_NEXT32(fText);
798            offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
799        }
800
801        // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
802        //        rather than adjusting the position unconditionally?
803        //        (Change would interact with safe rules.)
804        // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
805        //        affects only preceding(), seems cleaner, but is slightly different.
806        UTEXT_PREVIOUS32(fText);
807        handleNext(fData->fSafeFwdTable);
808        int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
809        while (result >= offset) {
810            result = previous();
811        }
812        return result;
813    }
814    if (fData->fSafeRevTable != NULL) {
815        // backup plan if forward safe table is not available
816        //  TODO:  check whether this path can be discarded
817        //         It's probably OK to say that rules must supply both safe tables
818        //            if they use safe tables at all.  We have certainly never described
819        //            to anyone how to work with just one safe table.
820        utext_setNativeIndex(fText, offset);
821        UTEXT_NEXT32(fText);
822
823        // handle previous will give result <= offset
824        handlePrevious(fData->fSafeRevTable);
825
826        // next will give result 0 or 1 boundary away from offset,
827        // most of the time
828        // we have to
829        int32_t oldresult = next();
830        while (oldresult < offset) {
831            int32_t result = next();
832            if (result >= offset) {
833                return oldresult;
834            }
835            oldresult = result;
836        }
837        int32_t result = previous();
838        if (result >= offset) {
839            return previous();
840        }
841        return result;
842    }
843
844    // old rule syntax
845    utext_setNativeIndex(fText, offset);
846    return previous();
847}
848
849/**
850 * Returns true if the specfied position is a boundary position.  As a side
851 * effect, leaves the iterator pointing to the first boundary position at
852 * or after "offset".
853 * @param offset the offset to check.
854 * @return True if "offset" is a boundary position.
855 */
856UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
857    // the beginning index of the iterator is always a boundary position by definition
858    if (offset == 0) {
859        first();       // For side effects on current position, tag values.
860        return TRUE;
861    }
862
863    if (offset == (int32_t)utext_nativeLength(fText)) {
864        last();       // For side effects on current position, tag values.
865        return TRUE;
866    }
867
868    // out-of-range indexes are never boundary positions
869    if (offset < 0) {
870        first();       // For side effects on current position, tag values.
871        return FALSE;
872    }
873
874    if (offset > utext_nativeLength(fText)) {
875        last();        // For side effects on current position, tag values.
876        return FALSE;
877    }
878
879    // otherwise, we can use following() on the position before the specified
880    // one and return true if the position we get back is the one the user
881    // specified
882    utext_previous32From(fText, offset);
883    int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
884    UBool    result  = following(backOne) == offset;
885    return result;
886}
887
888/**
889 * Returns the current iteration position.
890 * @return The current iteration position.
891 */
892int32_t RuleBasedBreakIterator::current(void) const {
893    int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
894    return pos;
895}
896
897//=======================================================================
898// implementation
899//=======================================================================
900
901//
902// RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
903//                 of user text.  A variable with this enum type keeps track of where we
904//                 are.  The state machine only fetches user input while in the RUN mode.
905//
906enum RBBIRunMode {
907    RBBI_START,     // state machine processing is before first char of input
908    RBBI_RUN,       // state machine processing is in the user text
909    RBBI_END        // state machine processing is after end of user text.
910};
911
912
913//-----------------------------------------------------------------------------------
914//
915//  handleNext(stateTable)
916//     This method is the actual implementation of the rbbi next() method.
917//     This method initializes the state machine to state 1
918//     and advances through the text character by character until we reach the end
919//     of the text or the state machine transitions to state 0.  We update our return
920//     value every time the state machine passes through an accepting state.
921//
922//-----------------------------------------------------------------------------------
923int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
924    int32_t             state;
925    int16_t             category        = 0;
926    RBBIRunMode         mode;
927
928    RBBIStateTableRow  *row;
929    UChar32             c;
930    int32_t             lookaheadStatus = 0;
931    int32_t             lookaheadTagIdx = 0;
932    int32_t             result          = 0;
933    int32_t             initialPosition = 0;
934    int32_t             lookaheadResult = 0;
935    UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
936    const char         *tableData       = statetable->fTableData;
937    uint32_t            tableRowLen     = statetable->fRowLen;
938
939    #ifdef RBBI_DEBUG
940        if (fTrace) {
941            RBBIDebugPuts("Handle Next   pos   char  state category");
942        }
943    #endif
944
945    // No matter what, handleNext alway correctly sets the break tag value.
946    fLastStatusIndexValid = TRUE;
947    fLastRuleStatusIndex = 0;
948
949    // if we're already at the end of the text, return DONE.
950    initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
951    result          = initialPosition;
952    c               = UTEXT_NEXT32(fText);
953    if (fData == NULL || c==U_SENTINEL) {
954        return BreakIterator::DONE;
955    }
956
957    //  Set the initial state for the state machine
958    state = START_STATE;
959    row = (RBBIStateTableRow *)
960            //(statetable->fTableData + (statetable->fRowLen * state));
961            (tableData + tableRowLen * state);
962
963
964    mode     = RBBI_RUN;
965    if (statetable->fFlags & RBBI_BOF_REQUIRED) {
966        category = 2;
967        mode     = RBBI_START;
968    }
969
970
971    // loop until we reach the end of the text or transition to state 0
972    //
973    for (;;) {
974        if (c == U_SENTINEL) {
975            // Reached end of input string.
976            if (mode == RBBI_END) {
977                // We have already run the loop one last time with the
978                //   character set to the psueudo {eof} value.  Now it is time
979                //   to unconditionally bail out.
980                if (lookaheadResult > result) {
981                    // We ran off the end of the string with a pending look-ahead match.
982                    // Treat this as if the look-ahead condition had been met, and return
983                    //  the match at the / position from the look-ahead rule.
984                    result               = lookaheadResult;
985                    fLastRuleStatusIndex = lookaheadTagIdx;
986                    lookaheadStatus = 0;
987                }
988                break;
989            }
990            // Run the loop one last time with the fake end-of-input character category.
991            mode = RBBI_END;
992            category = 1;
993        }
994
995        //
996        // Get the char category.  An incoming category of 1 or 2 means that
997        //      we are preset for doing the beginning or end of input, and
998        //      that we shouldn't get a category from an actual text input character.
999        //
1000        if (mode == RBBI_RUN) {
1001            // look up the current character's character category, which tells us
1002            // which column in the state table to look at.
1003            // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1004            //        not the size of the character going in, which is a UChar32.
1005            //
1006            UTRIE_GET16(&fData->fTrie, c, category);
1007
1008            // Check the dictionary bit in the character's category.
1009            //    Counter is only used by dictionary based iterators (subclasses).
1010            //    Chars that need to be handled by a dictionary have a flag bit set
1011            //    in their category values.
1012            //
1013            if ((category & 0x4000) != 0)  {
1014                fDictionaryCharCount++;
1015                //  And off the dictionary flag bit.
1016                category &= ~0x4000;
1017            }
1018        }
1019
1020        #ifdef RBBI_DEBUG
1021            if (fTrace) {
1022                RBBIDebugPrintf("             %4d   ", utext_getNativeIndex(fText));
1023                if (0x20<=c && c<0x7f) {
1024                    RBBIDebugPrintf("\"%c\"  ", c);
1025                } else {
1026                    RBBIDebugPrintf("%5x  ", c);
1027                }
1028                RBBIDebugPrintf("%3d  %3d\n", state, category);
1029            }
1030        #endif
1031
1032        // State Transition - move machine to its next state
1033        //
1034        state = row->fNextState[category];
1035        row = (RBBIStateTableRow *)
1036            // (statetable->fTableData + (statetable->fRowLen * state));
1037            (tableData + tableRowLen * state);
1038
1039
1040        if (row->fAccepting == -1) {
1041            // Match found, common case.
1042            if (mode != RBBI_START) {
1043                result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1044            }
1045            fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
1046        }
1047
1048        if (row->fLookAhead != 0) {
1049            if (lookaheadStatus != 0
1050                && row->fAccepting == lookaheadStatus) {
1051                // Lookahead match is completed.
1052                result               = lookaheadResult;
1053                fLastRuleStatusIndex = lookaheadTagIdx;
1054                lookaheadStatus      = 0;
1055                // TODO:  make a standalone hard break in a rule work.
1056                if (lookAheadHardBreak) {
1057                    UTEXT_SETNATIVEINDEX(fText, result);
1058                    return result;
1059                }
1060                // Look-ahead completed, but other rules may match further.  Continue on
1061                //  TODO:  junk this feature?  I don't think it's used anywhwere.
1062                goto continueOn;
1063            }
1064
1065            int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1066            lookaheadResult = r;
1067            lookaheadStatus = row->fLookAhead;
1068            lookaheadTagIdx = row->fTagIdx;
1069            goto continueOn;
1070        }
1071
1072
1073        if (row->fAccepting != 0) {
1074            // Because this is an accepting state, any in-progress look-ahead match
1075            //   is no longer relavant.  Clear out the pending lookahead status.
1076            lookaheadStatus = 0;           // clear out any pending look-ahead match.
1077        }
1078
1079continueOn:
1080        if (state == STOP_STATE) {
1081            // This is the normal exit from the lookup state machine.
1082            // We have advanced through the string until it is certain that no
1083            //   longer match is possible, no matter what characters follow.
1084            break;
1085        }
1086
1087        // Advance to the next character.
1088        // If this is a beginning-of-input loop iteration, don't advance
1089        //    the input position.  The next iteration will be processing the
1090        //    first real input character.
1091        if (mode == RBBI_RUN) {
1092            c = UTEXT_NEXT32(fText);
1093        } else {
1094            if (mode == RBBI_START) {
1095                mode = RBBI_RUN;
1096            }
1097        }
1098
1099
1100    }
1101
1102    // The state machine is done.  Check whether it found a match...
1103
1104    // If the iterator failed to advance in the match engine, force it ahead by one.
1105    //   (This really indicates a defect in the break rules.  They should always match
1106    //    at least one character.)
1107    if (result == initialPosition) {
1108        UTEXT_SETNATIVEINDEX(fText, initialPosition);
1109        UTEXT_NEXT32(fText);
1110        result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1111    }
1112
1113    // Leave the iterator at our result position.
1114    UTEXT_SETNATIVEINDEX(fText, result);
1115    #ifdef RBBI_DEBUG
1116        if (fTrace) {
1117            RBBIDebugPrintf("result = %d\n\n", result);
1118        }
1119    #endif
1120    return result;
1121}
1122
1123
1124
1125//-----------------------------------------------------------------------------------
1126//
1127//  handlePrevious()
1128//
1129//      Iterate backwards, according to the logic of the reverse rules.
1130//      This version handles the exact style backwards rules.
1131//
1132//      The logic of this function is very similar to handleNext(), above.
1133//
1134//-----------------------------------------------------------------------------------
1135int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
1136    int32_t             state;
1137    int16_t             category        = 0;
1138    RBBIRunMode         mode;
1139    RBBIStateTableRow  *row;
1140    UChar32             c;
1141    int32_t             lookaheadStatus = 0;
1142    int32_t             result          = 0;
1143    int32_t             initialPosition = 0;
1144    int32_t             lookaheadResult = 0;
1145    UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1146
1147    #ifdef RBBI_DEBUG
1148        if (fTrace) {
1149            RBBIDebugPuts("Handle Previous   pos   char  state category");
1150        }
1151    #endif
1152
1153    // handlePrevious() never gets the rule status.
1154    // Flag the status as invalid; if the user ever asks for status, we will need
1155    // to back up, then re-find the break position using handleNext(), which does
1156    // get the status value.
1157    fLastStatusIndexValid = FALSE;
1158    fLastRuleStatusIndex = 0;
1159
1160    // if we're already at the start of the text, return DONE.
1161    if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
1162        return BreakIterator::DONE;
1163    }
1164
1165    //  Set up the starting char.
1166    initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1167    result          = initialPosition;
1168    c               = UTEXT_PREVIOUS32(fText);
1169
1170    //  Set the initial state for the state machine
1171    state = START_STATE;
1172    row = (RBBIStateTableRow *)
1173            (statetable->fTableData + (statetable->fRowLen * state));
1174    category = 3;
1175    mode     = RBBI_RUN;
1176    if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1177        category = 2;
1178        mode     = RBBI_START;
1179    }
1180
1181
1182    // loop until we reach the start of the text or transition to state 0
1183    //
1184    for (;;) {
1185        if (c == U_SENTINEL) {
1186            // Reached end of input string.
1187            if (mode == RBBI_END ||
1188                *(int32_t *)fData->fHeader->fFormatVersion == 1 ) {
1189                // We have already run the loop one last time with the
1190                //   character set to the psueudo {eof} value.  Now it is time
1191                //   to unconditionally bail out.
1192                //  (Or we have an old format binary rule file that does not support {eof}.)
1193                if (lookaheadResult < result) {
1194                    // We ran off the end of the string with a pending look-ahead match.
1195                    // Treat this as if the look-ahead condition had been met, and return
1196                    //  the match at the / position from the look-ahead rule.
1197                    result               = lookaheadResult;
1198                    lookaheadStatus = 0;
1199                } else if (result == initialPosition) {
1200                    // Ran off start, no match found.
1201                    // move one index one (towards the start, since we are doing a previous())
1202                    UTEXT_SETNATIVEINDEX(fText, initialPosition);
1203                    UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
1204                }
1205                break;
1206            }
1207            // Run the loop one last time with the fake end-of-input character category.
1208            mode = RBBI_END;
1209            category = 1;
1210        }
1211
1212        //
1213        // Get the char category.  An incoming category of 1 or 2 means that
1214        //      we are preset for doing the beginning or end of input, and
1215        //      that we shouldn't get a category from an actual text input character.
1216        //
1217        if (mode == RBBI_RUN) {
1218            // look up the current character's character category, which tells us
1219            // which column in the state table to look at.
1220            // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1221            //        not the size of the character going in, which is a UChar32.
1222            //
1223            UTRIE_GET16(&fData->fTrie, c, category);
1224
1225            // Check the dictionary bit in the character's category.
1226            //    Counter is only used by dictionary based iterators (subclasses).
1227            //    Chars that need to be handled by a dictionary have a flag bit set
1228            //    in their category values.
1229            //
1230            if ((category & 0x4000) != 0)  {
1231                fDictionaryCharCount++;
1232                //  And off the dictionary flag bit.
1233                category &= ~0x4000;
1234            }
1235        }
1236
1237        #ifdef RBBI_DEBUG
1238            if (fTrace) {
1239                RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
1240                if (0x20<=c && c<0x7f) {
1241                    RBBIDebugPrintf("\"%c\"  ", c);
1242                } else {
1243                    RBBIDebugPrintf("%5x  ", c);
1244                }
1245                RBBIDebugPrintf("%3d  %3d\n", state, category);
1246            }
1247        #endif
1248
1249        // State Transition - move machine to its next state
1250        //
1251        state = row->fNextState[category];
1252        row = (RBBIStateTableRow *)
1253            (statetable->fTableData + (statetable->fRowLen * state));
1254
1255        if (row->fAccepting == -1) {
1256            // Match found, common case.
1257            result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1258        }
1259
1260        if (row->fLookAhead != 0) {
1261            if (lookaheadStatus != 0
1262                && row->fAccepting == lookaheadStatus) {
1263                // Lookahead match is completed.
1264                result               = lookaheadResult;
1265                lookaheadStatus      = 0;
1266                // TODO:  make a standalone hard break in a rule work.
1267                if (lookAheadHardBreak) {
1268                    UTEXT_SETNATIVEINDEX(fText, result);
1269                    return result;
1270                }
1271                // Look-ahead completed, but other rules may match further.  Continue on
1272                //  TODO:  junk this feature?  I don't think it's used anywhwere.
1273                goto continueOn;
1274            }
1275
1276            int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1277            lookaheadResult = r;
1278            lookaheadStatus = row->fLookAhead;
1279            goto continueOn;
1280        }
1281
1282
1283        if (row->fAccepting != 0) {
1284            // Because this is an accepting state, any in-progress look-ahead match
1285            //   is no longer relavant.  Clear out the pending lookahead status.
1286            lookaheadStatus = 0;
1287        }
1288
1289continueOn:
1290        if (state == STOP_STATE) {
1291            // This is the normal exit from the lookup state machine.
1292            // We have advanced through the string until it is certain that no
1293            //   longer match is possible, no matter what characters follow.
1294            break;
1295        }
1296
1297        // Move (backwards) to the next character to process.
1298        // If this is a beginning-of-input loop iteration, don't advance
1299        //    the input position.  The next iteration will be processing the
1300        //    first real input character.
1301        if (mode == RBBI_RUN) {
1302            c = UTEXT_PREVIOUS32(fText);
1303        } else {
1304            if (mode == RBBI_START) {
1305                mode = RBBI_RUN;
1306            }
1307        }
1308    }
1309
1310    // The state machine is done.  Check whether it found a match...
1311
1312    // If the iterator failed to advance in the match engine, force it ahead by one.
1313    //   (This really indicates a defect in the break rules.  They should always match
1314    //    at least one character.)
1315    if (result == initialPosition) {
1316        UTEXT_SETNATIVEINDEX(fText, initialPosition);
1317        UTEXT_PREVIOUS32(fText);
1318        result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1319    }
1320
1321    // Leave the iterator at our result position.
1322    UTEXT_SETNATIVEINDEX(fText, result);
1323    #ifdef RBBI_DEBUG
1324        if (fTrace) {
1325            RBBIDebugPrintf("result = %d\n\n", result);
1326        }
1327    #endif
1328    return result;
1329}
1330
1331
1332void
1333RuleBasedBreakIterator::reset()
1334{
1335    if (fCachedBreakPositions) {
1336        uprv_free(fCachedBreakPositions);
1337    }
1338    fCachedBreakPositions = NULL;
1339    fNumCachedBreakPositions = 0;
1340    fDictionaryCharCount = 0;
1341    fPositionInCache = 0;
1342}
1343
1344
1345
1346//-------------------------------------------------------------------------------
1347//
1348//   getRuleStatus()   Return the break rule tag associated with the current
1349//                     iterator position.  If the iterator arrived at its current
1350//                     position by iterating forwards, the value will have been
1351//                     cached by the handleNext() function.
1352//
1353//                     If no cached status value is available, the status is
1354//                     found by doing a previous() followed by a next(), which
1355//                     leaves the iterator where it started, and computes the
1356//                     status while doing the next().
1357//
1358//-------------------------------------------------------------------------------
1359void RuleBasedBreakIterator::makeRuleStatusValid() {
1360    if (fLastStatusIndexValid == FALSE) {
1361        //  No cached status is available.
1362        if (fText == NULL || current() == 0) {
1363            //  At start of text, or there is no text.  Status is always zero.
1364            fLastRuleStatusIndex = 0;
1365            fLastStatusIndexValid = TRUE;
1366        } else {
1367            //  Not at start of text.  Find status the tedious way.
1368            int32_t pa = current();
1369            previous();
1370            if (fNumCachedBreakPositions > 0) {
1371                reset();                // Blow off the dictionary cache
1372            }
1373            int32_t pb = next();
1374            if (pa != pb) {
1375                // note: the if (pa != pb) test is here only to eliminate warnings for
1376                //       unused local variables on gcc.  Logically, it isn't needed.
1377                U_ASSERT(pa == pb);
1378            }
1379        }
1380    }
1381    U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
1382}
1383
1384
1385int32_t  RuleBasedBreakIterator::getRuleStatus() const {
1386    RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1387    nonConstThis->makeRuleStatusValid();
1388
1389    // fLastRuleStatusIndex indexes to the start of the appropriate status record
1390    //                                                 (the number of status values.)
1391    //   This function returns the last (largest) of the array of status values.
1392    int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
1393    int32_t  tagVal = fData->fRuleStatusTable[idx];
1394
1395    return tagVal;
1396}
1397
1398
1399
1400
1401int32_t RuleBasedBreakIterator::getRuleStatusVec(
1402             int32_t *fillInVec, int32_t capacity, UErrorCode &status)
1403{
1404    if (U_FAILURE(status)) {
1405        return 0;
1406    }
1407
1408    RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1409    nonConstThis->makeRuleStatusValid();
1410    int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
1411    int32_t  numValsToCopy = numVals;
1412    if (numVals > capacity) {
1413        status = U_BUFFER_OVERFLOW_ERROR;
1414        numValsToCopy = capacity;
1415    }
1416    int i;
1417    for (i=0; i<numValsToCopy; i++) {
1418        fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
1419    }
1420    return numVals;
1421}
1422
1423
1424
1425//-------------------------------------------------------------------------------
1426//
1427//   getBinaryRules        Access to the compiled form of the rules,
1428//                         for use by build system tools that save the data
1429//                         for standard iterator types.
1430//
1431//-------------------------------------------------------------------------------
1432const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1433    const uint8_t  *retPtr = NULL;
1434    length = 0;
1435
1436    if (fData != NULL) {
1437        retPtr = (const uint8_t *)fData->fHeader;
1438        length = fData->fHeader->fLength;
1439    }
1440    return retPtr;
1441}
1442
1443
1444
1445
1446//-------------------------------------------------------------------------------
1447//
1448//  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated.
1449//                    Saving one heap allocation isn't worth the trouble.
1450//                    Cloning shouldn't be done in tight loops, and
1451//                    making the clone copy involves other heap operations anyway.
1452//                    And the application code for correctly dealing with buffer
1453//                    size problems and the eventual object destruction is ugly.
1454//
1455//-------------------------------------------------------------------------------
1456BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
1457                                   int32_t &bufferSize,
1458                                   UErrorCode &status)
1459{
1460    if (U_FAILURE(status)){
1461        return NULL;
1462    }
1463
1464    //
1465    //  If user buffer size is zero this is a preflight operation to
1466    //    obtain the needed buffer size, allowing for worst case misalignment.
1467    //
1468    if (bufferSize == 0) {
1469        bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
1470        return NULL;
1471    }
1472
1473
1474    //
1475    //  Check the alignment and size of the user supplied buffer.
1476    //  Allocate heap memory if the user supplied memory is insufficient.
1477    //
1478    char    *buf   = (char *)stackBuffer;
1479    uint32_t s      = bufferSize;
1480
1481    if (stackBuffer == NULL) {
1482        s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
1483    }
1484    if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
1485        uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
1486        s   -= offsetUp;
1487        buf += offsetUp;
1488    }
1489    if (s < sizeof(RuleBasedBreakIterator)) {
1490        // Not enough room in the caller-supplied buffer.
1491        // Do a plain-vanilla heap based clone and return that, along with
1492        //   a warning that the clone was allocated.
1493        RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
1494        if (clonedBI == 0) {
1495            status = U_MEMORY_ALLOCATION_ERROR;
1496        } else {
1497            status = U_SAFECLONE_ALLOCATED_WARNING;
1498        }
1499        return clonedBI;
1500    }
1501
1502    //
1503    //  Clone the source BI into the caller-supplied buffer.
1504    //    TODO:  using an overloaded operator new to directly initialize the
1505    //           copy in the user's buffer would be better, but it doesn't seem
1506    //           to get along with namespaces.  Investigate why.
1507    //
1508    //           The memcpy is only safe with an empty (default constructed)
1509    //           break iterator.  Use on others can screw up reference counts
1510    //           to data.  memcpy-ing objects is not really a good idea...
1511    //
1512    RuleBasedBreakIterator localIter;        // Empty break iterator, source for memcpy
1513    RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
1514    uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part
1515    clone->init();                // Init RuleBasedBreakIterator part, (user default constructor)
1516    *clone = *this;               // clone = the real BI we want.
1517    clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
1518
1519    return clone;
1520}
1521
1522
1523//-------------------------------------------------------------------------------
1524//
1525//  isDictionaryChar      Return true if the category lookup for this char
1526//                        indicates that it is in the set of dictionary lookup
1527//                        chars.
1528//
1529//                        This function is intended for use by dictionary based
1530//                        break iterators.
1531//
1532//-------------------------------------------------------------------------------
1533/*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
1534    if (fData == NULL) {
1535        return FALSE;
1536    }
1537    uint16_t category;
1538    UTRIE_GET16(&fData->fTrie, c, category);
1539    return (category & 0x4000) != 0;
1540}*/
1541
1542
1543//-------------------------------------------------------------------------------
1544//
1545//  checkDictionary       This function handles all processing of characters in
1546//                        the "dictionary" set. It will determine the appropriate
1547//                        course of action, and possibly set up a cache in the
1548//                        process.
1549//
1550//-------------------------------------------------------------------------------
1551int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
1552                            int32_t endPos,
1553                            UBool reverse) {
1554    // Reset the old break cache first.
1555    uint32_t dictionaryCount = fDictionaryCharCount;
1556    reset();
1557
1558    if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
1559        return (reverse ? startPos : endPos);
1560    }
1561
1562    // Starting from the starting point, scan towards the proposed result,
1563    // looking for the first dictionary character (which may be the one
1564    // we're on, if we're starting in the middle of a range).
1565    utext_setNativeIndex(fText, reverse ? endPos : startPos);
1566    if (reverse) {
1567        UTEXT_PREVIOUS32(fText);
1568    }
1569
1570    int32_t rangeStart = startPos;
1571    int32_t rangeEnd = endPos;
1572
1573    uint16_t    category;
1574    int32_t     current;
1575    UErrorCode  status = U_ZERO_ERROR;
1576    UStack      breaks(status);
1577    int32_t     foundBreakCount = 0;
1578    UChar32     c = utext_current32(fText);
1579
1580    UTRIE_GET16(&fData->fTrie, c, category);
1581
1582    // Is the character we're starting on a dictionary character? If so, we
1583    // need to back up to include the entire run; otherwise the results of
1584    // the break algorithm will differ depending on where we start. Since
1585    // the result is cached and there is typically a non-dictionary break
1586    // within a small number of words, there should be little performance impact.
1587    if (category & 0x4000) {
1588        if (reverse) {
1589            do {
1590                utext_next32(fText);          // TODO:  recast to work directly with postincrement.
1591                c = utext_current32(fText);
1592                UTRIE_GET16(&fData->fTrie, c, category);
1593            } while (c != U_SENTINEL && (category & 0x4000));
1594            // Back up to the last dictionary character
1595            rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1596            if (c == U_SENTINEL) {
1597                // c = fText->last32();
1598                //   TODO:  why was this if needed?
1599                c = UTEXT_PREVIOUS32(fText);
1600            }
1601            else {
1602                c = UTEXT_PREVIOUS32(fText);
1603            }
1604        }
1605        else {
1606            do {
1607                c = UTEXT_PREVIOUS32(fText);
1608                UTRIE_GET16(&fData->fTrie, c, category);
1609            }
1610            while (c != U_SENTINEL && (category & 0x4000));
1611            // Back up to the last dictionary character
1612            if (c == U_SENTINEL) {
1613                // c = fText->first32();
1614                c = utext_current32(fText);
1615            }
1616            else {
1617                utext_next32(fText);
1618                c = utext_current32(fText);
1619            }
1620            rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
1621        }
1622        UTRIE_GET16(&fData->fTrie, c, category);
1623    }
1624
1625    // Loop through the text, looking for ranges of dictionary characters.
1626    // For each span, find the appropriate break engine, and ask it to find
1627    // any breaks within the span.
1628    // Note: we always do this in the forward direction, so that the break
1629    // cache is built in the right order.
1630    if (reverse) {
1631        utext_setNativeIndex(fText, rangeStart);
1632        c = utext_current32(fText);
1633        UTRIE_GET16(&fData->fTrie, c, category);
1634    }
1635    while(U_SUCCESS(status)) {
1636        while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
1637            utext_next32(fText);           // TODO:  tweak for post-increment operation
1638            c = utext_current32(fText);
1639            UTRIE_GET16(&fData->fTrie, c, category);
1640        }
1641        if (current >= rangeEnd) {
1642            break;
1643        }
1644
1645        // We now have a dictionary character. Get the appropriate language object
1646        // to deal with it.
1647        const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
1648
1649        // Ask the language object if there are any breaks. It will leave the text
1650        // pointer on the other side of its range, ready to search for the next one.
1651        if (lbe != NULL) {
1652            foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
1653        }
1654
1655        // Reload the loop variables for the next go-round
1656        c = utext_current32(fText);
1657        UTRIE_GET16(&fData->fTrie, c, category);
1658    }
1659
1660    // If we found breaks, build a new break cache. The first and last entries must
1661    // be the original starting and ending position.
1662    if (foundBreakCount > 0) {
1663        int32_t totalBreaks = foundBreakCount;
1664        if (startPos < breaks.elementAti(0)) {
1665            totalBreaks += 1;
1666        }
1667        if (endPos > breaks.peeki()) {
1668            totalBreaks += 1;
1669        }
1670        fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
1671        if (fCachedBreakPositions != NULL) {
1672            int32_t out = 0;
1673            fNumCachedBreakPositions = totalBreaks;
1674            if (startPos < breaks.elementAti(0)) {
1675                fCachedBreakPositions[out++] = startPos;
1676            }
1677            for (int32_t i = 0; i < foundBreakCount; ++i) {
1678                fCachedBreakPositions[out++] = breaks.elementAti(i);
1679            }
1680            if (endPos > fCachedBreakPositions[out-1]) {
1681                fCachedBreakPositions[out] = endPos;
1682            }
1683            // If there are breaks, then by definition, we are replacing the original
1684            // proposed break by one of the breaks we found. Use following() and
1685            // preceding() to do the work. They should never recurse in this case.
1686            if (reverse) {
1687                return preceding(endPos - 1);
1688            }
1689            else {
1690                return following(startPos);
1691            }
1692        }
1693        // If the allocation failed, just fall through to the "no breaks found" case.
1694    }
1695
1696    // If we get here, there were no language-based breaks. Set the text pointer
1697    // to the original proposed break.
1698    utext_setNativeIndex(fText, reverse ? startPos : endPos);
1699    return (reverse ? startPos : endPos);
1700}
1701
1702U_NAMESPACE_END
1703
1704// defined in ucln_cmn.h
1705
1706static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;
1707
1708/**
1709 * Release all static memory held by breakiterator.
1710 */
1711U_CDECL_BEGIN
1712static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
1713    if (gLanguageBreakFactories) {
1714        delete gLanguageBreakFactories;
1715        gLanguageBreakFactories = NULL;
1716    }
1717    return TRUE;
1718}
1719U_CDECL_END
1720
1721U_CDECL_BEGIN
1722static void U_CALLCONV _deleteFactory(void *obj) {
1723    delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj;
1724}
1725U_CDECL_END
1726U_NAMESPACE_BEGIN
1727
1728static const LanguageBreakEngine*
1729getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
1730{
1731    UBool       needsInit;
1732    UErrorCode  status = U_ZERO_ERROR;
1733    UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
1734
1735    if (needsInit) {
1736        UStack  *factories = new UStack(_deleteFactory, NULL, status);
1737        if (factories != NULL && U_SUCCESS(status)) {
1738            ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1739            factories->push(builtIn, status);
1740#ifdef U_LOCAL_SERVICE_HOOK
1741            LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1742            if (extra != NULL) {
1743                factories->push(extra, status);
1744            }
1745#endif
1746        }
1747        umtx_lock(NULL);
1748        if (gLanguageBreakFactories == NULL) {
1749            gLanguageBreakFactories = factories;
1750            factories = NULL;
1751            ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
1752        }
1753        umtx_unlock(NULL);
1754        delete factories;
1755    }
1756
1757    if (gLanguageBreakFactories == NULL) {
1758        return NULL;
1759    }
1760
1761    int32_t i = gLanguageBreakFactories->size();
1762    const LanguageBreakEngine *lbe = NULL;
1763    while (--i >= 0) {
1764        LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1765        lbe = factory->getEngineFor(c, breakType);
1766        if (lbe != NULL) {
1767            break;
1768        }
1769    }
1770    return lbe;
1771}
1772
1773
1774//-------------------------------------------------------------------------------
1775//
1776//  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
1777//                          the characer c.
1778//
1779//-------------------------------------------------------------------------------
1780const LanguageBreakEngine *
1781RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1782    const LanguageBreakEngine *lbe = NULL;
1783    UErrorCode status = U_ZERO_ERROR;
1784
1785    if (fLanguageBreakEngines == NULL) {
1786        fLanguageBreakEngines = new UStack(status);
1787        if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
1788            delete fLanguageBreakEngines;
1789            fLanguageBreakEngines = 0;
1790            return NULL;
1791        }
1792    }
1793
1794    int32_t i = fLanguageBreakEngines->size();
1795    while (--i >= 0) {
1796        lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1797        if (lbe->handles(c, fBreakType)) {
1798            return lbe;
1799        }
1800    }
1801
1802    // No existing dictionary took the character. See if a factory wants to
1803    // give us a new LanguageBreakEngine for this character.
1804    lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
1805
1806    // If we got one, use it and push it on our stack.
1807    if (lbe != NULL) {
1808        fLanguageBreakEngines->push((void *)lbe, status);
1809        // Even if we can't remember it, we can keep looking it up, so
1810        // return it even if the push fails.
1811        return lbe;
1812    }
1813
1814    // No engine is forthcoming for this character. Add it to the
1815    // reject set. Create the reject break engine if needed.
1816    if (fUnhandledBreakEngine == NULL) {
1817        fUnhandledBreakEngine = new UnhandledEngine(status);
1818        if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1819            status = U_MEMORY_ALLOCATION_ERROR;
1820        }
1821        // Put it last so that scripts for which we have an engine get tried
1822        // first.
1823        fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1824        // If we can't insert it, or creation failed, get rid of it
1825        if (U_FAILURE(status)) {
1826            delete fUnhandledBreakEngine;
1827            fUnhandledBreakEngine = 0;
1828            return NULL;
1829        }
1830    }
1831
1832    // Tell the reject engine about the character; at its discretion, it may
1833    // add more than just the one character.
1834    fUnhandledBreakEngine->handleCharacter(c, fBreakType);
1835
1836    return fUnhandledBreakEngine;
1837}
1838
1839
1840
1841/*int32_t RuleBasedBreakIterator::getBreakType() const {
1842    return fBreakType;
1843}*/
1844
1845void RuleBasedBreakIterator::setBreakType(int32_t type) {
1846    fBreakType = type;
1847    reset();
1848}
1849
1850U_NAMESPACE_END
1851
1852#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1853