1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6/************************************************************************
7*   Date        Name        Description
8*   12/15/99    Madhu        Creation.
9*   01/12/2000  Madhu        Updated for changed API and added new tests
10************************************************************************/
11
12#include "utypeinfo.h"  // for 'typeid' to work
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/utypes.h"
19#include "unicode/brkiter.h"
20#include "unicode/rbbi.h"
21#include "unicode/uchar.h"
22#include "unicode/utf16.h"
23#include "unicode/ucnv.h"
24#include "unicode/schriter.h"
25#include "unicode/uniset.h"
26#if !UCONFIG_NO_REGULAR_EXPRESSIONS
27#include "unicode/regex.h"
28#endif
29#include "unicode/ustring.h"
30#include "unicode/utext.h"
31#include "intltest.h"
32#include "rbbitst.h"
33#include <string.h>
34#include "uvector.h"
35#include "uvectr32.h"
36#include <string.h>
37#include <stdio.h>
38#include <stdlib.h>
39#include "unicode/numfmt.h"
40#include "unicode/uscript.h"
41
// TEST_ASSERT(x): if the expression x evaluates to false, report a failure through errln(),
//                 naming the source file and line of the macro invocation.
//                 The macro only reports; it does not return from the enclosing function.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// TEST_ASSERT_SUCCESS(errcode): if errcode is a failing UErrorCode, report it through
//                 errcheckln(), naming the file, line and u_errorName() of the code.
//                 The macro only reports; it does not return from the enclosing function.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
47
48
49//---------------------------------------------
50// runIndexedTest
51//---------------------------------------------
52
53
54//  Note:  Before adding new tests to this file, check whether the desired test data can
55//         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
56//         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
58//         will run there as well, without additional effort.
59
60void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
61{
62    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
63
64    switch (index) {
65#if !UCONFIG_NO_FILE_IO
66        case 0: name = "TestBug4153072";
67            if(exec) TestBug4153072();                         break;
68#else
69        case 0: name = "skip";
70            break;
71#endif
72
73        case 1: name = "skip";
74            break;
75        case 2: name = "TestStatusReturn";
76            if(exec) TestStatusReturn();                       break;
77
78#if !UCONFIG_NO_FILE_IO
79        case 3: name = "TestUnicodeFiles";
80            if(exec) TestUnicodeFiles();                       break;
81        case 4: name = "TestEmptyString";
82            if(exec) TestEmptyString();                        break;
83#else
84        case 3: case 4: name = "skip";
85            break;
86#endif
87
88        case 5: name = "TestGetAvailableLocales";
89            if(exec) TestGetAvailableLocales();                break;
90
91        case 6: name = "TestGetDisplayName";
92            if(exec) TestGetDisplayName();                     break;
93
94#if !UCONFIG_NO_FILE_IO
95        case 7: name = "TestEndBehaviour";
96            if(exec) TestEndBehaviour();                       break;
97        case 8: case 9: case 10: name = "skip";
98             break;
99        case 11: name = "TestWordBreaks";
100             if(exec) TestWordBreaks();                        break;
101        case 12: name = "TestWordBoundary";
102             if(exec) TestWordBoundary();                      break;
103        case 13: name = "TestLineBreaks";
104             if(exec) TestLineBreaks();                        break;
105        case 14: name = "TestSentBreaks";
106             if(exec) TestSentBreaks();                        break;
107        case 15: name = "TestExtended";
108             if(exec) TestExtended();                          break;
109#else
110        case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
111             break;
112#endif
113
114#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
115        case 16:
116            name = "TestMonkey"; if(exec)  TestMonkey(params); break;
117#else
118        case 16:
119             name = "skip";                                    break;
120#endif
121
122#if !UCONFIG_NO_FILE_IO
123        case 17: name = "TestBug3818";
124            if(exec) TestBug3818();                            break;
125#else
126        case 17: name = "skip";
127            break;
128#endif
129
130        case 18: name = "skip";
131            break;
132        case 19: name = "TestDebug";
133            if(exec) TestDebug();                              break;
134        case 20: name = "skip";
135            break;
136
137#if !UCONFIG_NO_FILE_IO
138        case 21: name = "TestBug5775";
139            if (exec) TestBug5775();                           break;
140#else
141        case 21: name = "skip";
142            break;
143#endif
144
145        case 22: name = "TestBug9983";
146            if (exec) TestBug9983();                           break;
147        case 23: name = "TestDictRules";
148            if (exec) TestDictRules();                         break;
149        case 24: name = "TestBug5532";
150            if (exec) TestBug5532();                           break;
151        default: name = ""; break; //needed to end loop
152    }
153}
154
155
156//---------------------------------------------------------------------------
157//
158//   class BITestData   Holds a set of Break iterator test data and results
159//                      Includes
160//                         - the string data to be broken
161//                         - a vector of the expected break positions.
162//                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
164//                         - The expected break tag values.
165//                         - Vectors of actual break positions and tag values.
166//                         - Functions for comparing actual with expected and
167//                            reporting errors.
168//
169//----------------------------------------------------------------------------
// Holds one set of break iterator test data: the text, the expected break
// positions and tags, and the positions and tags an iterator actually produced.
class BITestData {
public:
    UnicodeString    fDataToBreak;            // Concatenation of every chunk passed to addDataChunk().
    UVector          fExpectedBreakPositions; // Length of fDataToBreak after each chunk, i.e. each chunk's end offset.
    UVector          fExpectedTags;           // Tag supplied with each chunk, parallel to fExpectedBreakPositions.
    UVector          fLineNum;                // Source line supplied with each chunk, parallel to fExpectedBreakPositions.
    UVector          fActualBreakPositions;   // Test Results.
    UVector          fActualTags;             // Tags recorded with the results, parallel to fActualBreakPositions.

    // Constructs all five vectors with the caller's status.
    BITestData(UErrorCode &status);

    // Appends one chunk of expected data.  Does nothing if status is already a failure.
    // NOTE: status is taken by value, so failures raised while adding elements
    //       are not visible to the caller.
    void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);

    // Compares actual results against expected ones and reports differences through 'test'.
    void             checkResults(const char *heading, RBBITest *test);

    // Reports a single break position mismatch through 'test'.
    void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);

    // Empties fActualBreakPositions and fActualTags; the expected data is untouched.
    void             clearResults();
};
185
186//
187// Constructor.
188//
189BITestData::BITestData(UErrorCode &status)
190: fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
191  fActualTags(status)
192{
193}
194
195//
// addDataChunk.   Add a section (non-breaking) piece of data to the test data.
197//                 The macro form collects the line number, which is helpful
198//                 when tracking down failures.
199//
200//                 A null data item is inserted at the start of each test's data
201//                  to put the starting zero into the data list.  The position saved for
202//                  each non-null item is its ending position.
203//
204#define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
205void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
206    if (U_FAILURE(status)) {return;}
207    if (data != NULL) {
208        fDataToBreak.append(CharsToUnicodeString(data));
209    }
210    fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
211    fExpectedTags.addElement(tag, status);
212    fLineNum.addElement(lineNum, status);
213}
214
215
216//
217//  checkResults.   Compare the actual and expected break positions, report any differences.
218//
219void BITestData::checkResults(const char *heading, RBBITest *test) {
220    int32_t   expectedIndex = 0;
221    int32_t   actualIndex = 0;
222
223    for (;;) {
224        // If we've run through both the expected and actual results vectors, we're done.
225        //   break out of the loop.
226        if (expectedIndex >= fExpectedBreakPositions.size() &&
227            actualIndex   >= fActualBreakPositions.size()) {
228            break;
229        }
230
231
232        if (expectedIndex >= fExpectedBreakPositions.size()) {
233            err(heading, test, expectedIndex-1, actualIndex);
234            actualIndex++;
235            continue;
236        }
237
238        if (actualIndex >= fActualBreakPositions.size()) {
239            err(heading, test, expectedIndex, actualIndex-1);
240            expectedIndex++;
241            continue;
242        }
243
244        if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
245            err(heading, test, expectedIndex, actualIndex);
246            // Try to resync the positions of the indices, to avoid a rash of spurious erros.
247            if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
248                actualIndex++;
249            } else {
250                expectedIndex++;
251            }
252            continue;
253        }
254
255        if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
256            test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
257                heading, fLineNum.elementAt(expectedIndex),
258                fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
259        }
260
261        actualIndex++;
262        expectedIndex++;
263    }
264}
265
266//
267//  err   -  An error was found.  Report it, along with information about where the
268//                                incorrectly broken test data appeared in the source file.
269//
270void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
271{
272    int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
273    int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
274    int32_t   o        = 0;
275    int32_t   line     = fLineNum.elementAti(expectedIdx);
276    if (expectedIdx > 0) {
277        // The line numbers are off by one because a premature break occurs somewhere
278        //    within the previous item, rather than at the start of the current (expected) item.
279        //    We want to report the offset of the unexpected break from the start of
280        //      this previous item.
281        o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
282    }
283    if (actual < expected) {
284        test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
285    } else {
286        test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
287    }
288}
289
290
291void BITestData::clearResults() {
292    fActualBreakPositions.removeAllElements();
293    fActualTags.removeAllElements();
294}
295
296
297//--------------------------------------------------------------------------------------
298//
299//    RBBITest    constructor and destructor
300//
301//--------------------------------------------------------------------------------------
302
303RBBITest::RBBITest() {
304}
305
306
307RBBITest::~RBBITest() {
308}
309
310//-----------------------------------------------------------------------------------
311//
312//   Test for status {tag} return value from break rules.
313//        TODO:  a more thorough test.
314//
315//-----------------------------------------------------------------------------------
316void RBBITest::TestStatusReturn() {
317     UnicodeString rulesString1("$Letters = [:L:];\n"
318                                  "$Numbers = [:N:];\n"
319                                  "$Letters+{1};\n"
320                                  "$Numbers+{2};\n"
321                                  "Help\\ {4}/me\\!;\n"
322                                  "[^$Letters $Numbers];\n"
323                                  "!.*;\n", -1, US_INV);
324     UnicodeString testString1  = "abc123..abc Help me Help me!";
325                                // 01234567890123456789012345678
326     int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
327     int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
328
329     UErrorCode status=U_ZERO_ERROR;
330     UParseError    parseError;
331
332     BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
333     if(U_FAILURE(status)) {
334         dataerrln("FAIL : in construction - %s", u_errorName(status));
335     } else {
336         int32_t  pos;
337         int32_t  i = 0;
338         bi->setText(testString1);
339         for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
340             if (pos != bounds1[i]) {
341                 errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
342                 break;
343             }
344
345             int tag = bi->getRuleStatus();
346             if (tag != brkStatus[i]) {
347                 errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
348                 break;
349             }
350             i++;
351         }
352     }
353     delete bi;
354}
355
356
357static void printStringBreaks(UnicodeString ustr, int expected[],
358                              int expectedcount)
359{
360    UErrorCode status = U_ZERO_ERROR;
361    char name[100];
362    printf("code    alpha extend alphanum type word sent line name\n");
363    int j;
364    for (j = 0; j < ustr.length(); j ++) {
365        if (expectedcount > 0) {
366            int k;
367            for (k = 0; k < expectedcount; k ++) {
368                if (j == expected[k]) {
369                    printf("------------------------------------------------ %d\n",
370                           j);
371                }
372            }
373        }
374        UChar32 c = ustr.char32At(j);
375        if (c > 0xffff) {
376            j ++;
377        }
378        u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
379        printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
380                           u_isUAlphabetic(c),
381                           u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
382                           u_isalnum(c),
383                           u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
384                                                  u_charType(c),
385                                                  U_SHORT_PROPERTY_NAME),
386                           u_getPropertyValueName(UCHAR_WORD_BREAK,
387                                                  u_getIntPropertyValue(c,
388                                                          UCHAR_WORD_BREAK),
389                                                  U_SHORT_PROPERTY_NAME),
390                           u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
391                                   u_getIntPropertyValue(c,
392                                           UCHAR_SENTENCE_BREAK),
393                                   U_SHORT_PROPERTY_NAME),
394                           u_getPropertyValueName(UCHAR_LINE_BREAK,
395                                   u_getIntPropertyValue(c,
396                                           UCHAR_LINE_BREAK),
397                                   U_SHORT_PROPERTY_NAME),
398                           name);
399    }
400}
401
402
403void RBBITest::TestBug3818() {
404    UErrorCode  status = U_ZERO_ERROR;
405
406    // Four Thai words...
407    static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
408                                           0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
409    UnicodeString  thaiStr(thaiWordData);
410
411    BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
412    if (U_FAILURE(status) || bi == NULL) {
413        errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
414        return;
415    }
416    bi->setText(thaiStr);
417
418    int32_t  startOfSecondWord = bi->following(1);
419    if (startOfSecondWord != 4) {
420        errln("Fail at file %s, line %d expected start of word at 4, got %d",
421            __FILE__, __LINE__, startOfSecondWord);
422    }
423    startOfSecondWord = bi->following(0);
424    if (startOfSecondWord != 4) {
425        errln("Fail at file %s, line %d expected start of word at 4, got %d",
426            __FILE__, __LINE__, startOfSecondWord);
427    }
428    delete bi;
429}
430
431//----------------------------------------------------------------------------
432//
433// generalIteratorTest      Given a break iterator and a set of test data,
434//                          Run the tests and report the results.
435//
436//----------------------------------------------------------------------------
437void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
438{
439
440    bi.setText(td.fDataToBreak);
441
442    testFirstAndNext(bi, td);
443
444    testLastAndPrevious(bi, td);
445
446    testFollowing(bi, td);
447    testPreceding(bi, td);
448    testIsBoundary(bi, td);
449    doMultipleSelectionTest(bi, td);
450}
451
452
453//
454//   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
455//                       kind of loop.
456//
457void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
458{
459    UErrorCode  status = U_ZERO_ERROR;
460    int32_t     p;
461    int32_t     lastP = -1;
462    int32_t     tag;
463
464    logln("Test first and next");
465    bi.setText(td.fDataToBreak);
466    td.clearResults();
467
468    for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
469        td.fActualBreakPositions.addElement(p, status);  // Save result.
470        tag = bi.getRuleStatus();
471        td.fActualTags.addElement(tag, status);
472        if (p <= lastP) {
473            // If the iterator is not making forward progress, stop.
474            //  No need to raise an error here, it'll be detected in the normal check of results.
475            break;
476        }
477        lastP = p;
478    }
479    td.checkResults("testFirstAndNext", this);
480}
481
482
483//
484//  TestLastAndPrevious.   Run the iterator backwards, starting with last().
485//
486void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
487{
488    UErrorCode  status = U_ZERO_ERROR;
489    int32_t     p;
490    int32_t     lastP  = 0x7ffffffe;
491    int32_t     tag;
492
493    logln("Test last and previous");
494    bi.setText(td.fDataToBreak);
495    td.clearResults();
496
497    for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
498        // Save break position.  Insert it at start of vector of results, shoving
499        //    already-saved results further towards the end.
500        td.fActualBreakPositions.insertElementAt(p, 0, status);
501        // bi.previous();   // TODO:  Why does this fix things up????
502        // bi.next();
503        tag = bi.getRuleStatus();
504        td.fActualTags.insertElementAt(tag, 0, status);
505        if (p >= lastP) {
506            // If the iterator is not making progress, stop.
507            //  No need to raise an error here, it'll be detected in the normal check of results.
508            break;
509        }
510        lastP = p;
511    }
512    td.checkResults("testLastAndPrevious", this);
513}
514
515
516void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
517{
518    UErrorCode  status = U_ZERO_ERROR;
519    int32_t     p;
520    int32_t     tag;
521    int32_t     lastP  = -2;     // A value that will never be returned as a break position.
522                                 //   cannot be -1; that is returned for DONE.
523    int         i;
524
525    logln("testFollowing():");
526    bi.setText(td.fDataToBreak);
527    td.clearResults();
528
529    // Save the starting point, since we won't get that out of following.
530    p = bi.first();
531    td.fActualBreakPositions.addElement(p, status);  // Save result.
532    tag = bi.getRuleStatus();
533    td.fActualTags.addElement(tag, status);
534
535    for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
536        p = bi.following(i);
537        if (p != lastP) {
538            if (p == RuleBasedBreakIterator::DONE) {
539                break;
540            }
541            // We've reached a new break position.  Save it.
542            td.fActualBreakPositions.addElement(p, status);  // Save result.
543            tag = bi.getRuleStatus();
544            td.fActualTags.addElement(tag, status);
545            lastP = p;
546        }
547    }
548    // The loop normally exits by means of the break in the middle.
549    // Make sure that the index was at the correct position for the break iterator to have
550    //   returned DONE.
551    if (i != td.fDataToBreak.length()) {
552        errln("testFollowing():  iterator returned DONE prematurely.");
553    }
554
555    // Full check of all results.
556    td.checkResults("testFollowing", this);
557}
558
559
560
561void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
562    UErrorCode  status = U_ZERO_ERROR;
563    int32_t     p;
564    int32_t     tag;
565    int32_t     lastP  = 0x7ffffffe;
566    int         i;
567
568    logln("testPreceding():");
569    bi.setText(td.fDataToBreak);
570    td.clearResults();
571
572    p = bi.last();
573    td.fActualBreakPositions.addElement(p, status);
574    tag = bi.getRuleStatus();
575    td.fActualTags.addElement(tag, status);
576
577    for (i = td.fDataToBreak.length(); i>=-1; i--) {
578        p = bi.preceding(i);
579        if (p != lastP) {
580            if (p == RuleBasedBreakIterator::DONE) {
581                break;
582            }
583            // We've reached a new break position.  Save it.
584            td.fActualBreakPositions.insertElementAt(p, 0, status);
585            lastP = p;
586            tag = bi.getRuleStatus();
587            td.fActualTags.insertElementAt(tag, 0, status);
588        }
589    }
590    // The loop normally exits by means of the break in the middle.
591    // Make sure that the index was at the correct position for the break iterator to have
592    //   returned DONE.
593    if (i != 0) {
594        errln("testPreceding():  iterator returned DONE prematurely.");
595    }
596
597    // Full check of all results.
598    td.checkResults("testPreceding", this);
599}
600
601
602
603void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
604    UErrorCode  status = U_ZERO_ERROR;
605    int         i;
606    int32_t     tag;
607
608    logln("testIsBoundary():");
609    bi.setText(td.fDataToBreak);
610    td.clearResults();
611
612    for (i = 0; i <= td.fDataToBreak.length(); i++) {
613        if (bi.isBoundary(i)) {
614            td.fActualBreakPositions.addElement(i, status);  // Save result.
615            tag = bi.getRuleStatus();
616            td.fActualTags.addElement(tag, status);
617        }
618    }
619    td.checkResults("testIsBoundary: ", this);
620}
621
622
623
624void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
625{
626    iterator.setText(td.fDataToBreak);
627
628    RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
629    int32_t offset = iterator.first();
630    int32_t testOffset;
631    int32_t count = 0;
632
633    logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
634
635    if (*testIterator != iterator)
636        errln("clone() or operator!= failed: two clones compared unequal");
637
638    do {
639        testOffset = testIterator->first();
640        testOffset = testIterator->next(count);
641        if (offset != testOffset)
642            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
643
644        if (offset != RuleBasedBreakIterator::DONE) {
645            count++;
646            offset = iterator.next();
647
648            if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
649                errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
650                if (count > 10000 || offset == -1) {
651                    errln("operator== failed too many times. Stopping test.");
652                    if (offset == -1) {
653                        errln("Does (RuleBasedBreakIterator::DONE == -1)?");
654                    }
655                    return;
656                }
657            }
658        }
659    } while (offset != RuleBasedBreakIterator::DONE);
660
661    // now do it backwards...
662    offset = iterator.last();
663    count = 0;
664
665    do {
666        testOffset = testIterator->last();
667        testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
668        if (offset != testOffset)
669            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
670
671        if (offset != RuleBasedBreakIterator::DONE) {
672            count--;
673            offset = iterator.previous();
674        }
675    } while (offset != RuleBasedBreakIterator::DONE);
676
677    delete testIterator;
678}
679
680
681//---------------------------------------------
682//
683//     other tests
684//
685//---------------------------------------------
686void RBBITest::TestEmptyString()
687{
688    UnicodeString text = "";
689    UErrorCode status = U_ZERO_ERROR;
690
691    BITestData x(status);
692    ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
693    RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
694    if (U_FAILURE(status))
695    {
696        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
697        return;
698    }
699    generalIteratorTest(*bi, x);
700    delete bi;
701}
702
703void RBBITest::TestGetAvailableLocales()
704{
705    int32_t locCount = 0;
706    const Locale* locList = BreakIterator::getAvailableLocales(locCount);
707
708    if (locCount == 0)
709        dataerrln("getAvailableLocales() returned an empty list!");
710    // Just make sure that it's returning good memory.
711    int32_t i;
712    for (i = 0; i < locCount; ++i) {
713        logln(locList[i].getName());
714    }
715}
716
717//Testing the BreakIterator::getDisplayName() function
718void RBBITest::TestGetDisplayName()
719{
720    UnicodeString   result;
721
722    BreakIterator::getDisplayName(Locale::getUS(), result);
723    if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
724        dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
725                + result);
726
727    BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
728    if (result != "French (France)")
729        dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
730                + result);
731}
732/**
733 * Test End Behaviour
734 * @bug 4068137
735 */
736void RBBITest::TestEndBehaviour()
737{
738    UErrorCode status = U_ZERO_ERROR;
739    UnicodeString testString("boo.");
740    BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
741    if (U_FAILURE(status))
742    {
743        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
744        return;
745    }
746    wb->setText(testString);
747
748    if (wb->first() != 0)
749        errln("Didn't get break at beginning of string.");
750    if (wb->next() != 3)
751        errln("Didn't get break before period in \"boo.\"");
752    if (wb->current() != 4 && wb->next() != 4)
753        errln("Didn't get break at end of string.");
754    delete wb;
755}
756/*
757 * @bug 4153072
758 */
759void RBBITest::TestBug4153072() {
760    UErrorCode status = U_ZERO_ERROR;
761    BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
762    if (U_FAILURE(status))
763    {
764        errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
765        return;
766    }
767    UnicodeString str("...Hello, World!...");
768    int32_t begin = 3;
769    int32_t end = str.length() - 3;
770    UBool onBoundary;
771
772    StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
773    iter->adoptText(textIterator);
774    int index;
775    // Note: with the switch to UText, there is no way to restrict the
776    //       iteration range to begin at an index other than zero.
777    //       String character iterators created with a non-zero bound are
778    //         treated by RBBI as being empty.
779    for (index = -1; index < begin + 1; ++index) {
780        onBoundary = iter->isBoundary(index);
781        if (index == 0?  !onBoundary : onBoundary) {
782            errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
783                            " and begin index = " + begin);
784        }
785    }
786    delete iter;
787}
788
789
790//
791// Test for problem reported by Ashok Matoria on 9 July 2007
792//    One.<kSoftHyphen><kSpace>Two.
793//
794//    Sentence break at start (0) and then on calling next() it breaks at
795//   'T' of "Two". Now, at this point if I do next() and
796//    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
797//
798void RBBITest::TestBug5775() {
799    UErrorCode status = U_ZERO_ERROR;
800    BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
801    TEST_ASSERT_SUCCESS(status);
802    if (U_FAILURE(status)) {
803        return;
804    }
805// Check for status first for better handling of no data errors.
806    TEST_ASSERT(bi != NULL);
807    if (bi == NULL) {
808        return;
809    }
810
811    UnicodeString s("One.\\u00ad Two.", -1, US_INV);
812    //               01234      56789
813    s = s.unescape();
814    bi->setText(s);
815    int pos = bi->next();
816    TEST_ASSERT(pos == 6);
817    pos = bi->next();
818    TEST_ASSERT(pos == 10);
819    pos = bi->previous();
820    TEST_ASSERT(pos == 6);
821    delete bi;
822}
823
824
825
826//------------------------------------------------------------------------------
827//
828//   RBBITest::Extended    Run  RBBI Tests from an external test data file
829//
830//------------------------------------------------------------------------------
831
// Inputs handed to RBBITest::executeTest() for one test case.
struct TestParams {
    BreakIterator   *bi;               // Iterator under test; executeTest() returns at once when this is NULL.
    UnicodeString    dataToBreak;      // Text that executeTest() sets on bi before iterating.
    UVector32       *expectedBreaks;   // Indexed by text position: 0 means no break is expected there;
                                       //   a non-zero entry is read as the expected tag value of the break.
    UVector32       *srcLine;          // Indexed by text position; printed as the file line in failure messages.
    UVector32       *srcCol;           // Indexed by text position; printed as the file column in failure messages.
};
839
840void RBBITest::executeTest(TestParams *t) {
841    int32_t    bp;
842    int32_t    prevBP;
843    int32_t    i;
844
845    if (t->bi == NULL) {
846        return;
847    }
848
849    t->bi->setText(t->dataToBreak);
850    //
851    //  Run the iterator forward
852    //
853    prevBP = -1;
854    for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
855        if (prevBP ==  bp) {
856            // Fail for lack of forward progress.
857            errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
858                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
859            break;
860        }
861
862        // Check that there were we didn't miss an expected break between the last one
863        //  and this one.
864        for (i=prevBP+1; i<bp; i++) {
865            if (t->expectedBreaks->elementAti(i) != 0) {
866                int expected[] = {0, i};
867                printStringBreaks(t->dataToBreak, expected, 2);
868                errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
869                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
870            }
871        }
872
873        // Check that the break we did find was expected
874        if (t->expectedBreaks->elementAti(bp) == 0) {
875            int expected[] = {0, bp};
876            printStringBreaks(t->dataToBreak, expected, 2);
877            errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
878                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
879        } else {
880            // The break was expected.
881            //   Check that the {nnn} tag value is correct.
882            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
883            if (expectedTagVal == -1) {
884                expectedTagVal = 0;
885            }
886            int32_t line = t->srcLine->elementAti(bp);
887            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
888            if (rs != expectedTagVal) {
889                errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
890                      "          Actual, Expected status = %4d, %4d",
891                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
892            }
893        }
894
895
896        prevBP = bp;
897    }
898
899    // Verify that there were no missed expected breaks after the last one found
900    for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
901        if (t->expectedBreaks->elementAti(i) != 0) {
902            errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
903                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
904        }
905    }
906
907    //
908    //  Run the iterator backwards, verify that the same breaks are found.
909    //
910    prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
911    for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
912        if (prevBP ==  bp) {
913            // Fail for lack of progress.
914            errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
915                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
916            break;
917        }
918
919        // Check that there were we didn't miss an expected break between the last one
920        //  and this one.  (UVector returns zeros for index out of bounds.)
921        for (i=prevBP-1; i>bp; i--) {
922            if (t->expectedBreaks->elementAti(i) != 0) {
923                errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
924                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
925            }
926        }
927
928        // Check that the break we did find was expected
929        if (t->expectedBreaks->elementAti(bp) == 0) {
930            errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
931                   bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
932        } else {
933            // The break was expected.
934            //   Check that the {nnn} tag value is correct.
935            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
936            if (expectedTagVal == -1) {
937                expectedTagVal = 0;
938            }
939            int line = t->srcLine->elementAti(bp);
940            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
941            if (rs != expectedTagVal) {
942                errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
943                      "          Actual, Expected status = %4d, %4d",
944                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
945            }
946        }
947
948        prevBP = bp;
949    }
950
951    // Verify that there were no missed breaks prior to the last one found
952    for (i=prevBP-1; i>=0; i--) {
953        if (t->expectedBreaks->elementAti(i) != 0) {
954            errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
955                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
956        }
957    }
958
959    // Check isBoundary()
960    for (i=0; i<t->expectedBreaks->size(); i++) {
961        UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0);
962        UBool boundaryFound    = t->bi->isBoundary(i);
963        if (boundaryExpected != boundaryFound) {
964            errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
965                  "        Expected, Actual= %s, %s",
966                  i, t->srcLine->elementAti(i), t->srcCol->elementAti(i),
967                  boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
968        }
969    }
970
971    // Check following()
972    for (i=0; i<t->expectedBreaks->size(); i++) {
973        int32_t actualBreak = t->bi->following(i);
974        int32_t expectedBreak = BreakIterator::DONE;
975        for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) {
976            if (t->expectedBreaks->elementAti(j) != 0) {
977                expectedBreak = j;
978                break;
979            }
980        }
981        if (expectedBreak != actualBreak) {
982            errln("following(%d) incorrect. File line,col= %4d,%4d\n"
983                  "        Expected, Actual= %d, %d",
984                  i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
985        }
986    }
987
988    // Check preceding()
989    for (i=t->expectedBreaks->size(); i>=0; i--) {
990        int32_t actualBreak = t->bi->preceding(i);
991        int32_t expectedBreak = BreakIterator::DONE;
992
993        for (int32_t j=i-1; j >= 0; j--) {
994            if (t->expectedBreaks->elementAti(j) != 0) {
995                expectedBreak = j;
996                break;
997            }
998        }
999        if (expectedBreak != actualBreak) {
1000            errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1001                  "        Expected, Actual= %d, %d",
1002                  i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
1003        }
1004    }
1005}
1006
1007
1008void RBBITest::TestExtended() {
1009#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1010    UErrorCode      status  = U_ZERO_ERROR;
1011    Locale          locale("");
1012
1013    UnicodeString       rules;
1014    TestParams          tp;
1015    tp.bi             = NULL;
1016    tp.expectedBreaks = new UVector32(status);
1017    tp.srcLine        = new UVector32(status);
1018    tp.srcCol         = new UVector32(status);
1019
1020    RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
1021    if (U_FAILURE(status)) {
1022        dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1023    }
1024
1025
1026    //
1027    //  Open and read the test data file.
1028    //
1029    const char *testDataDirectory = IntlTest::getSourceTestData(status);
1030    char testFileName[1000];
1031    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1032        errln("Can't open test data.  Path too long.");
1033        return;
1034    }
1035    strcpy(testFileName, testDataDirectory);
1036    strcat(testFileName, "rbbitst.txt");
1037
1038    int    len;
1039    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1040    if (U_FAILURE(status)) {
1041        return; /* something went wrong, error already output */
1042    }
1043
1044
1045
1046
1047    //
1048    //  Put the test data into a UnicodeString
1049    //
1050    UnicodeString testString(FALSE, testFile, len);
1051
1052    enum EParseState{
1053        PARSE_COMMENT,
1054        PARSE_TAG,
1055        PARSE_DATA,
1056        PARSE_NUM
1057    }
1058    parseState = PARSE_TAG;
1059
1060    EParseState savedState = PARSE_TAG;
1061
1062    static const UChar CH_LF        = 0x0a;
1063    static const UChar CH_CR        = 0x0d;
1064    static const UChar CH_HASH      = 0x23;
1065    /*static const UChar CH_PERIOD    = 0x2e;*/
1066    static const UChar CH_LT        = 0x3c;
1067    static const UChar CH_GT        = 0x3e;
1068    static const UChar CH_BACKSLASH = 0x5c;
1069    static const UChar CH_BULLET    = 0x2022;
1070
1071    int32_t    lineNum  = 1;
1072    int32_t    colStart = 0;
1073    int32_t    column   = 0;
1074    int32_t    charIdx  = 0;
1075
1076    int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1077
1078    for (charIdx = 0; charIdx < len; ) {
1079        status = U_ZERO_ERROR;
1080        UChar  c = testString.charAt(charIdx);
1081        charIdx++;
1082        if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1083            // treat CRLF as a unit
1084            c = CH_LF;
1085            charIdx++;
1086        }
1087        if (c == CH_LF || c == CH_CR) {
1088            lineNum++;
1089            colStart = charIdx;
1090        }
1091        column = charIdx - colStart + 1;
1092
1093        switch (parseState) {
1094        case PARSE_COMMENT:
1095            if (c == 0x0a || c == 0x0d) {
1096                parseState = savedState;
1097            }
1098            break;
1099
1100        case PARSE_TAG:
1101            {
1102            if (c == CH_HASH) {
1103                parseState = PARSE_COMMENT;
1104                savedState = PARSE_TAG;
1105                break;
1106            }
1107            if (u_isUWhiteSpace(c)) {
1108                break;
1109            }
1110            if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1111                delete tp.bi;
1112                tp.bi = BreakIterator::createWordInstance(locale,  status);
1113                charIdx += 5;
1114                break;
1115            }
1116            if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1117                delete tp.bi;
1118                tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1119                charIdx += 5;
1120                break;
1121            }
1122            if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1123                delete tp.bi;
1124                tp.bi = BreakIterator::createLineInstance(locale,  status);
1125                charIdx += 5;
1126                break;
1127            }
1128            if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1129                delete tp.bi;
1130                tp.bi = NULL;
1131                tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1132                charIdx += 5;
1133                break;
1134            }
1135            if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1136                delete tp.bi;
1137                tp.bi = BreakIterator::createTitleInstance(locale,  status);
1138                charIdx += 6;
1139                break;
1140            }
1141
1142            // <locale  loc_name>
1143            localeMatcher.reset(testString);
1144            if (localeMatcher.lookingAt(charIdx-1, status)) {
1145                UnicodeString localeName = localeMatcher.group(1, status);
1146                char localeName8[100];
1147                localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1148                locale = Locale::createFromName(localeName8);
1149                charIdx += localeMatcher.group(0, status).length() - 1;
1150                TEST_ASSERT_SUCCESS(status);
1151                break;
1152            }
1153            if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1154                parseState = PARSE_DATA;
1155                charIdx += 5;
1156                tp.dataToBreak = "";
1157                tp.expectedBreaks->removeAllElements();
1158                tp.srcCol ->removeAllElements();
1159                tp.srcLine->removeAllElements();
1160                break;
1161            }
1162
1163            errln("line %d: Tag expected in test file.", lineNum);
1164            parseState = PARSE_COMMENT;
1165            savedState = PARSE_DATA;
1166            goto end_test; // Stop the test.
1167            }
1168            break;
1169
1170        case PARSE_DATA:
1171            if (c == CH_BULLET) {
1172                int32_t  breakIdx = tp.dataToBreak.length();
1173                tp.expectedBreaks->setSize(breakIdx+1);
1174                tp.expectedBreaks->setElementAt(-1, breakIdx);
1175                tp.srcLine->setSize(breakIdx+1);
1176                tp.srcLine->setElementAt(lineNum, breakIdx);
1177                tp.srcCol ->setSize(breakIdx+1);
1178                tp.srcCol ->setElementAt(column, breakIdx);
1179                break;
1180            }
1181
1182            if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1183                // Add final entry to mappings from break location to source file position.
1184                //  Need one extra because last break position returned is after the
1185                //    last char in the data, not at the last char.
1186                tp.srcLine->addElement(lineNum, status);
1187                tp.srcCol ->addElement(column, status);
1188
1189                parseState = PARSE_TAG;
1190                charIdx += 6;
1191
1192                // RUN THE TEST!
1193                executeTest(&tp);
1194                break;
1195            }
1196
1197            if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1198                // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1199                // Get the code point from the name and insert it into the test data.
1200                //   (Damn, no API takes names in Unicode  !!!
1201                //    we've got to take it back to char *)
1202                int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1203                int32_t nameLength = nameEndIdx - (charIdx+2);
1204                char charNameBuf[200];
1205                UChar32 theChar = -1;
1206                if (nameEndIdx != -1) {
1207                    UErrorCode status = U_ZERO_ERROR;
1208                    testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1209                    charNameBuf[sizeof(charNameBuf)-1] = 0;
1210                    theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1211                    if (U_FAILURE(status)) {
1212                        theChar = -1;
1213                    }
1214                }
1215                if (theChar == -1) {
1216                    errln("Error in named character in test file at line %d, col %d",
1217                        lineNum, column);
1218                } else {
1219                    // Named code point was recognized.  Insert it
1220                    //   into the test data.
1221                    tp.dataToBreak.append(theChar);
1222                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
1223                        tp.srcLine->addElement(lineNum, status);
1224                        tp.srcCol ->addElement(column, status);
1225                    }
1226                }
1227                if (nameEndIdx > charIdx) {
1228                    charIdx = nameEndIdx+1;
1229
1230                }
1231                break;
1232            }
1233
1234
1235
1236
1237            if (testString.compare(charIdx-1, 2, "<>") == 0) {
1238                charIdx++;
1239                int32_t  breakIdx = tp.dataToBreak.length();
1240                tp.expectedBreaks->setSize(breakIdx+1);
1241                tp.expectedBreaks->setElementAt(-1, breakIdx);
1242                tp.srcLine->setSize(breakIdx+1);
1243                tp.srcLine->setElementAt(lineNum, breakIdx);
1244                tp.srcCol ->setSize(breakIdx+1);
1245                tp.srcCol ->setElementAt(column, breakIdx);
1246                break;
1247            }
1248
1249            if (c == CH_LT) {
1250                tagValue   = 0;
1251                parseState = PARSE_NUM;
1252                break;
1253            }
1254
1255            if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1256                parseState = PARSE_COMMENT;
1257                savedState = PARSE_DATA;
1258                break;
1259            }
1260
1261            if (c == CH_BACKSLASH) {
1262                // Check for \ at end of line, a line continuation.
1263                //     Advance over (discard) the newline
1264                UChar32 cp = testString.char32At(charIdx);
1265                if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1266                    // We have a CR LF
1267                    //  Need an extra increment of the input ptr to move over both of them
1268                    charIdx++;
1269                }
1270                if (cp == CH_LF || cp == CH_CR) {
1271                    lineNum++;
1272                    colStart = charIdx;
1273                    charIdx++;
1274                    break;
1275                }
1276
1277                // Let unescape handle the back slash.
1278                cp = testString.unescapeAt(charIdx);
1279                if (cp != -1) {
1280                    // Escape sequence was recognized.  Insert the char
1281                    //   into the test data.
1282                    tp.dataToBreak.append(cp);
1283                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
1284                        tp.srcLine->addElement(lineNum, status);
1285                        tp.srcCol ->addElement(column, status);
1286                    }
1287                    break;
1288                }
1289
1290
1291                // Not a recognized backslash escape sequence.
1292                // Take the next char as a literal.
1293                //  TODO:  Should this be an error?
1294                c = testString.charAt(charIdx);
1295                charIdx = testString.moveIndex32(charIdx, 1);
1296            }
1297
1298            // Normal, non-escaped data char.
1299            tp.dataToBreak.append(c);
1300
1301            // Save the mapping from offset in the data to line/column numbers in
1302            //   the original input file.  Will be used for better error messages only.
1303            //   If there's an expected break before this char, the slot in the mapping
1304            //     vector will already be set for this char; don't overwrite it.
1305            if (tp.dataToBreak.length() > tp.srcLine->size()) {
1306                tp.srcLine->addElement(lineNum, status);
1307                tp.srcCol ->addElement(column, status);
1308            }
1309            break;
1310
1311
1312        case PARSE_NUM:
1313            // We are parsing an expected numeric tag value, like <1234>,
1314            //   within a chunk of data.
1315            if (u_isUWhiteSpace(c)) {
1316                break;
1317            }
1318
1319            if (c == CH_GT) {
1320                // Finished the number.  Add the info to the expected break data,
1321                //   and switch parse state back to doing plain data.
1322                parseState = PARSE_DATA;
1323                if (tagValue == 0) {
1324                    tagValue = -1;
1325                }
1326                int32_t  breakIdx = tp.dataToBreak.length();
1327                tp.expectedBreaks->setSize(breakIdx+1);
1328                tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1329                tp.srcLine->setSize(breakIdx+1);
1330                tp.srcLine->setElementAt(lineNum, breakIdx);
1331                tp.srcCol ->setSize(breakIdx+1);
1332                tp.srcCol ->setElementAt(column, breakIdx);
1333                break;
1334            }
1335
1336            if (u_isdigit(c)) {
1337                tagValue = tagValue*10 + u_charDigitValue(c);
1338                break;
1339            }
1340
1341            errln("Syntax Error in test file at line %d, col %d",
1342                lineNum, column);
1343            parseState = PARSE_COMMENT;
1344            goto end_test; // Stop the test
1345            break;
1346        }
1347
1348
1349        if (U_FAILURE(status)) {
1350            dataerrln("ICU Error %s while parsing test file at line %d.",
1351                u_errorName(status), lineNum);
1352            status = U_ZERO_ERROR;
1353            goto end_test; // Stop the test
1354        }
1355
1356    }
1357
1358end_test:
1359    delete tp.bi;
1360    delete tp.expectedBreaks;
1361    delete tp.srcLine;
1362    delete tp.srcCol;
1363    delete [] testFile;
1364#endif
1365}
1366
1367
1368//-------------------------------------------------------------------------------
1369//
1370//  TestDictRules   create a break iterator from source rules that includes a
//                  dictionary range.   Regression for bug #7130.  Source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
//                  but the dictionary code, without a type, would loop.
1374//
1375//-------------------------------------------------------------------------------
1376void RBBITest::TestDictRules() {
1377    const char *rules =  "$dictionary = [a-z]; \n"
1378                         "!!forward; \n"
1379                         "$dictionary $dictionary; \n"
1380                         "!!reverse; \n"
1381                         "$dictionary $dictionary; \n";
1382    const char *text = "aa";
1383    UErrorCode status = U_ZERO_ERROR;
1384    UParseError parseError;
1385
1386    RuleBasedBreakIterator bi(rules, parseError, status);
1387    if (U_SUCCESS(status)) {
1388        UnicodeString utext = text;
1389        bi.setText(utext);
1390        int32_t position;
1391        int32_t loops;
1392        for (loops = 0; loops<10; loops++) {
1393            position = bi.next();
1394            if (position == RuleBasedBreakIterator::DONE) {
1395                break;
1396            }
1397        }
1398        TEST_ASSERT(loops == 1);
1399    } else {
1400        dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1401    }
1402}
1403
1404
1405
1406//-------------------------------------------------------------------------------
1407//
//    ReadAndConvertFile   Read a text data file, convert it to UChars, and
//    return the data in one big UChar * buffer, which the caller must delete.
1410//
1411//    parameters:
//          fileName:   the path of the file to read.  It is handed to fopen() unchanged,
//                      so the caller must include the test data directory in it.
1414//          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1415//          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1416//                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1417//                      Pass NULL for the system default encoding.
1418//          status
1419//    returns:
1420//                      The file data, converted to UChar.
1421//                      The caller must delete this when done with
1422//                           delete [] theBuffer;
1423//
1424//    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1425//           Move this function to some common place.
1426//
1427//--------------------------------------------------------------------------------
1428UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1429    UChar       *retPtr  = NULL;
1430    char        *fileBuf = NULL;
1431    UConverter* conv     = NULL;
1432    FILE        *f       = NULL;
1433
1434    ulen = 0;
1435    if (U_FAILURE(status)) {
1436        return retPtr;
1437    }
1438
1439    //
1440    //  Open the file.
1441    //
1442    f = fopen(fileName, "rb");
1443    if (f == 0) {
1444        dataerrln("Error opening test data file %s\n", fileName);
1445        status = U_FILE_ACCESS_ERROR;
1446        return NULL;
1447    }
1448    //
1449    //  Read it in
1450    //
1451    int   fileSize;
1452    int   amt_read;
1453
1454    fseek( f, 0, SEEK_END);
1455    fileSize = ftell(f);
1456    fileBuf = new char[fileSize];
1457    fseek(f, 0, SEEK_SET);
1458    amt_read = fread(fileBuf, 1, fileSize, f);
1459    if (amt_read != fileSize || fileSize <= 0) {
1460        errln("Error reading test data file.");
1461        goto cleanUpAndReturn;
1462    }
1463
1464    //
1465    // Look for a Unicode Signature (BOM) on the data just read
1466    //
1467    int32_t        signatureLength;
1468    const char *   fileBufC;
1469    const char*    bomEncoding;
1470
1471    fileBufC = fileBuf;
1472    bomEncoding = ucnv_detectUnicodeSignature(
1473        fileBuf, fileSize, &signatureLength, &status);
1474    if(bomEncoding!=NULL ){
1475        fileBufC  += signatureLength;
1476        fileSize  -= signatureLength;
1477        encoding = bomEncoding;
1478    }
1479
1480    //
1481    // Open a converter to take the rule file to UTF-16
1482    //
1483    conv = ucnv_open(encoding, &status);
1484    if (U_FAILURE(status)) {
1485        goto cleanUpAndReturn;
1486    }
1487
1488    //
1489    // Convert the rules to UChar.
1490    //  Preflight first to determine required buffer size.
1491    //
1492    ulen = ucnv_toUChars(conv,
1493        NULL,           //  dest,
1494        0,              //  destCapacity,
1495        fileBufC,
1496        fileSize,
1497        &status);
1498    if (status == U_BUFFER_OVERFLOW_ERROR) {
1499        // Buffer Overflow is expected from the preflight operation.
1500        status = U_ZERO_ERROR;
1501
1502        retPtr = new UChar[ulen+1];
1503        ucnv_toUChars(conv,
1504            retPtr,       //  dest,
1505            ulen+1,
1506            fileBufC,
1507            fileSize,
1508            &status);
1509    }
1510
1511cleanUpAndReturn:
1512    fclose(f);
1513    delete []fileBuf;
1514    ucnv_close(conv);
1515    if (U_FAILURE(status)) {
1516        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1517        delete []retPtr;
1518        retPtr = 0;
1519        ulen   = 0;
1520    };
1521    return retPtr;
1522}
1523
1524
1525
1526//--------------------------------------------------------------------------------------------
1527//
1528//   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1529//
1530//-------------------------------------------------------------------------------------------
1531void RBBITest::TestUnicodeFiles() {
1532    RuleBasedBreakIterator  *bi;
1533    UErrorCode               status = U_ZERO_ERROR;
1534
1535    bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1536    TEST_ASSERT_SUCCESS(status);
1537    if (U_SUCCESS(status)) {
1538        runUnicodeTestData("GraphemeBreakTest.txt", bi);
1539    }
1540    delete bi;
1541
1542    bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1543    TEST_ASSERT_SUCCESS(status);
1544    if (U_SUCCESS(status)) {
1545        runUnicodeTestData("WordBreakTest.txt", bi);
1546    }
1547    delete bi;
1548
1549    bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1550    TEST_ASSERT_SUCCESS(status);
1551    if (U_SUCCESS(status)) {
1552        runUnicodeTestData("SentenceBreakTest.txt", bi);
1553    }
1554    delete bi;
1555
1556    bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1557    TEST_ASSERT_SUCCESS(status);
1558    if (U_SUCCESS(status)) {
1559        runUnicodeTestData("LineBreakTest.txt", bi);
1560    }
1561    delete bi;
1562}
1563
1564
1565//--------------------------------------------------------------------------------------------
1566//
1567//   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1568//
1569//-------------------------------------------------------------------------------------------
1570void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1571#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1572    // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
1573    UBool isTicket7270Fixed = !logKnownIssue("7270");
1574    UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
1575    UErrorCode  status = U_ZERO_ERROR;
1576
1577    //
1578    //  Open and read the test data file, put it into a UnicodeString.
1579    //
1580    const char *testDataDirectory = IntlTest::getSourceTestData(status);
1581    char testFileName[1000];
1582    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1583        dataerrln("Can't open test data.  Path too long.");
1584        return;
1585    }
1586    strcpy(testFileName, testDataDirectory);
1587    strcat(testFileName, fileName);
1588
1589    logln("Opening data file %s\n", fileName);
1590
1591    int    len;
1592    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1593    if (status != U_FILE_ACCESS_ERROR) {
1594        TEST_ASSERT_SUCCESS(status);
1595        TEST_ASSERT(testFile != NULL);
1596    }
1597    if (U_FAILURE(status) || testFile == NULL) {
1598        return; /* something went wrong, error already output */
1599    }
1600    UnicodeString testFileAsString(TRUE, testFile, len);
1601
1602    //
1603    //  Parse the test data file using a regular expression.
1604    //  Each kind of token is recognized in its own capture group; what type of item was scanned
1605    //     is identified by which group had a match.
1606    //
1607    //    Caputure Group #                  1          2            3            4           5
1608    //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1609    //
1610    UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1611    RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1612    UnicodeString   testString;
1613    UVector32       breakPositions(status);
1614    int             lineNumber = 1;
1615    TEST_ASSERT_SUCCESS(status);
1616    if (U_FAILURE(status)) {
1617        return;
1618    }
1619
1620    //
1621    //  Scan through each test case, building up the string to be broken in testString,
1622    //   and the positions that should be boundaries in the breakPositions vector.
1623    //
1624    int spin = 0;
1625    while (tokenMatcher.find()) {
1626      	if(tokenMatcher.hitEnd()) {
1627          /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1628             This occurred when the text file was corrupt (wasn't marked as UTF-8)
1629             and caused an infinite loop here on EBCDIC systems!
1630          */
1631          fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1632          //	   return;
1633      	}
1634        if (tokenMatcher.start(1, status) >= 0) {
1635            // Scanned a divide sign, indicating a break position in the test data.
1636            if (testString.length()>0) {
1637                breakPositions.addElement(testString.length(), status);
1638            }
1639        }
1640        else if (tokenMatcher.start(2, status) >= 0) {
1641            // Scanned an 'x', meaning no break at this position in the test data
1642            //   Nothing to be done here.
1643            }
1644        else if (tokenMatcher.start(3, status) >= 0) {
1645            // Scanned Hex digits.  Convert them to binary, append to the character data string.
1646            const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1647            int length = hexNumber.length();
1648            if (length<=8) {
1649                char buf[10];
1650                hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1651                UChar32 c = (UChar32)strtol(buf, NULL, 16);
1652                if (c<=0x10ffff) {
1653                    testString.append(c);
1654                } else {
1655                    errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1656                       fileName, lineNumber);
1657                }
1658            } else {
1659                errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1660                       fileName, lineNumber);
1661             }
1662        }
1663        else if (tokenMatcher.start(4, status) >= 0) {
1664            // Scanned to end of a line, possibly skipping over a comment in the process.
1665            //   If the line from the file contained test data, run the test now.
1666            //
1667            if (testString.length() > 0) {
1668// TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data.
1669//             Rule 8
1670//                ZW SP* <break>
1671//             is not yet implemented.
1672if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber ||
1673                                            5202 == lineNumber ||
1674                                            5214 == lineNumber ||
1675                                            5246 == lineNumber ||
1676                                            5298 == lineNumber ||
1677                                            5302 == lineNumber ))) {
1678                checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1679}
1680            }
1681
1682            // Clear out this test case.
1683            //    The string and breakPositions vector will be refilled as the next
1684            //       test case is parsed.
1685            testString.remove();
1686            breakPositions.removeAllElements();
1687            lineNumber++;
1688        } else {
1689            // Scanner catchall.  Something unrecognized appeared on the line.
1690            char token[16];
1691            UnicodeString uToken = tokenMatcher.group(0, status);
1692            uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1693            token[sizeof(token)-1] = 0;
1694            errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1695
1696            // Clean up, in preparation for continuing with the next line.
1697            testString.remove();
1698            breakPositions.removeAllElements();
1699            lineNumber++;
1700        }
1701        TEST_ASSERT_SUCCESS(status);
1702        if (U_FAILURE(status)) {
1703            break;
1704        }
1705    }
1706
1707    delete [] testFile;
1708 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1709}
1710
1711//--------------------------------------------------------------------------------------------
1712//
1713//   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1714//                            test data files.  Do only a simple, forward-only check -
1715//                            this test is mostly to check that ICU and the Unicode
1716//                            data agree with each other.
1717//
1718//--------------------------------------------------------------------------------------------
1719void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1720                         const UnicodeString &testString,   // Text data to be broken
1721                         UVector32 *breakPositions,         // Positions where breaks should be found.
1722                         RuleBasedBreakIterator *bi) {
1723    int32_t pos;                 // Break Position in the test string
1724    int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1725    int32_t expectedPos;         // Expected break position (index into test string)
1726
1727    bi->setText(testString);
1728    pos = bi->first();
1729    pos = bi->next();
1730
1731    while (pos != BreakIterator::DONE) {
1732        if (expectedI >= breakPositions->size()) {
1733            errln("Test file \"%s\", line %d, unexpected break found at position %d",
1734                testFileName, lineNumber, pos);
1735            break;
1736        }
1737        expectedPos = breakPositions->elementAti(expectedI);
1738        if (pos < expectedPos) {
1739            errln("Test file \"%s\", line %d, unexpected break found at position %d",
1740                testFileName, lineNumber, pos);
1741            break;
1742        }
1743        if (pos > expectedPos) {
1744            errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1745                testFileName, lineNumber, expectedPos);
1746            break;
1747        }
1748        pos = bi->next();
1749        expectedI++;
1750    }
1751
1752    if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1753        errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1754            testFileName, lineNumber, breakPositions->elementAti(expectedI));
1755    }
1756}
1757
1758
1759
1760#if !UCONFIG_NO_REGULAR_EXPRESSIONS
//---------------------------------------------------------------------------------------
//
//   class RBBIMonkeyKind
//
//      Monkey Test for Break Iteration
//      Abstract interface class.   Concrete derived classes independently
//      implement the break rules for different iterator types.
//
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
//
//---------------------------------------------------------------------------------------
1773class RBBIMonkeyKind {
1774public:
1775    // Return a UVector of UnicodeSets, representing the character classes used
1776    //   for this type of iterator.
1777    virtual  UVector  *charClasses() = 0;
1778
1779    // Set the test text on which subsequent calls to next() will operate
1780    virtual  void      setText(const UnicodeString &s) = 0;
1781
1782    // Find the next break postion, starting from the prev break position, or from zero.
1783    // Return -1 after reaching end of string.
1784    virtual  int32_t   next(int32_t i) = 0;
1785
1786    virtual ~RBBIMonkeyKind();
1787    UErrorCode       deferredStatus;
1788
1789
1790protected:
1791    RBBIMonkeyKind();
1792
1793private:
1794};
1795
1796RBBIMonkeyKind::RBBIMonkeyKind() {
1797    deferredStatus = U_ZERO_ERROR;
1798}
1799
1800RBBIMonkeyKind::~RBBIMonkeyKind() {
1801}
1802
1803
1804//----------------------------------------------------------------------------------------
1805//
1806//   Random Numbers.  Similar to standard lib rand() and srand()
1807//                    Not using library to
1808//                      1.  Get same results on all platforms.
1809//                      2.  Get access to current seed, to more easily reproduce failures.
1810//
1811//---------------------------------------------------------------------------------------
// Current state of the generator.  Assign to it to reseed.
static uint32_t m_seed = 1;

// Advance the linear congruential generator one step and return a value
// in the range 0..32767 taken from bits 16..30 of the new seed.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;   // wraps modulo 2^32
    return (m_seed >> 16) & 0x7fff;
}
1819
1820
1821//------------------------------------------------------------------------------------------
1822//
1823//   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1824//                             of RBBIMonkeyKind.
1825//
1826//------------------------------------------------------------------------------------------
1827class RBBICharMonkey: public RBBIMonkeyKind {
1828public:
1829    RBBICharMonkey();
1830    virtual          ~RBBICharMonkey();
1831    virtual  UVector *charClasses();
1832    virtual  void     setText(const UnicodeString &s);
1833    virtual  int32_t  next(int32_t i);
1834private:
1835    UVector   *fSets;
1836
1837    UnicodeSet  *fCRLFSet;
1838    UnicodeSet  *fControlSet;
1839    UnicodeSet  *fExtendSet;
1840    UnicodeSet  *fRegionalIndicatorSet;
1841    UnicodeSet  *fPrependSet;
1842    UnicodeSet  *fSpacingSet;
1843    UnicodeSet  *fLSet;
1844    UnicodeSet  *fVSet;
1845    UnicodeSet  *fTSet;
1846    UnicodeSet  *fLVSet;
1847    UnicodeSet  *fLVTSet;
1848    UnicodeSet  *fHangulSet;
1849    UnicodeSet  *fAnySet;
1850
1851    const UnicodeString *fText;
1852};
1853
1854
1855RBBICharMonkey::RBBICharMonkey() {
1856    UErrorCode  status = U_ZERO_ERROR;
1857
1858    fText = NULL;
1859
1860    fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1861    fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
1862    fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
1863    fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1864    fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1865    fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1866    fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1867    fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1868    fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1869    fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1870    fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1871    fHangulSet  = new UnicodeSet();
1872    fHangulSet->addAll(*fLSet);
1873    fHangulSet->addAll(*fVSet);
1874    fHangulSet->addAll(*fTSet);
1875    fHangulSet->addAll(*fLVSet);
1876    fHangulSet->addAll(*fLVTSet);
1877    fAnySet     = new UnicodeSet(0, 0x10ffff);
1878
1879    fSets       = new UVector(status);
1880    fSets->addElement(fCRLFSet,    status);
1881    fSets->addElement(fControlSet, status);
1882    fSets->addElement(fExtendSet,  status);
1883    fSets->addElement(fRegionalIndicatorSet, status);
1884    if (!fPrependSet->isEmpty()) {
1885        fSets->addElement(fPrependSet, status);
1886    }
1887    fSets->addElement(fSpacingSet, status);
1888    fSets->addElement(fHangulSet,  status);
1889    fSets->addElement(fAnySet,     status);
1890    if (U_FAILURE(status)) {
1891        deferredStatus = status;
1892    }
1893}
1894
1895
1896void RBBICharMonkey::setText(const UnicodeString &s) {
1897    fText = &s;
1898}
1899
1900
1901
1902int32_t RBBICharMonkey::next(int32_t prevPos) {
1903    int    p0, p1, p2, p3;    // Indices of the significant code points around the
1904                              //   break position being tested.  The candidate break
1905                              //   location is before p2.
1906
1907    int     breakPos = -1;
1908
1909    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1910
1911    if (U_FAILURE(deferredStatus)) {
1912        return -1;
1913    }
1914
1915    // Previous break at end of string.  return DONE.
1916    if (prevPos >= fText->length()) {
1917        return -1;
1918    }
1919    p0 = p1 = p2 = p3 = prevPos;
1920    c3 =  fText->char32At(prevPos);
1921    c0 = c1 = c2 = 0;
1922    (void)p0;   // suppress set but not used warning.
1923    (void)c0;
1924
1925    // Loop runs once per "significant" character position in the input text.
1926    for (;;) {
1927        // Move all of the positions forward in the input string.
1928        p0 = p1;  c0 = c1;
1929        p1 = p2;  c1 = c2;
1930        p2 = p3;  c2 = c3;
1931
1932        // Advancd p3 by one codepoint
1933        p3 = fText->moveIndex32(p3, 1);
1934        c3 = fText->char32At(p3);
1935
1936        if (p1 == p2) {
1937            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1938            continue;
1939        }
1940        if (p2 == fText->length()) {
1941            // Reached end of string.  Always a break position.
1942            break;
1943        }
1944
1945        // Rule  GB3   CR x LF
1946        //     No Extend or Format characters may appear between the CR and LF,
1947        //     which requires the additional check for p2 immediately following p1.
1948        //
1949        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1950            continue;
1951        }
1952
1953        // Rule (GB4).   ( Control | CR | LF ) <break>
1954        if (fControlSet->contains(c1) ||
1955            c1 == 0x0D ||
1956            c1 == 0x0A)  {
1957            break;
1958        }
1959
1960        // Rule (GB5)    <break>  ( Control | CR | LF )
1961        //
1962        if (fControlSet->contains(c2) ||
1963            c2 == 0x0D ||
1964            c2 == 0x0A)  {
1965            break;
1966        }
1967
1968
1969        // Rule (GB6)  L x ( L | V | LV | LVT )
1970        if (fLSet->contains(c1) &&
1971               (fLSet->contains(c2)  ||
1972                fVSet->contains(c2)  ||
1973                fLVSet->contains(c2) ||
1974                fLVTSet->contains(c2))) {
1975            continue;
1976        }
1977
1978        // Rule (GB7)    ( LV | V )  x  ( V | T )
1979        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1980            (fVSet->contains(c2) || fTSet->contains(c2)))  {
1981            continue;
1982        }
1983
1984        // Rule (GB8)    ( LVT | T)  x T
1985        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1986            fTSet->contains(c2))  {
1987            continue;
1988        }
1989
1990        // Rule (GB8a)    Regional_Indicator x Regional_Indicator
1991        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1992            continue;
1993        }
1994
1995        // Rule (GB9)    Numeric x ALetter
1996        if (fExtendSet->contains(c2))  {
1997            continue;
1998        }
1999
2000        // Rule (GB9a)   x  SpacingMark
2001        if (fSpacingSet->contains(c2)) {
2002            continue;
2003        }
2004
2005        // Rule (GB9b)   Prepend x
2006        if (fPrependSet->contains(c1)) {
2007            continue;
2008        }
2009
2010        // Rule (GB10)  Any  <break>  Any
2011        break;
2012    }
2013
2014    breakPos = p2;
2015    return breakPos;
2016}
2017
2018
2019
2020UVector  *RBBICharMonkey::charClasses() {
2021    return fSets;
2022}
2023
2024
2025RBBICharMonkey::~RBBICharMonkey() {
2026    delete fSets;
2027    delete fCRLFSet;
2028    delete fControlSet;
2029    delete fExtendSet;
2030    delete fRegionalIndicatorSet;
2031    delete fPrependSet;
2032    delete fSpacingSet;
2033    delete fLSet;
2034    delete fVSet;
2035    delete fTSet;
2036    delete fLVSet;
2037    delete fLVTSet;
2038    delete fHangulSet;
2039    delete fAnySet;
2040}
2041
2042//------------------------------------------------------------------------------------------
2043//
2044//   class RBBIWordMonkey      Word Break specific implementation
2045//                             of RBBIMonkeyKind.
2046//
2047//------------------------------------------------------------------------------------------
2048class RBBIWordMonkey: public RBBIMonkeyKind {
2049public:
2050    RBBIWordMonkey();
2051    virtual          ~RBBIWordMonkey();
2052    virtual  UVector *charClasses();
2053    virtual  void     setText(const UnicodeString &s);
2054    virtual int32_t   next(int32_t i);
2055private:
2056    UVector      *fSets;
2057
2058    UnicodeSet  *fCRSet;
2059    UnicodeSet  *fLFSet;
2060    UnicodeSet  *fNewlineSet;
2061    UnicodeSet  *fRegionalIndicatorSet;
2062    UnicodeSet  *fKatakanaSet;
2063    UnicodeSet  *fHebrew_LetterSet;
2064    UnicodeSet  *fALetterSet;
2065    // TODO(jungshik): Do we still need this change?
2066    // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
2067    UnicodeSet  *fSingle_QuoteSet;
2068    UnicodeSet  *fDouble_QuoteSet;
2069    UnicodeSet  *fMidNumLetSet;
2070    UnicodeSet  *fMidLetterSet;
2071    UnicodeSet  *fMidNumSet;
2072    UnicodeSet  *fNumericSet;
2073    UnicodeSet  *fFormatSet;
2074    UnicodeSet  *fOtherSet;
2075    UnicodeSet  *fExtendSet;
2076    UnicodeSet  *fExtendNumLetSet;
2077    UnicodeSet  *fDictionaryCjkSet;
2078
2079    const UnicodeString  *fText;
2080};
2081
2082
2083RBBIWordMonkey::RBBIWordMonkey()
2084{
2085    UErrorCode  status = U_ZERO_ERROR;
2086
2087    fSets            = new UVector(status);
2088
2089    fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2090    fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2091    fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2092    fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2093    // Exclude Hangul syllables from ALetterSet during testing.
2094    // Leave CJK dictionary characters out from the monkey tests!
2095#if 0
2096    fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
2097                                      "[\\p{Line_Break = Complex_Context}"
2098                                      "-\\p{Grapheme_Cluster_Break = Extend}"
2099                                      "-\\p{Grapheme_Cluster_Break = Control}"
2100                                      "]]",
2101                                      status);
2102#endif
2103    fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2104    fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2105    fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2106    fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2107    fALetterSet->removeAll(*fDictionaryCjkSet);
2108    fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
2109    fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
2110    fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2111    fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2112    fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2113    // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2114    // we should figure out why
2115    fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2116    fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2117    fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2118    fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2119
2120    fOtherSet        = new UnicodeSet();
2121    if(U_FAILURE(status)) {
2122      deferredStatus = status;
2123      return;
2124    }
2125
2126    fOtherSet->complement();
2127    fOtherSet->removeAll(*fCRSet);
2128    fOtherSet->removeAll(*fLFSet);
2129    fOtherSet->removeAll(*fNewlineSet);
2130    fOtherSet->removeAll(*fKatakanaSet);
2131    fOtherSet->removeAll(*fHebrew_LetterSet);
2132    fOtherSet->removeAll(*fALetterSet);
2133    fOtherSet->removeAll(*fSingle_QuoteSet);
2134    fOtherSet->removeAll(*fDouble_QuoteSet);
2135    fOtherSet->removeAll(*fMidLetterSet);
2136    fOtherSet->removeAll(*fMidNumSet);
2137    fOtherSet->removeAll(*fNumericSet);
2138    fOtherSet->removeAll(*fExtendNumLetSet);
2139    fOtherSet->removeAll(*fFormatSet);
2140    fOtherSet->removeAll(*fExtendSet);
2141    fOtherSet->removeAll(*fRegionalIndicatorSet);
2142    // Inhibit dictionary characters from being tested at all.
2143    fOtherSet->removeAll(*fDictionaryCjkSet);
2144    fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2145
2146    fSets->addElement(fCRSet,                status);
2147    fSets->addElement(fLFSet,                status);
2148    fSets->addElement(fNewlineSet,           status);
2149    fSets->addElement(fRegionalIndicatorSet, status);
2150    fSets->addElement(fHebrew_LetterSet,     status);
2151    fSets->addElement(fALetterSet,           status);
2152    fSets->addElement(fSingle_QuoteSet,      status);
2153    fSets->addElement(fDouble_QuoteSet,      status);
2154    //fSets->addElement(fKatakanaSet,          status); //TODO: work out how to test katakana
2155    fSets->addElement(fMidLetterSet,         status);
2156    fSets->addElement(fMidNumLetSet,         status);
2157    fSets->addElement(fMidNumSet,            status);
2158    fSets->addElement(fNumericSet,           status);
2159    fSets->addElement(fFormatSet,            status);
2160    fSets->addElement(fExtendSet,            status);
2161    fSets->addElement(fOtherSet,             status);
2162    fSets->addElement(fExtendNumLetSet,      status);
2163
2164    if (U_FAILURE(status)) {
2165        deferredStatus = status;
2166    }
2167}
2168
2169void RBBIWordMonkey::setText(const UnicodeString &s) {
2170    fText       = &s;
2171}
2172
2173
2174int32_t RBBIWordMonkey::next(int32_t prevPos) {
2175    int    p0, p1, p2, p3;    // Indices of the significant code points around the
2176                              //   break position being tested.  The candidate break
2177                              //   location is before p2.
2178
2179    int     breakPos = -1;
2180
2181    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2182
2183    if (U_FAILURE(deferredStatus)) {
2184        return -1;
2185    }
2186
2187    // Prev break at end of string.  return DONE.
2188    if (prevPos >= fText->length()) {
2189        return -1;
2190    }
2191    p0 = p1 = p2 = p3 = prevPos;
2192    c3 =  fText->char32At(prevPos);
2193    c0 = c1 = c2 = 0;
2194    (void)p0;       // Suppress set but not used warning.
2195
2196    // Loop runs once per "significant" character position in the input text.
2197    for (;;) {
2198        // Move all of the positions forward in the input string.
2199        p0 = p1;  c0 = c1;
2200        p1 = p2;  c1 = c2;
2201        p2 = p3;  c2 = c3;
2202
2203        // Advancd p3 by    X(Extend | Format)*   Rule 4
2204        //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2205        do {
2206            p3 = fText->moveIndex32(p3, 1);
2207            c3 = fText->char32At(p3);
2208            if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2209               break;
2210            };
2211        }
2212        while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2213
2214
2215        if (p1 == p2) {
2216            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2217            continue;
2218        }
2219        if (p2 == fText->length()) {
2220            // Reached end of string.  Always a break position.
2221            break;
2222        }
2223
2224        // Rule  (3)   CR x LF
2225        //     No Extend or Format characters may appear between the CR and LF,
2226        //     which requires the additional check for p2 immediately following p1.
2227        //
2228        if (c1==0x0D && c2==0x0A) {
2229            continue;
2230        }
2231
2232        // Rule (3a)  Break before and after newlines (including CR and LF)
2233        //
2234        if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2235            break;
2236        };
2237        if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2238            break;
2239        };
2240
2241        // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2242        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2243            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2244            continue;
2245        }
2246
2247        // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2248        //
2249        if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2250             (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2251             (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2252            continue;
2253        }
2254
2255        // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
2256        if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2257            (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2258            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2259            continue;
2260        }
2261
2262        // Rule (7a)     Hebrew_Letter x Single_Quote
2263        if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2264            continue;
2265        }
2266
2267        // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
2268        if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2269            continue;
2270        }
2271
2272        // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
2273        if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2274            continue;
2275        }
2276
2277        // Rule (8)    Numeric x Numeric
2278        if (fNumericSet->contains(c1) &&
2279            fNumericSet->contains(c2))  {
2280            continue;
2281        }
2282
2283        // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
2284        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2285            fNumericSet->contains(c2))  {
2286            continue;
2287        }
2288
2289        // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
2290        if (fNumericSet->contains(c1) &&
2291            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2292            continue;
2293        }
2294
2295        // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
2296        if (fNumericSet->contains(c0) &&
2297            (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2298            fNumericSet->contains(c2)) {
2299            continue;
2300        }
2301
2302        // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2303        if (fNumericSet->contains(c1) &&
2304            (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2305            fNumericSet->contains(c3)) {
2306            continue;
2307        }
2308
2309        // Rule (13)  Katakana x Katakana
2310        if (fKatakanaSet->contains(c1) &&
2311            fKatakanaSet->contains(c2))  {
2312            continue;
2313        }
2314
2315        // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2316        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2317             fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2318             fExtendNumLetSet->contains(c2)) {
2319                continue;
2320        }
2321
2322        // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2323        if (fExtendNumLetSet->contains(c1) &&
2324                (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2325                 fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2326            continue;
2327        }
2328
2329        // Rule 13c
2330        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2331            continue;
2332        }
2333
2334        // Rule 14.  Break found here.
2335        break;
2336    }
2337
2338    breakPos = p2;
2339    return breakPos;
2340}
2341
2342
2343UVector  *RBBIWordMonkey::charClasses() {
2344    return fSets;
2345}
2346
2347
2348RBBIWordMonkey::~RBBIWordMonkey() {
2349    delete fSets;
2350    delete fCRSet;
2351    delete fLFSet;
2352    delete fNewlineSet;
2353    delete fKatakanaSet;
2354    delete fHebrew_LetterSet;
2355    delete fALetterSet;
2356    delete fSingle_QuoteSet;
2357    delete fDouble_QuoteSet;
2358    delete fMidNumLetSet;
2359    delete fMidLetterSet;
2360    delete fMidNumSet;
2361    delete fNumericSet;
2362    delete fFormatSet;
2363    delete fExtendSet;
2364    delete fExtendNumLetSet;
2365    delete fRegionalIndicatorSet;
2366    delete fDictionaryCjkSet;
2367    delete fOtherSet;
2368}
2369
2370
2371
2372
2373//------------------------------------------------------------------------------------------
2374//
2375//   class RBBISentMonkey      Sentence Break specific implementation
2376//                             of RBBIMonkeyKind.
2377//
2378//------------------------------------------------------------------------------------------
2379class RBBISentMonkey: public RBBIMonkeyKind {
2380public:
2381    RBBISentMonkey();
2382    virtual          ~RBBISentMonkey();
2383    virtual  UVector *charClasses();
2384    virtual  void     setText(const UnicodeString &s);
2385    virtual int32_t   next(int32_t i);
2386private:
2387    int               moveBack(int posFrom);
2388    int               moveForward(int posFrom);
2389    UChar32           cAt(int pos);
2390
2391    UVector      *fSets;
2392
2393    UnicodeSet  *fSepSet;
2394    UnicodeSet  *fFormatSet;
2395    UnicodeSet  *fSpSet;
2396    UnicodeSet  *fLowerSet;
2397    UnicodeSet  *fUpperSet;
2398    UnicodeSet  *fOLetterSet;
2399    UnicodeSet  *fNumericSet;
2400    UnicodeSet  *fATermSet;
2401    UnicodeSet  *fSContinueSet;
2402    UnicodeSet  *fSTermSet;
2403    UnicodeSet  *fCloseSet;
2404    UnicodeSet  *fOtherSet;
2405    UnicodeSet  *fExtendSet;
2406
2407    const UnicodeString  *fText;
2408
2409};
2410
2411RBBISentMonkey::RBBISentMonkey()
2412{
2413    UErrorCode  status = U_ZERO_ERROR;
2414
2415    fSets            = new UVector(status);
2416
2417    //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2418    //                       set and made into character classes of their own.  For the monkey impl,
2419    //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2420    fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2421    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2422    fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2423    fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2424    fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2425    fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2426    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2427    fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2428    fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2429    fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2430    fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2431    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2432    fOtherSet        = new UnicodeSet();
2433
2434    if(U_FAILURE(status)) {
2435      deferredStatus = status;
2436      return;
2437    }
2438
2439    fOtherSet->complement();
2440    fOtherSet->removeAll(*fSepSet);
2441    fOtherSet->removeAll(*fFormatSet);
2442    fOtherSet->removeAll(*fSpSet);
2443    fOtherSet->removeAll(*fLowerSet);
2444    fOtherSet->removeAll(*fUpperSet);
2445    fOtherSet->removeAll(*fOLetterSet);
2446    fOtherSet->removeAll(*fNumericSet);
2447    fOtherSet->removeAll(*fATermSet);
2448    fOtherSet->removeAll(*fSContinueSet);
2449    fOtherSet->removeAll(*fSTermSet);
2450    fOtherSet->removeAll(*fCloseSet);
2451    fOtherSet->removeAll(*fExtendSet);
2452
2453    fSets->addElement(fSepSet,       status);
2454    fSets->addElement(fFormatSet,    status);
2455    fSets->addElement(fSpSet,        status);
2456    fSets->addElement(fLowerSet,     status);
2457    fSets->addElement(fUpperSet,     status);
2458    fSets->addElement(fOLetterSet,   status);
2459    fSets->addElement(fNumericSet,   status);
2460    fSets->addElement(fATermSet,     status);
2461    fSets->addElement(fSContinueSet, status);
2462    fSets->addElement(fSTermSet,     status);
2463    fSets->addElement(fCloseSet,     status);
2464    fSets->addElement(fOtherSet,     status);
2465    fSets->addElement(fExtendSet,    status);
2466
2467    if (U_FAILURE(status)) {
2468        deferredStatus = status;
2469    }
2470}
2471
2472
2473
2474void RBBISentMonkey::setText(const UnicodeString &s) {
2475    fText       = &s;
2476}
2477
2478UVector  *RBBISentMonkey::charClasses() {
2479    return fSets;
2480}
2481
2482
2483//  moveBack()   Find the "significant" code point preceding the index i.
2484//               Skips over ($Extend | $Format)* .
2485//
2486int RBBISentMonkey::moveBack(int i) {
2487    if (i <= 0) {
2488        return -1;
2489    }
2490    UChar32   c;
2491    int32_t   j = i;
2492    do {
2493        j = fText->moveIndex32(j, -1);
2494        c = fText->char32At(j);
2495    }
2496    while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2497    return j;
2498
2499 }
2500
2501
2502int RBBISentMonkey::moveForward(int i) {
2503    if (i>=fText->length()) {
2504        return fText->length();
2505    }
2506    UChar32   c;
2507    int32_t   j = i;
2508    do {
2509        j = fText->moveIndex32(j, 1);
2510        c = cAt(j);
2511    }
2512    while (fFormatSet->contains(c) || fExtendSet->contains(c));
2513    return j;
2514}
2515
2516UChar32 RBBISentMonkey::cAt(int pos) {
2517    if (pos<0 || pos>=fText->length()) {
2518        return -1;
2519    } else {
2520        return fText->char32At(pos);
2521    }
2522}
2523
2524int32_t RBBISentMonkey::next(int32_t prevPos) {
2525    int    p0, p1, p2, p3;    // Indices of the significant code points around the
2526                              //   break position being tested.  The candidate break
2527                              //   location is before p2.
2528
2529    int     breakPos = -1;
2530
2531    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2532    UChar32 c;
2533
2534    if (U_FAILURE(deferredStatus)) {
2535        return -1;
2536    }
2537
2538    // Prev break at end of string.  return DONE.
2539    if (prevPos >= fText->length()) {
2540        return -1;
2541    }
2542    p0 = p1 = p2 = p3 = prevPos;
2543    c3 =  fText->char32At(prevPos);
2544    c0 = c1 = c2 = 0;
2545    (void)p0;     // Suppress set but not used warning.
2546
2547    // Loop runs once per "significant" character position in the input text.
2548    for (;;) {
2549        // Move all of the positions forward in the input string.
2550        p0 = p1;  c0 = c1;
2551        p1 = p2;  c1 = c2;
2552        p2 = p3;  c2 = c3;
2553
2554        // Advancd p3 by    X(Extend | Format)*   Rule 4
2555        p3 = moveForward(p3);
2556        c3 = cAt(p3);
2557
2558        // Rule (3)  CR x LF
2559        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2560            continue;
2561        }
2562
2563        // Rule (4).   Sep  <break>
2564        if (fSepSet->contains(c1)) {
2565            p2 = p1+1;   // Separators don't combine with Extend or Format.
2566            break;
2567        }
2568
2569        if (p2 >= fText->length()) {
2570            // Reached end of string.  Always a break position.
2571            break;
2572        }
2573
2574        if (p2 == prevPos) {
2575            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2576            continue;
2577        }
2578
2579        // Rule (6).   ATerm x Numeric
2580        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2581            continue;
2582        }
2583
2584        // Rule (7).  Upper ATerm  x  Uppper
2585        if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2586            continue;
2587        }
2588
2589        // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2590        //           Note:  STerm | ATerm are added to the negated part of the expression by a
2591        //                  note to the Unicode 5.0 documents.
2592        int p8 = p1;
2593        while (fSpSet->contains(cAt(p8))) {
2594            p8 = moveBack(p8);
2595        }
2596        while (fCloseSet->contains(cAt(p8))) {
2597            p8 = moveBack(p8);
2598        }
2599        if (fATermSet->contains(cAt(p8))) {
2600            p8=p2;
2601            for (;;) {
2602                c = cAt(p8);
2603                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2604                    fLowerSet->contains(c) || fSepSet->contains(c) ||
2605                    fATermSet->contains(c) || fSTermSet->contains(c))  {
2606                    break;
2607                }
2608                p8 = moveForward(p8);
2609            }
2610            if (fLowerSet->contains(cAt(p8))) {
2611                continue;
2612            }
2613        }
2614
2615        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2616        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2617            p8 = p1;
2618            while (fSpSet->contains(cAt(p8))) {
2619                p8 = moveBack(p8);
2620            }
2621            while (fCloseSet->contains(cAt(p8))) {
2622                p8 = moveBack(p8);
2623            }
2624            c = cAt(p8);
2625            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2626                continue;
2627            }
2628        }
2629
2630        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
2631        int p9 = p1;
2632        while (fCloseSet->contains(cAt(p9))) {
2633            p9 = moveBack(p9);
2634        }
2635        c = cAt(p9);
2636        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2637            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2638                continue;
2639            }
2640        }
2641
2642        // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2643        int p10 = p1;
2644        while (fSpSet->contains(cAt(p10))) {
2645            p10 = moveBack(p10);
2646        }
2647        while (fCloseSet->contains(cAt(p10))) {
2648            p10 = moveBack(p10);
2649        }
2650        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2651            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2652                continue;
2653            }
2654        }
2655
2656        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2657        int p11 = p1;
2658        if (fSepSet->contains(cAt(p11))) {
2659            p11 = moveBack(p11);
2660        }
2661        while (fSpSet->contains(cAt(p11))) {
2662            p11 = moveBack(p11);
2663        }
2664        while (fCloseSet->contains(cAt(p11))) {
2665            p11 = moveBack(p11);
2666        }
2667        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2668            break;
2669        }
2670
2671        //  Rule (12)  Any x Any
2672        continue;
2673    }
2674    breakPos = p2;
2675    return breakPos;
2676}
2677
2678RBBISentMonkey::~RBBISentMonkey() {
2679    delete fSets;
2680    delete fSepSet;
2681    delete fFormatSet;
2682    delete fSpSet;
2683    delete fLowerSet;
2684    delete fUpperSet;
2685    delete fOLetterSet;
2686    delete fNumericSet;
2687    delete fATermSet;
2688    delete fSContinueSet;
2689    delete fSTermSet;
2690    delete fCloseSet;
2691    delete fOtherSet;
2692    delete fExtendSet;
2693}
2694
2695
2696
2697//-------------------------------------------------------------------------------------------
2698//
2699//  RBBILineMonkey
2700//
2701//-------------------------------------------------------------------------------------------
2702
2703class RBBILineMonkey: public RBBIMonkeyKind {
2704public:
2705    RBBILineMonkey();
2706    virtual          ~RBBILineMonkey();
2707    virtual  UVector *charClasses();
2708    virtual  void     setText(const UnicodeString &s);
2709    virtual  int32_t  next(int32_t i);
2710    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2711private:
2712    UVector      *fSets;
2713
2714    UnicodeSet  *fBK;
2715    UnicodeSet  *fCR;
2716    UnicodeSet  *fLF;
2717    UnicodeSet  *fCM;
2718    UnicodeSet  *fNL;
2719    UnicodeSet  *fSG;
2720    UnicodeSet  *fWJ;
2721    UnicodeSet  *fZW;
2722    UnicodeSet  *fGL;
2723    UnicodeSet  *fCB;
2724    UnicodeSet  *fSP;
2725    UnicodeSet  *fB2;
2726    UnicodeSet  *fBA;
2727    UnicodeSet  *fBB;
2728    UnicodeSet  *fHY;
2729    UnicodeSet  *fH2;
2730    UnicodeSet  *fH3;
2731    UnicodeSet  *fCL;
2732    UnicodeSet  *fCP;
2733    UnicodeSet  *fEX;
2734    UnicodeSet  *fIN;
2735    UnicodeSet  *fJL;
2736    UnicodeSet  *fJV;
2737    UnicodeSet  *fJT;
2738    UnicodeSet  *fNS;
2739    UnicodeSet  *fOP;
2740    UnicodeSet  *fQU;
2741    UnicodeSet  *fIS;
2742    UnicodeSet  *fNU;
2743    UnicodeSet  *fPO;
2744    UnicodeSet  *fPR;
2745    UnicodeSet  *fSY;
2746    UnicodeSet  *fAI;
2747    UnicodeSet  *fAL;
2748    UnicodeSet  *fCJ;
2749    UnicodeSet  *fHL;
2750    UnicodeSet  *fID;
2751    UnicodeSet  *fRI;
2752    UnicodeSet  *fSA;
2753    UnicodeSet  *fXX;
2754
2755    BreakIterator        *fCharBI;
2756    const UnicodeString  *fText;
2757    RegexMatcher         *fNumberMatcher;
2758};
2759
2760
2761RBBILineMonkey::RBBILineMonkey()
2762{
2763    UErrorCode  status = U_ZERO_ERROR;
2764
2765    fSets  = new UVector(status);
2766
2767    fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2768    fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2769    fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2770    fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2771    fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2772    fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2773    fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2774    fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2775    fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2776    fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2777    fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2778    fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2779    fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2780    fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2781    fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2782    fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2783    fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2784    fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2785    fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2786    fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2787    fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2788    fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2789    fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2790    fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2791    fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2792    fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2793    fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2794    fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2795    fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2796    fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2797    fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2798    fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2799    fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2800    fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2801    fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2802    fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2803    fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2804    fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2805    fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2806    fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2807
2808    if (U_FAILURE(status)) {
2809        deferredStatus = status;
2810        fCharBI = NULL;
2811        fNumberMatcher = NULL;
2812        return;
2813    }
2814
2815    fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2816    fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2817    fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
2818    fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2819
2820    fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2821
2822    fSets->addElement(fBK, status);
2823    fSets->addElement(fCR, status);
2824    fSets->addElement(fLF, status);
2825    fSets->addElement(fCM, status);
2826    fSets->addElement(fNL, status);
2827    fSets->addElement(fWJ, status);
2828    fSets->addElement(fZW, status);
2829    fSets->addElement(fGL, status);
2830    fSets->addElement(fCB, status);
2831    fSets->addElement(fSP, status);
2832    fSets->addElement(fB2, status);
2833    fSets->addElement(fBA, status);
2834    fSets->addElement(fBB, status);
2835    fSets->addElement(fHY, status);
2836    fSets->addElement(fH2, status);
2837    fSets->addElement(fH3, status);
2838    fSets->addElement(fCL, status);
2839    fSets->addElement(fCP, status);
2840    fSets->addElement(fEX, status);
2841    fSets->addElement(fIN, status);
2842    fSets->addElement(fJL, status);
2843    fSets->addElement(fJT, status);
2844    fSets->addElement(fJV, status);
2845    fSets->addElement(fNS, status);
2846    fSets->addElement(fOP, status);
2847    fSets->addElement(fQU, status);
2848    fSets->addElement(fIS, status);
2849    fSets->addElement(fNU, status);
2850    fSets->addElement(fPO, status);
2851    fSets->addElement(fPR, status);
2852    fSets->addElement(fSY, status);
2853    fSets->addElement(fAI, status);
2854    fSets->addElement(fAL, status);
2855    fSets->addElement(fHL, status);
2856    fSets->addElement(fID, status);
2857    fSets->addElement(fWJ, status);
2858    fSets->addElement(fRI, status);
2859    fSets->addElement(fSA, status);
2860    fSets->addElement(fSG, status);
2861
2862    const char *rules =
2863            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2864            "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2865            "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2866            "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2867            "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
2868            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
2869
2870    fNumberMatcher = new RegexMatcher(
2871        UnicodeString(rules, -1, US_INV), 0, status);
2872
2873    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2874
2875    if (U_FAILURE(status)) {
2876        deferredStatus = status;
2877    }
2878}
2879
2880
2881void RBBILineMonkey::setText(const UnicodeString &s) {
2882    fText       = &s;
2883    fCharBI->setText(s);
2884    fNumberMatcher->reset(s);
2885}
2886
2887//
2888//  rule9Adjust
2889//     Line Break TR rules 9 and 10 implementation.
//     This deals with combining marks and other sequences that
//     must be treated as if they were something other than what they actually are.
2892//
2893//     This is factored out into a separate function because it must be applied twice for
2894//     each potential break, once to the chars before the position being checked, then
2895//     again to the text following the possible break.
2896//
2897void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2898    if (pos == -1) {
2899        // Invalid initial position.  Happens during the warmup iteration of the
2900        //   main loop in next().
2901        return;
2902    }
2903
2904    int32_t  nPos = *nextPos;
2905
2906    // LB 9  Keep combining sequences together.
2907    //  advance over any CM class chars.  Note that Line Break CM is different
2908    //  from the normal Grapheme Extend property.
2909    if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2910          *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2911        for (;;) {
2912            *nextChar = fText->char32At(nPos);
2913            if (!fCM->contains(*nextChar)) {
2914                break;
2915            }
2916            nPos = fText->moveIndex32(nPos, 1);
2917        }
2918    }
2919
2920
2921    // LB 9 Treat X CM* as if it were x.
2922    //       No explicit action required.
2923
2924    // LB 10  Treat any remaining combining mark as AL
2925    if (fCM->contains(*posChar)) {
2926        *posChar = 0x41;   // thisChar = 'A';
2927    }
2928
2929    // Push the updated nextPos and nextChar back to our caller.
2930    // This only makes a difference if posChar got bigger by consuming a
2931    // combining sequence.
2932    *nextPos  = nPos;
2933    *nextChar = fText->char32At(nPos);
2934}
2935
2936
2937
2938int32_t RBBILineMonkey::next(int32_t startPos) {
2939    UErrorCode status = U_ZERO_ERROR;
2940    int32_t    pos;       //  Index of the char following a potential break position
2941    UChar32    thisChar;  //  Character at above position "pos"
2942
2943    int32_t    prevPos;   //  Index of the char preceding a potential break position
2944    UChar32    prevChar;  //  Character at above position.  Note that prevChar
2945                          //   and thisChar may not be adjacent because combining
2946                          //   characters between them will be ignored.
2947
2948    int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2949    UChar32    prevCharX2;
2950
2951    int32_t    nextPos;   //  Index of the next character following pos.
2952                          //     Usually skips over combining marks.
2953    int32_t    nextCPPos; //  Index of the code point following "pos."
2954                          //     May point to a combining mark.
2955    int32_t    tPos;      //  temp value.
2956    UChar32    c;
2957
2958    if (U_FAILURE(deferredStatus)) {
2959        return -1;
2960    }
2961
2962    if (startPos >= fText->length()) {
2963        return -1;
2964    }
2965
2966
2967    // Initial values for loop.  Loop will run the first time without finding breaks,
2968    //                           while the invalid values shift out and the "this" and
2969    //                           "prev" positions are filled in with good values.
2970    pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2971    thisChar = prevChar  = prevCharX2 = 0;
2972    nextPos  = nextCPPos = startPos;
2973
2974
2975    // Loop runs once per position in the test text, until a break position
2976    //  is found.
2977    for (;;) {
2978        prevPosX2 = prevPos;
2979        prevCharX2 = prevChar;
2980
2981        prevPos   = pos;
2982        prevChar  = thisChar;
2983
2984        pos       = nextPos;
2985        thisChar  = fText->char32At(pos);
2986
2987        nextCPPos = fText->moveIndex32(pos, 1);
2988        nextPos   = nextCPPos;
2989
2990        // Rule LB2 - Break at end of text.
2991        if (pos >= fText->length()) {
2992            break;
2993        }
2994
2995        // Rule LB 9 - adjust for combining sequences.
2996        //             We do this one out-of-order because the adjustment does not change anything
2997        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2998        //             be applied.
2999        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3000        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3001        c = fText->char32At(nextPos);
3002        rule9Adjust(pos,     &thisChar, &nextPos, &c);
3003
3004        // If the loop is still warming up - if we haven't shifted the initial
3005        //   -1 positions out of prevPos yet - loop back to advance the
3006        //    position in the input without any further looking for breaks.
3007        if (prevPos == -1) {
3008            continue;
3009        }
3010
3011        // LB 4  Always break after hard line breaks,
3012        if (fBK->contains(prevChar)) {
3013            break;
3014        }
3015
3016        // LB 5  Break after CR, LF, NL, but not inside CR LF
3017        if (prevChar == 0x0d && thisChar == 0x0a) {
3018            continue;
3019        }
3020        if (prevChar == 0x0d ||
3021            prevChar == 0x0a ||
3022            prevChar == 0x85)  {
3023            break;
3024        }
3025
3026        // LB 6  Don't break before hard line breaks
3027        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3028            fBK->contains(thisChar)) {
3029                continue;
3030        }
3031
3032
3033        // LB 7  Don't break before spaces or zero-width space.
3034        if (fSP->contains(thisChar)) {
3035            continue;
3036        }
3037
3038        if (fZW->contains(thisChar)) {
3039            continue;
3040        }
3041
3042        // LB 8  Break after zero width space
3043        if (fZW->contains(prevChar)) {
3044            break;
3045        }
3046
3047        // LB 9, 10  Already done, at top of loop.
3048        //
3049
3050
3051        // LB 11  Do not break before or after WORD JOINER and related characters.
3052        //    x  WJ
3053        //    WJ  x
3054        //
3055        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3056            continue;
3057        }
3058
3059        // LB 12
3060        //    GL  x
3061        if (fGL->contains(prevChar)) {
3062            continue;
3063        }
3064
3065        // LB 12a
3066        //    [^SP BA HY] x GL
3067        if (!(fSP->contains(prevChar) ||
3068              fBA->contains(prevChar) ||
3069              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3070            continue;
3071        }
3072
3073
3074
3075        // LB 13  Don't break before closings.
3076        //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3077        //        fall into LB 17 and the more general number regular expression.
3078        //
3079        if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3080            (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3081                                         fEX->contains(thisChar)  ||
3082            (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3083            (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3084            continue;
3085        }
3086
3087        // LB 14 Don't break after OP SP*
3088        //       Scan backwards, checking for this sequence.
3089        //       The OP char could include combining marks, so we actually check for
3090        //           OP CM* SP*
3091        //       Another Twist: The Rule 67 fixes may have changed a SP CM
3092        //       sequence into a ID char, so before scanning back through spaces,
3093        //       verify that prevChar is indeed a space.  The prevChar variable
3094        //       may differ from fText[prevPos]
3095        tPos = prevPos;
3096        if (fSP->contains(prevChar)) {
3097            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3098                tPos=fText->moveIndex32(tPos, -1);
3099            }
3100        }
3101        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3102            tPos=fText->moveIndex32(tPos, -1);
3103        }
3104        if (fOP->contains(fText->char32At(tPos))) {
3105            continue;
3106        }
3107
3108
3109        // LB 15    QU SP* x OP
3110        if (fOP->contains(thisChar)) {
3111            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3112            int tPos = prevPos;
3113            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3114                tPos = fText->moveIndex32(tPos, -1);
3115            }
3116            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3117                tPos = fText->moveIndex32(tPos, -1);
3118            }
3119            if (fQU->contains(fText->char32At(tPos))) {
3120                continue;
3121            }
3122        }
3123
3124
3125
3126        // LB 16   (CL | CP) SP* x NS
3127        //    Scan backwards for SP* CM* (CL | CP)
3128        if (fNS->contains(thisChar)) {
3129            int tPos = prevPos;
3130            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3131                tPos = fText->moveIndex32(tPos, -1);
3132            }
3133            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3134                tPos = fText->moveIndex32(tPos, -1);
3135            }
3136            if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3137                continue;
3138            }
3139        }
3140
3141
3142        // LB 17        B2 SP* x B2
3143        if (fB2->contains(thisChar)) {
3144            //  Scan backwards, checking for the B2 CM* SP* sequence.
3145            tPos = prevPos;
3146            if (fSP->contains(prevChar)) {
3147                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3148                    tPos=fText->moveIndex32(tPos, -1);
3149                }
3150            }
3151            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3152                tPos=fText->moveIndex32(tPos, -1);
3153            }
3154            if (fB2->contains(fText->char32At(tPos))) {
3155                continue;
3156            }
3157        }
3158
3159
3160        // LB 18    break after space
3161        if (fSP->contains(prevChar)) {
3162            break;
3163        }
3164
3165        // LB 19
3166        //    x   QU
3167        //    QU  x
3168        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3169            continue;
3170        }
3171
3172        // LB 20  Break around a CB
3173        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3174            break;
3175        }
3176
3177        // LB 21
3178        if (fBA->contains(thisChar) ||
3179            fHY->contains(thisChar) ||
3180            fNS->contains(thisChar) ||
3181            fBB->contains(prevChar) )   {
3182            continue;
3183        }
3184
3185        // LB 21a
3186        //   HL (HY | BA) x
3187        if (fHL->contains(prevCharX2) &&
3188                (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3189            continue;
3190        }
3191
3192        // LB 21b
3193        //   SY x HL
3194        if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3195            continue;
3196        }
3197
3198        // LB 22
3199        if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3200            (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3201            (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3202            (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3203            (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3204            continue;
3205        }
3206
3207
3208        // LB 23    ID x PO
3209        //          AL x NU
3210        //          HL x NU
3211        //          NU x AL
3212        if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3213            (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3214            (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3215            (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3216            (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
3217            continue;
3218        }
3219
3220        // LB 24  Do not break between prefix and letters or ideographs.
3221        //        PR x ID
3222        //        PR x (AL | HL)
3223        //        PO x (AL | HL)
3224        if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3225            (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3226            (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
3227            continue;
3228        }
3229
3230
3231
3232        // LB 25    Numbers
3233        if (fNumberMatcher->lookingAt(prevPos, status)) {
3234            if (U_FAILURE(status)) {
3235                break;
3236            }
3237            // Matched a number.  But could have been just a single digit, which would
3238            //    not represent a "no break here" between prevChar and thisChar
3239            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3240            if (numEndIdx > pos) {
3241                // Number match includes at least our two chars being checked
3242                if (numEndIdx > nextPos) {
3243                    // Number match includes additional chars.  Update pos and nextPos
3244                    //   so that next loop iteration will continue at the end of the number,
3245                    //   checking for breaks between last char in number & whatever follows.
3246                    pos = nextPos = numEndIdx;
3247                    do {
3248                        pos = fText->moveIndex32(pos, -1);
3249                        thisChar = fText->char32At(pos);
3250                    } while (fCM->contains(thisChar));
3251                }
3252                continue;
3253            }
3254        }
3255
3256
3257        // LB 26 Do not break a Korean syllable.
3258        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3259                                        fJV->contains(thisChar) ||
3260                                        fH2->contains(thisChar) ||
3261                                        fH3->contains(thisChar))) {
3262                                            continue;
3263                                        }
3264
3265        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3266            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3267                continue;
3268        }
3269
3270        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3271            fJT->contains(thisChar)) {
3272                continue;
3273        }
3274
3275        // LB 27 Treat a Korean Syllable Block the same as ID.
3276        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3277            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3278            fIN->contains(thisChar)) {
3279                continue;
3280            }
3281        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3282            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3283            fPO->contains(thisChar)) {
3284                continue;
3285            }
3286        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3287            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3288                continue;
3289            }
3290
3291
3292
3293        // LB 28  Do not break between alphabetics ("at").
3294        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3295            continue;
3296        }
3297
3298        // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3299        if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3300            continue;
3301        }
3302
3303        // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3304        //          (AL | NU) x OP
3305        //          CP x (AL | NU)
3306        if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3307            continue;
3308        }
3309        if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3310            continue;
3311        }
3312
3313        // LB30a  Do not break between regional indicators.
3314        //        RI x RI
3315        if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3316            continue;
3317        }
3318
3319        // LB 31    Break everywhere else
3320        break;
3321
3322    }
3323
3324    return pos;
3325}
3326
3327
3328UVector  *RBBILineMonkey::charClasses() {
3329    return fSets;
3330}
3331
3332
3333RBBILineMonkey::~RBBILineMonkey() {
3334    delete fSets;
3335
3336    delete fBK;
3337    delete fCR;
3338    delete fLF;
3339    delete fCM;
3340    delete fNL;
3341    delete fWJ;
3342    delete fZW;
3343    delete fGL;
3344    delete fCB;
3345    delete fSP;
3346    delete fB2;
3347    delete fBA;
3348    delete fBB;
3349    delete fHY;
3350    delete fH2;
3351    delete fH3;
3352    delete fCL;
3353    delete fCP;
3354    delete fEX;
3355    delete fIN;
3356    delete fJL;
3357    delete fJV;
3358    delete fJT;
3359    delete fNS;
3360    delete fOP;
3361    delete fQU;
3362    delete fIS;
3363    delete fNU;
3364    delete fPO;
3365    delete fPR;
3366    delete fSY;
3367    delete fAI;
3368    delete fAL;
3369    delete fCJ;
3370    delete fHL;
3371    delete fID;
3372    delete fRI;
3373    delete fSA;
3374    delete fSG;
3375    delete fXX;
3376
3377    delete fCharBI;
3378    delete fNumberMatcher;
3379}
3380
3381
//-------------------------------------------------------------------------------------------
//
//   TestMonkey
//
//     params
//       seed=nnnnn        Random number starting seed.
//                         Setting the seed allows errors to be reproduced.
//       loop=nnn          Looping count.  Controls running time.
//                         -1:  run forever.
//                          0 or greater:  run length.
//
//       type = char | word | line | sent | title
//                         Break iterator type to test.  Defaults to all types.
//                         "title" is accepted by the parser, but no title test is run.
//
//       utext             Run the tests with the useUText flag set.
//
//-------------------------------------------------------------------------------------------
3396
3397static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3398    int32_t val = defaultVal;
3399    name.append(" *= *(-?\\d+)");
3400    UErrorCode status = U_ZERO_ERROR;
3401    RegexMatcher m(name, params, 0, status);
3402    if (m.find()) {
3403        // The param exists.  Convert the string to an int.
3404        char valString[100];
3405        int32_t paramLength = m.end(1, status) - m.start(1, status);
3406        if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3407            paramLength = (int32_t)(sizeof(valString)-2);
3408        }
3409        params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3410        val = strtol(valString,  NULL, 10);
3411
3412        // Delete this parameter from the params string.
3413        m.reset();
3414        params = m.replaceFirst("", status);
3415    }
3416    U_ASSERT(U_SUCCESS(status));
3417    return val;
3418}
3419#endif
3420
3421#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3422static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3423                                    BreakIterator *bi,
3424                                    int expected[],
3425                                    int expectedcount)
3426{
3427    int count = 0;
3428    int i = 0;
3429    int forward[50];
3430    bi->setText(ustr);
3431    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3432        forward[count] = i;
3433        if (count < expectedcount && expected[count] != i) {
3434            test->errln("break forward test failed: expected %d but got %d",
3435                        expected[count], i);
3436            break;
3437        }
3438        count ++;
3439    }
3440    if (count != expectedcount) {
3441        printStringBreaks(ustr, expected, expectedcount);
3442        test->errln("break forward test failed: missed %d match",
3443                    expectedcount - count);
3444        return;
3445    }
3446    // testing boundaries
3447    for (i = 1; i < expectedcount; i ++) {
3448        int j = expected[i - 1];
3449        if (!bi->isBoundary(j)) {
3450            printStringBreaks(ustr, expected, expectedcount);
3451            test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3452            return;
3453        }
3454        for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3455            if (bi->isBoundary(j)) {
3456                printStringBreaks(ustr, expected, expectedcount);
3457                test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3458                return;
3459            }
3460        }
3461    }
3462
3463    for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3464        count --;
3465        if (forward[count] != i) {
3466            printStringBreaks(ustr, expected, expectedcount);
3467            test->errln("happy break test previous() failed: expected %d but got %d",
3468                        forward[count], i);
3469            break;
3470        }
3471    }
3472    if (count != 0) {
3473        printStringBreaks(ustr, expected, expectedcount);
3474        test->errln("break test previous() failed: missed a match");
3475        return;
3476    }
3477
3478    // testing preceding
3479    for (i = 0; i < expectedcount - 1; i ++) {
3480        // int j = expected[i] + 1;
3481        int j = ustr.moveIndex32(expected[i], 1);
3482        for (; j <= expected[i + 1]; j ++) {
3483            if (bi->preceding(j) != expected[i]) {
3484                printStringBreaks(ustr, expected, expectedcount);
3485                test->errln("preceding(): Not expecting boundary at position %d", j);
3486                return;
3487            }
3488        }
3489    }
3490}
3491#endif
3492
3493void RBBITest::TestWordBreaks(void)
3494{
3495#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3496
3497    Locale        locale("en");
3498    UErrorCode    status = U_ZERO_ERROR;
3499    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3500    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3501    // Replaced any C+J characters in a row with a random sequence of characters
3502    // of the same length to make our C+J segmentation not get in the way.
3503    static const char *strlist[] =
3504    {
3505    "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3506    "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3507    "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3508    "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3509    "\\uac00\\u3588\\u009c\\u0953\\u194b",
3510    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3511    "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3512    "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3513    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3514    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3515    "\\u2027\\U000e0067\\u0a47\\u00b7",
3516    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3517    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3518    "\\u0589\\U000e006e\\u0a42\\U000104a5",
3519    "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3520    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3521    "\\u0027\\u11af\\U000e0057\\u0602",
3522    "\\U0001d7f2\\U000e007\\u0004\\u0589",
3523    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3524    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3525    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3526    "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3527    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3528    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3529    "\\u0233\\U000e0020\\u0a69\\u0d6a",
3530    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3531    "\\u18f4\\U000e0049\\u20e7\\u2027",
3532    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3533    "\\ua183\\u102d\\u0bec\\u003a",
3534    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3535    "\\u003a\\u0e57\\u0fad\\u002e",
3536    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3537    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3538    "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3539    "\\u003a\\u0664\\u00b7\\u1fba",
3540    "\\u003b\\u0027\\u00b7\\u47a3",
3541    "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3542    "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3543    "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3544    };
3545    int loop;
3546    if (U_FAILURE(status)) {
3547        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3548        return;
3549    }
3550    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3551        // printf("looping %d\n", loop);
3552        UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3553        // RBBICharMonkey monkey;
3554        RBBIWordMonkey monkey;
3555
3556        int expected[50];
3557        int expectedcount = 0;
3558
3559        monkey.setText(ustr);
3560        int i;
3561        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3562            expected[expectedcount ++] = i;
3563        }
3564
3565        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3566    }
3567    delete bi;
3568#endif
3569}
3570
3571void RBBITest::TestWordBoundary(void)
3572{
3573    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3574    Locale        locale("en");
3575    UErrorCode    status = U_ZERO_ERROR;
3576    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3577    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3578    UChar         str[50];
3579    static const char *strlist[] =
3580    {
3581    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3582    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3583    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3584    "\\u2027\\U000e0067\\u0a47\\u00b7",
3585    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3586    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3587    "\\u0589\\U000e006e\\u0a42\\U000104a5",
3588    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3589    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3590    "\\u0027\\u11af\\U000e0057\\u0602",
3591    "\\U0001d7f2\\U000e007\\u0004\\u0589",
3592    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3593    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3594    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3595    "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3596    "\\U000e0065\\u302c\\u09ee\\U000e0068",
3597    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3598    "\\u0233\\U000e0020\\u0a69\\u0d6a",
3599    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3600    "\\u58f4\\U000e0049\\u20e7\\u2027",
3601    "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3602    "\\ua183\\u102d\\u0bec\\u003a",
3603    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3604    "\\u003a\\u0e57\\u0fad\\u002e",
3605    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3606    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3607    "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3608    "\\u003a\\u0664\\u00b7\\u1fba",
3609    "\\u003b\\u0027\\u00b7\\u47a3",
3610    };
3611    int loop;
3612    if (U_FAILURE(status)) {
3613        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3614        return;
3615    }
3616    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3617        // printf("looping %d\n", loop);
3618        u_unescape(strlist[loop], str, 20);
3619        UnicodeString ustr(str);
3620        int forward[50];
3621        int count = 0;
3622
3623        bi->setText(ustr);
3624        int prev = 0;
3625        int i;
3626        for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3627            forward[count ++] = i;
3628            if (i > prev) {
3629                int j;
3630                for (j = prev + 1; j < i; j ++) {
3631                    if (bi->isBoundary(j)) {
3632                        printStringBreaks(ustr, forward, count);
3633                        errln("happy boundary test failed: expected %d not a boundary",
3634                               j);
3635                        return;
3636                    }
3637                }
3638            }
3639            if (!bi->isBoundary(i)) {
3640                printStringBreaks(ustr, forward, count);
3641                errln("happy boundary test failed: expected %d a boundary",
3642                       i);
3643                return;
3644            }
3645            prev = i;
3646        }
3647    }
3648    delete bi;
3649}
3650
3651void RBBITest::TestLineBreaks(void)
3652{
3653#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3654    Locale        locale("en");
3655    UErrorCode    status = U_ZERO_ERROR;
3656    BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3657    const int32_t  STRSIZE = 50;
3658    UChar         str[STRSIZE];
3659    static const char *strlist[] =
3660    {
3661     "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3662     "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3663             "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3664     "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3665             "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3666     "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3667     "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3668     "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3669     "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3670     "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3671     "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3672     "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3673     "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3674     "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3675     "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3676     "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3677     "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3678     "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3679     "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3680     "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3681     "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3682     "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3683     "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3684     "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3685     "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3686     "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3687     "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3688     "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3689     "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3690     "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3691     "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3692     "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3693     "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3694     "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3695     "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3696     "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3697     "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3698     "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3699     "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3700     "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3701     "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3702     "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3703         "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3704         "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3705         "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3706     "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3707         "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3708    };
3709    int loop;
3710    TEST_ASSERT_SUCCESS(status);
3711    if (U_FAILURE(status)) {
3712        return;
3713    }
3714    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3715        // printf("looping %d\n", loop);
3716        int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3717        if (t >= STRSIZE) {
3718            TEST_ASSERT(FALSE);
3719            continue;
3720        }
3721
3722
3723        UnicodeString ustr(str);
3724        RBBILineMonkey monkey;
3725        if (U_FAILURE(monkey.deferredStatus)) {
3726            continue;
3727        }
3728
3729        const int EXPECTEDSIZE = 50;
3730        int expected[EXPECTEDSIZE];
3731        int expectedcount = 0;
3732
3733        monkey.setText(ustr);
3734        int i;
3735        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3736            if (expectedcount >= EXPECTEDSIZE) {
3737                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3738                return;
3739            }
3740            expected[expectedcount ++] = i;
3741        }
3742
3743        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3744    }
3745    delete bi;
3746#endif
3747}
3748
3749void RBBITest::TestSentBreaks(void)
3750{
3751#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3752    Locale        locale("en");
3753    UErrorCode    status = U_ZERO_ERROR;
3754    BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3755    UChar         str[200];
3756    static const char *strlist[] =
3757    {
3758     "Now\ris\nthe\r\ntime\n\rfor\r\r",
3759     "This\n",
3760     "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3761     "\"Sentence ending with a quote.\" Bye.",
3762     "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3763     "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3764     "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3765     "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3766     "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3767     "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3768     "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3769             "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3770             "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3771             "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3772     "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3773             "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3774             "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3775             "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3776             "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3777             "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3778    };
3779    int loop;
3780    if (U_FAILURE(status)) {
3781        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3782        return;
3783    }
3784    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3785        u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3786        UnicodeString ustr(str);
3787
3788        RBBISentMonkey monkey;
3789        if (U_FAILURE(monkey.deferredStatus)) {
3790            continue;
3791        }
3792
3793        const int EXPECTEDSIZE = 50;
3794        int expected[EXPECTEDSIZE];
3795        int expectedcount = 0;
3796
3797        monkey.setText(ustr);
3798        int i;
3799        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3800            if (expectedcount >= EXPECTEDSIZE) {
3801                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3802                return;
3803            }
3804            expected[expectedcount ++] = i;
3805        }
3806
3807        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3808    }
3809    delete bi;
3810#endif
3811}
3812
3813void RBBITest::TestMonkey(char *params) {
3814#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3815
3816    UErrorCode     status    = U_ZERO_ERROR;
3817    int32_t        loopCount = 500;
3818    int32_t        seed      = 1;
3819    UnicodeString  breakType = "all";
3820    Locale         locale("en");
3821    UBool          useUText  = FALSE;
3822
3823    if (quick == FALSE) {
3824        loopCount = 10000;
3825    }
3826
3827    if (params) {
3828        UnicodeString p(params);
3829        loopCount = getIntParam("loop", p, loopCount);
3830        seed      = getIntParam("seed", p, seed);
3831
3832        RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3833        if (m.find()) {
3834            breakType = m.group(1, status);
3835            m.reset();
3836            p = m.replaceFirst("", status);
3837        }
3838
3839        RegexMatcher u(" *utext", p, 0, status);
3840        if (u.find()) {
3841            useUText = TRUE;
3842            u.reset();
3843            p = u.replaceFirst("", status);
3844        }
3845
3846
3847        // m.reset(p);
3848        if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3849            // Each option is stripped out of the option string as it is processed.
3850            // All options have been checked.  The option string should have been completely emptied..
3851            char buf[100];
3852            p.extract(buf, sizeof(buf), NULL, status);
3853            buf[sizeof(buf)-1] = 0;
3854            errln("Unrecognized or extra parameter:  %s\n", buf);
3855            return;
3856        }
3857
3858    }
3859
3860    if (breakType == "char" || breakType == "all") {
3861        RBBICharMonkey  m;
3862        BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3863        if (U_SUCCESS(status)) {
3864            RunMonkey(bi, m, "char", seed, loopCount, useUText);
3865            if (breakType == "all" && useUText==FALSE) {
3866                // Also run a quick test with UText when "all" is specified
3867                RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3868            }
3869        }
3870        else {
3871            errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3872        }
3873        delete bi;
3874    }
3875
3876    if (breakType == "word" || breakType == "all") {
3877        logln("Word Break Monkey Test");
3878        RBBIWordMonkey  m;
3879        BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3880        if (U_SUCCESS(status)) {
3881            RunMonkey(bi, m, "word", seed, loopCount, useUText);
3882        }
3883        else {
3884            errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3885        }
3886        delete bi;
3887    }
3888
3889    if (breakType == "line" || breakType == "all") {
3890        logln("Line Break Monkey Test");
3891        RBBILineMonkey  m;
3892        BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3893        if (loopCount >= 10) {
3894            loopCount = loopCount / 5;   // Line break runs slower than the others.
3895        }
3896        if (U_SUCCESS(status)) {
3897            RunMonkey(bi, m, "line", seed, loopCount, useUText);
3898        }
3899        else {
3900            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3901        }
3902        delete bi;
3903    }
3904
3905    if (breakType == "sent" || breakType == "all"  ) {
3906        logln("Sentence Break Monkey Test");
3907        RBBISentMonkey  m;
3908        BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3909        if (loopCount >= 10) {
3910            loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3911        }
3912        if (U_SUCCESS(status)) {
3913            RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3914        }
3915        else {
3916            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3917        }
3918        delete bi;
3919    }
3920
3921#endif
3922}
3923
//
//  Run a RBBI monkey test.  Common routine, for all break iterator types.
//    Parameters:
//       bi      - the break iterator to use
//       mk      - MonkeyKind, abstraction for obtaining expected results
//       name    - Name of test (char, word, etc.) for use in error messages
//       seed    - Seed for starting random number generator (parameter from user)
//       numIterations - Number of random test strings to generate and check.
//                 -1 means run forever.
//       useUText - Set from the "utext" option of TestMonkey.
//
3933void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3934                         int32_t numIterations, UBool useUText) {
3935
3936#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3937
3938    const int32_t    TESTSTRINGLEN = 500;
3939    UnicodeString    testText;
3940    int32_t          numCharClasses;
3941    UVector          *chClasses;
3942    int              expected[TESTSTRINGLEN*2 + 1];
3943    int              expectedCount = 0;
3944    char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3945    char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3946    char             reverseBreaks[TESTSTRINGLEN*2+1];
3947    char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3948    char             followingBreaks[TESTSTRINGLEN*2+1];
3949    char             precedingBreaks[TESTSTRINGLEN*2+1];
3950    int              i;
3951    int              loopCount = 0;
3952
3953    m_seed = seed;
3954
3955    numCharClasses = mk.charClasses()->size();
3956    chClasses      = mk.charClasses();
3957
    // Check for errors that occurred during the construction of the MonkeyKind object.
    //  Can't report them where they occurred because errln() is a method coming from intlTest,
    //  and is not visible outside of RBBITest :-(
3961    if (U_FAILURE(mk.deferredStatus)) {
3962        errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3963        return;
3964    }
3965
3966    // Verify that the character classes all have at least one member.
3967    for (i=0; i<numCharClasses; i++) {
3968        UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3969        if (s == NULL || s->size() == 0) {
3970            errln("Character Class #%d is null or of zero size.", i);
3971            return;
3972        }
3973    }
3974
3975    while (loopCount < numIterations || numIterations == -1) {
3976        if (numIterations == -1 && loopCount % 10 == 0) {
3977            // If test is running in an infinite loop, display a periodic tic so
3978            //   we can tell that it is making progress.
3979            fprintf(stderr, ".");
3980        }
3981        // Save current random number seed, so that we can recreate the random numbers
3982        //   for this loop iteration in event of an error.
3983        seed = m_seed;
3984
3985        // Populate a test string with data.
3986        testText.truncate(0);
3987        for (i=0; i<TESTSTRINGLEN; i++) {
3988            int32_t  aClassNum = m_rand() % numCharClasses;
3989            UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3990            int32_t   charIdx = m_rand() % classSet->size();
3991            UChar32   c = classSet->charAt(charIdx);
3992            if (c < 0) {   // TODO:  deal with sets containing strings.
3993                errln("c < 0");
3994                break;
3995            }
3996            testText.append(c);
3997        }
3998
3999        // Calculate the expected results for this test string.
4000        mk.setText(testText);
4001        memset(expectedBreaks, 0, sizeof(expectedBreaks));
4002        expectedBreaks[0] = 1;
4003        int32_t breakPos = 0;
4004        expectedCount = 0;
4005        for (;;) {
4006            breakPos = mk.next(breakPos);
4007            if (breakPos == -1) {
4008                break;
4009            }
4010            if (breakPos > testText.length()) {
4011                errln("breakPos > testText.length()");
4012            }
4013            expectedBreaks[breakPos] = 1;
4014            U_ASSERT(expectedCount<testText.length());
4015            expected[expectedCount ++] = breakPos;
4016            (void)expected;   // Set but not used warning.
4017                              // TODO (andy): check it out.
4018        }
4019
4020        // Find the break positions using forward iteration
4021        memset(forwardBreaks, 0, sizeof(forwardBreaks));
4022        if (useUText) {
4023            UErrorCode status = U_ZERO_ERROR;
4024            UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4025            // testUText = utext_openUnicodeString(testUText, &testText, &status);
4026            bi->setText(testUText, status);
4027            TEST_ASSERT_SUCCESS(status);
4028            utext_close(testUText);   // The break iterator does a shallow clone of the UText
4029                                      //  This UText can be closed immediately, so long as the
4030                                      //  testText string continues to exist.
4031        } else {
4032            bi->setText(testText);
4033        }
4034
4035        for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4036            if (i < 0 || i > testText.length()) {
4037                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4038                break;
4039            }
4040            forwardBreaks[i] = 1;
4041        }
4042
4043        // Find the break positions using reverse iteration
4044        memset(reverseBreaks, 0, sizeof(reverseBreaks));
4045        for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4046            if (i < 0 || i > testText.length()) {
4047                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4048                break;
4049            }
4050            reverseBreaks[i] = 1;
4051        }
4052
4053        // Find the break positions using isBoundary() tests.
4054        memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4055        U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4056        for (i=0; i<=testText.length(); i++) {
4057            isBoundaryBreaks[i] = bi->isBoundary(i);
4058        }
4059
4060
4061        // Find the break positions using the following() function.
4062        // printf(".");
4063        memset(followingBreaks, 0, sizeof(followingBreaks));
4064        int32_t   lastBreakPos = 0;
4065        followingBreaks[0] = 1;
4066        for (i=0; i<testText.length(); i++) {
4067            breakPos = bi->following(i);
4068            if (breakPos <= i ||
4069                breakPos < lastBreakPos ||
4070                breakPos > testText.length() ||
4071                (breakPos > lastBreakPos && lastBreakPos > i)) {
4072                errln("%s break monkey test: "
4073                    "Out of range value returned by BreakIterator::following().\n"
4074                        "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4075                         name, seed, i, breakPos, lastBreakPos);
4076                break;
4077            }
4078            followingBreaks[breakPos] = 1;
4079            lastBreakPos = breakPos;
4080        }
4081
4082        // Find the break positions using the preceding() function.
4083        memset(precedingBreaks, 0, sizeof(precedingBreaks));
4084        lastBreakPos = testText.length();
4085        precedingBreaks[testText.length()] = 1;
4086        for (i=testText.length(); i>0; i--) {
4087            breakPos = bi->preceding(i);
4088            if (breakPos >= i ||
4089                breakPos > lastBreakPos ||
4090                (breakPos < 0 && testText.getChar32Start(i)>0) ||
4091                (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4092                errln("%s break monkey test: "
4093                    "Out of range value returned by BreakIterator::preceding().\n"
4094                    "index=%d;  prev returned %d; lastBreak=%d" ,
4095                    name,  i, breakPos, lastBreakPos);
4096                if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4097                    precedingBreaks[i] = 2;   // Forces an error.
4098                }
4099            } else {
4100                if (breakPos >= 0) {
4101                    precedingBreaks[breakPos] = 1;
4102                }
4103                lastBreakPos = breakPos;
4104            }
4105        }
4106
4107        // Compare the expected and actual results.
4108        for (i=0; i<=testText.length(); i++) {
4109            const char *errorType = NULL;
4110            if  (forwardBreaks[i] != expectedBreaks[i]) {
4111                errorType = "next()";
4112            } else if (reverseBreaks[i] != forwardBreaks[i]) {
4113                errorType = "previous()";
4114            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4115                errorType = "isBoundary()";
4116            } else if (followingBreaks[i] != expectedBreaks[i]) {
4117                errorType = "following()";
4118            } else if (precedingBreaks[i] != expectedBreaks[i]) {
4119                errorType = "preceding()";
4120            }
4121
4122
4123            if (errorType != NULL) {
4124                // Format a range of the test text that includes the failure as
4125                //  a data item that can be included in the rbbi test data file.
4126
4127                // Start of the range is the last point where expected and actual results
4128                //   both agreed that there was a break position.
4129                int startContext = i;
4130                int32_t count = 0;
4131                for (;;) {
4132                    if (startContext==0) { break; }
4133                    startContext --;
4134                    if (expectedBreaks[startContext] != 0) {
4135                        if (count == 2) break;
4136                        count ++;
4137                    }
4138                }
4139
4140                // End of range is two expected breaks past the start position.
4141                int endContext = i + 1;
4142                int ci;
4143                for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4144                    for (;;) {
4145                        if (endContext >= testText.length()) {break;}
4146                        if (expectedBreaks[endContext-1] != 0) {
4147                            if (count == 0) break;
4148                            count --;
4149                        }
4150                        endContext ++;
4151                    }
4152                }
4153
4154                // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4155                UnicodeString errorText = "<data>";
4156                /***if (strcmp(errorType, "next()") == 0) {
4157                    startContext = 0;
4158                    endContext = testText.length();
4159
4160                    printStringBreaks(testText, expected, expectedCount);
4161                }***/
4162
4163                for (ci=startContext; ci<endContext;) {
4164                    UnicodeString hexChars("0123456789abcdef");
4165                    UChar32  c;
4166                    int      bn;
4167                    c = testText.char32At(ci);
4168                    if (ci == i) {
4169                        // This is the location of the error.
4170                        errorText.append("<?>");
4171                    } else if (expectedBreaks[ci] != 0) {
4172                        // This a non-error expected break position.
4173                        errorText.append("\\");
4174                    }
4175                    if (c < 0x10000) {
4176                        errorText.append("\\u");
4177                        for (bn=12; bn>=0; bn-=4) {
4178                            errorText.append(hexChars.charAt((c>>bn)&0xf));
4179                        }
4180                    } else {
4181                        errorText.append("\\U");
4182                        for (bn=28; bn>=0; bn-=4) {
4183                            errorText.append(hexChars.charAt((c>>bn)&0xf));
4184                        }
4185                    }
4186                    ci = testText.moveIndex32(ci, 1);
4187                }
4188                errorText.append("\\");
4189                errorText.append("</data>\n");
4190
4191                // Output the error
4192                char  charErrorTxt[500];
4193                UErrorCode status = U_ZERO_ERROR;
4194                errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4195                charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4196                const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4197
4198                errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4199                    name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4200                    errorType, seed, i, charErrorTxt);
4201                break;
4202            }
4203        }
4204
4205        loopCount++;
4206    }
4207#endif
4208}
4209
4210
4211//  Bug 5532.  UTF-8 based UText fails in dictionary code.
4212//             This test checks the initial patch,
4213//             which is to just keep it from crashing.  Correct word boundaries
4214//             await a proper fix to the dictionary code.
4215//
4216void RBBITest::TestBug5532(void)  {
4217   // Text includes a mixture of Thai and Latin.
4218   const unsigned char utf8Data[] = {
4219           0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4220           0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4221           0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4222           0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4223           0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4224           0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4225           0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4226           0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4227           0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4228           0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4229           0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4230
4231    UErrorCode status = U_ZERO_ERROR;
4232    UText utext=UTEXT_INITIALIZER;
4233    utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4234    TEST_ASSERT_SUCCESS(status);
4235
4236    BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4237    TEST_ASSERT_SUCCESS(status);
4238    if (U_SUCCESS(status)) {
4239        bi->setText(&utext, status);
4240        TEST_ASSERT_SUCCESS(status);
4241
4242        int32_t breakCount = 0;
4243        int32_t previousBreak = -1;
4244        for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4245            // For now, just make sure that the break iterator doesn't hang.
4246            TEST_ASSERT(previousBreak < bi->current());
4247            previousBreak = bi->current();
4248        }
4249        TEST_ASSERT(breakCount > 0);
4250    }
4251    delete bi;
4252    utext_close(&utext);
4253}
4254
4255
4256void RBBITest::TestBug9983(void)  {
4257    UnicodeString text = UnicodeString("\\u002A"  // * Other
4258                                       "\\uFF65"  //   Other
4259                                       "\\u309C"  //   Katakana
4260                                       "\\uFF9F"  //   Extend
4261                                       "\\uFF65"  //   Other
4262                                       "\\u0020"  //   Other
4263                                       "\\u0000").unescape();
4264
4265    UErrorCode status = U_ZERO_ERROR;
4266    LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4267        BreakIterator::createWordInstance(Locale::getRoot(), status)));
4268    TEST_ASSERT_SUCCESS(status);
4269    LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4270        BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4271    TEST_ASSERT_SUCCESS(status);
4272    if (U_FAILURE(status)) {
4273        return;
4274    }
4275    int32_t offset, rstatus, iterationCount;
4276
4277    brkiter->setText(text);
4278    brkiter->last();
4279    iterationCount = 0;
4280    while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4281        iterationCount++;
4282        rstatus = brkiter->getRuleStatus();
4283        (void)rstatus;     // Suppress set but not used warning.
4284        if (iterationCount >= 10) {
4285           break;
4286        }
4287    }
4288    TEST_ASSERT(iterationCount == 6);
4289
4290    brkiterPOSIX->setText(text);
4291    brkiterPOSIX->last();
4292    iterationCount = 0;
4293    while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4294        iterationCount++;
4295        rstatus = brkiterPOSIX->getRuleStatus();
4296        (void)rstatus;     // Suppress set but not used warning.
4297        if (iterationCount >= 10) {
4298           break;
4299        }
4300    }
4301    TEST_ASSERT(iterationCount == 6);
4302}
4303
4304
//
//  TestDebug    -  A place-holder test for debugging purposes.
//                  For putting in fragments of other tests that can be invoked
//                  for tracing  without a lot of unwanted extra stuff happening.
//
//                  The whole body is compiled out with #if 0, so as it stands
//                  this test does nothing.
//
void RBBITest::TestDebug(void) {
#if 0
    UErrorCode   status = U_ZERO_ERROR;
    int pos = 0;
    int ruleStatus = 0;

    // Pick the kind of break iterator to trace by choosing which of the
    //   following create calls is left uncommented.
    RuleBasedBreakIterator* bi =
       // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
       // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
       (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
    // Test text, written with escapes that are expanded by unescape() below.
    UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
    // UnicodeString s("Aaa.  Bcd");
    s = s.unescape();
    bi->setText(s);
    // Query a single position and print the answer.
    UBool r = bi->isBoundary(8);
    printf("%s", r?"true":"false");
    // Early exit: the reverse iteration dump below is not reached while this return is in place.
    return;
    // Dump every boundary, walking backwards from the end of the text.
    pos = bi->last();
    do {
        // ruleStatus = bi->getRuleStatus();
        printf("%d\t%d\n", pos, ruleStatus);
        pos = bi->previous();
    } while (pos != BreakIterator::DONE);
#endif
}
4335
4336void RBBITest::TestProperties() {
4337    UErrorCode errorCode = U_ZERO_ERROR;
4338    UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4339    if (!prependSet.isEmpty()) {
4340        errln(
4341            "[:GCB=Prepend:] is not empty any more. "
4342            "Uncomment relevant lines in source/data/brkitr/char.txt and "
4343            "change this test to the opposite condition.");
4344    }
4345}
4346
4347#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
4348