1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2012, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6/************************************************************************
7*   Date        Name        Description
8*   12/15/99    Madhu        Creation.
9*   01/12/2000  Madhu        Updated for changed API and added new tests
10************************************************************************/
11
12#include "utypeinfo.h"  // for 'typeid' to work
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/utypes.h"
19#include "unicode/brkiter.h"
20#include "unicode/rbbi.h"
21#include "unicode/uchar.h"
22#include "unicode/utf16.h"
23#include "unicode/ucnv.h"
24#include "unicode/schriter.h"
25#include "unicode/uniset.h"
26#if !UCONFIG_NO_REGULAR_EXPRESSIONS
27#include "unicode/regex.h"
28#endif
29#include "unicode/ustring.h"
30#include "unicode/utext.h"
31#include "intltest.h"
32#include "rbbitst.h"
33#include <string.h>
34#include "uvector.h"
35#include "uvectr32.h"
36#include <string.h>
37#include <stdio.h>
38#include <stdlib.h>
39#include "unicode/numfmt.h"
40#include "unicode/uscript.h"
41
// TEST_ASSERT(x): when x evaluates to false, report a failure through errln(),
//   tagged with the source file and line of the macro invocation.
//   Expands to a braced block; errln() must be callable where the macro is used.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// TEST_ASSERT_SUCCESS(errcode): when errcode is a failure status, report it
//   through errcheckln(), including the file, line and the name of the error code.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
47
48
49//---------------------------------------------
50// runIndexedTest
51//---------------------------------------------
52
53
54//  Note:  Before adding new tests to this file, check whether the desired test data can
55//         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
56//         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
58//         will run there as well, without additional effort.
59
60void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
61{
62    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
63
64    switch (index) {
65#if !UCONFIG_NO_FILE_IO
66        case 0: name = "TestBug4153072";
67            if(exec) TestBug4153072();                         break;
68#else
69        case 0: name = "skip";
70            break;
71#endif
72
73        case 1: name = "skip";
74            break;
75        case 2: name = "TestStatusReturn";
76            if(exec) TestStatusReturn();                       break;
77
78#if !UCONFIG_NO_FILE_IO
79        case 3: name = "TestUnicodeFiles";
80            if(exec) TestUnicodeFiles();                       break;
81        case 4: name = "TestEmptyString";
82            if(exec) TestEmptyString();                        break;
83#else
84        case 3: case 4: name = "skip";
85            break;
86#endif
87
88        case 5: name = "TestGetAvailableLocales";
89            if(exec) TestGetAvailableLocales();                break;
90
91        case 6: name = "TestGetDisplayName";
92            if(exec) TestGetDisplayName();                     break;
93
94#if !UCONFIG_NO_FILE_IO
95        case 7: name = "TestEndBehaviour";
96            if(exec) TestEndBehaviour();                       break;
97        case 8: case 9: case 10: name = "skip";
98             break;
99        case 11: name = "TestWordBreaks";
100             if(exec) TestWordBreaks();                        break;
101        case 12: name = "TestWordBoundary";
102             if(exec) TestWordBoundary();                      break;
103        case 13: name = "TestLineBreaks";
104             if(exec) TestLineBreaks();                        break;
105        case 14: name = "TestSentBreaks";
106             if(exec) TestSentBreaks();                        break;
107        case 15: name = "TestExtended";
108             if(exec) TestExtended();                          break;
109#else
110        case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
111             break;
112#endif
113
114#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
115        case 16:
116            name = "TestMonkey"; if(exec)  TestMonkey(params); break;
117#else
118        case 16:
119             name = "skip";                                    break;
120#endif
121
122#if !UCONFIG_NO_FILE_IO
123        case 17: name = "TestBug3818";
124            if(exec) TestBug3818();                            break;
125#else
126        case 17: name = "skip";
127            break;
128#endif
129
130        case 18: name = "skip";
131            break;
132        case 19: name = "TestDebug";
133            if(exec) TestDebug();                              break;
134        case 20: name = "skip";
135            break;
136
137#if !UCONFIG_NO_FILE_IO
138        case 21: name = "TestBug5775";
139            if (exec) TestBug5775();                           break;
140#else
141        case 21: name = "skip";
142            break;
143#endif
144
145        case 22: name = "skip";
146            break;
147        case 23: name = "TestDictRules";
148            if (exec) TestDictRules();                         break;
149        case 24: name = "TestBug5532";
150            if (exec) TestBug5532();                           break;
151        default: name = ""; break; //needed to end loop
152    }
153}
154
155
156//---------------------------------------------------------------------------
157//
158//   class BITestData   Holds a set of Break iterator test data and results
159//                      Includes
160//                         - the string data to be broken
161//                         - a vector of the expected break positions.
162//                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
164//                         - The expected break tag values.
165//                         - Vectors of actual break positions and tag values.
166//                         - Functions for comparing actual with expected and
167//                            reporting errors.
168//
169//----------------------------------------------------------------------------
170class BITestData {
171public:
172    UnicodeString    fDataToBreak;
173    UVector          fExpectedBreakPositions;
174    UVector          fExpectedTags;
175    UVector          fLineNum;
176    UVector          fActualBreakPositions;   // Test Results.
177    UVector          fActualTags;
178
179    BITestData(UErrorCode &status);
180    void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
181    void             checkResults(const char *heading, RBBITest *test);
182    void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
183    void             clearResults();
184};
185
186//
187// Constructor.
188//
189BITestData::BITestData(UErrorCode &status)
190: fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
191  fActualTags(status)
192{
193}
194
195//
// addDataChunk.   Add a section (non-breaking) piece of data to the test data.
197//                 The macro form collects the line number, which is helpful
198//                 when tracking down failures.
199//
200//                 A null data item is inserted at the start of each test's data
201//                  to put the starting zero into the data list.  The position saved for
202//                  each non-null item is its ending position.
203//
204#define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
205void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
206    if (U_FAILURE(status)) {return;}
207    if (data != NULL) {
208        fDataToBreak.append(CharsToUnicodeString(data));
209    }
210    fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
211    fExpectedTags.addElement(tag, status);
212    fLineNum.addElement(lineNum, status);
213}
214
215
216//
217//  checkResults.   Compare the actual and expected break positions, report any differences.
218//
219void BITestData::checkResults(const char *heading, RBBITest *test) {
220    int32_t   expectedIndex = 0;
221    int32_t   actualIndex = 0;
222
223    for (;;) {
224        // If we've run through both the expected and actual results vectors, we're done.
225        //   break out of the loop.
226        if (expectedIndex >= fExpectedBreakPositions.size() &&
227            actualIndex   >= fActualBreakPositions.size()) {
228            break;
229        }
230
231
232        if (expectedIndex >= fExpectedBreakPositions.size()) {
233            err(heading, test, expectedIndex-1, actualIndex);
234            actualIndex++;
235            continue;
236        }
237
238        if (actualIndex >= fActualBreakPositions.size()) {
239            err(heading, test, expectedIndex, actualIndex-1);
240            expectedIndex++;
241            continue;
242        }
243
244        if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
245            err(heading, test, expectedIndex, actualIndex);
246            // Try to resync the positions of the indices, to avoid a rash of spurious erros.
247            if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
248                actualIndex++;
249            } else {
250                expectedIndex++;
251            }
252            continue;
253        }
254
255        if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
256            test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
257                heading, fLineNum.elementAt(expectedIndex),
258                fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
259        }
260
261        actualIndex++;
262        expectedIndex++;
263    }
264}
265
266//
267//  err   -  An error was found.  Report it, along with information about where the
268//                                incorrectly broken test data appeared in the source file.
269//
270void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
271{
272    int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
273    int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
274    int32_t   o        = 0;
275    int32_t   line     = fLineNum.elementAti(expectedIdx);
276    if (expectedIdx > 0) {
277        // The line numbers are off by one because a premature break occurs somewhere
278        //    within the previous item, rather than at the start of the current (expected) item.
279        //    We want to report the offset of the unexpected break from the start of
280        //      this previous item.
281        o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
282    }
283    if (actual < expected) {
284        test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
285    } else {
286        test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
287    }
288}
289
290
291void BITestData::clearResults() {
292    fActualBreakPositions.removeAllElements();
293    fActualTags.removeAllElements();
294}
295
296
297//--------------------------------------------------------------------------------------
298//
299//    RBBITest    constructor and destructor
300//
301//--------------------------------------------------------------------------------------
302
303RBBITest::RBBITest() {
304}
305
306
307RBBITest::~RBBITest() {
308}
309
310//-----------------------------------------------------------------------------------
311//
312//   Test for status {tag} return value from break rules.
313//        TODO:  a more thorough test.
314//
315//-----------------------------------------------------------------------------------
316void RBBITest::TestStatusReturn() {
317     UnicodeString rulesString1("$Letters = [:L:];\n"
318                                  "$Numbers = [:N:];\n"
319                                  "$Letters+{1};\n"
320                                  "$Numbers+{2};\n"
321                                  "Help\\ {4}/me\\!;\n"
322                                  "[^$Letters $Numbers];\n"
323                                  "!.*;\n", -1, US_INV);
324     UnicodeString testString1  = "abc123..abc Help me Help me!";
325                                // 01234567890123456789012345678
326     int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
327     int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
328
329     UErrorCode status=U_ZERO_ERROR;
330     UParseError    parseError;
331
332     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
333     if(U_FAILURE(status)) {
334         dataerrln("FAIL : in construction - %s", u_errorName(status));
335     } else {
336         int32_t  pos;
337         int32_t  i = 0;
338         bi->setText(testString1);
339         for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
340             if (pos != bounds1[i]) {
341                 errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
342                 break;
343             }
344
345             int tag = bi->getRuleStatus();
346             if (tag != brkStatus[i]) {
347                 errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
348                 break;
349             }
350             i++;
351         }
352     }
353     delete bi;
354}
355
356
357static void printStringBreaks(UnicodeString ustr, int expected[],
358                              int expectedcount)
359{
360    UErrorCode status = U_ZERO_ERROR;
361    char name[100];
362    printf("code    alpha extend alphanum type word sent line name\n");
363    int j;
364    for (j = 0; j < ustr.length(); j ++) {
365        if (expectedcount > 0) {
366            int k;
367            for (k = 0; k < expectedcount; k ++) {
368                if (j == expected[k]) {
369                    printf("------------------------------------------------ %d\n",
370                           j);
371                }
372            }
373        }
374        UChar32 c = ustr.char32At(j);
375        if (c > 0xffff) {
376            j ++;
377        }
378        u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
379        printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
380                           u_isUAlphabetic(c),
381                           u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
382                           u_isalnum(c),
383                           u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
384                                                  u_charType(c),
385                                                  U_SHORT_PROPERTY_NAME),
386                           u_getPropertyValueName(UCHAR_WORD_BREAK,
387                                                  u_getIntPropertyValue(c,
388                                                          UCHAR_WORD_BREAK),
389                                                  U_SHORT_PROPERTY_NAME),
390                           u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
391                                   u_getIntPropertyValue(c,
392                                           UCHAR_SENTENCE_BREAK),
393                                   U_SHORT_PROPERTY_NAME),
394                           u_getPropertyValueName(UCHAR_LINE_BREAK,
395                                   u_getIntPropertyValue(c,
396                                           UCHAR_LINE_BREAK),
397                                   U_SHORT_PROPERTY_NAME),
398                           name);
399    }
400}
401
402
403void RBBITest::TestBug3818() {
404    UErrorCode  status = U_ZERO_ERROR;
405
406    // Four Thai words...
407    static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
408                                           0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
409    UnicodeString  thaiStr(thaiWordData);
410
411    RuleBasedBreakIterator* bi =
412        (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
413    if (U_FAILURE(status) || bi == NULL) {
414        errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
415        return;
416    }
417    bi->setText(thaiStr);
418
419    int32_t  startOfSecondWord = bi->following(1);
420    if (startOfSecondWord != 4) {
421        errln("Fail at file %s, line %d expected start of word at 4, got %d",
422            __FILE__, __LINE__, startOfSecondWord);
423    }
424    startOfSecondWord = bi->following(0);
425    if (startOfSecondWord != 4) {
426        errln("Fail at file %s, line %d expected start of word at 4, got %d",
427            __FILE__, __LINE__, startOfSecondWord);
428    }
429    delete bi;
430}
431
432//----------------------------------------------------------------------------
433//
434// generalIteratorTest      Given a break iterator and a set of test data,
435//                          Run the tests and report the results.
436//
437//----------------------------------------------------------------------------
438void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
439{
440
441    bi.setText(td.fDataToBreak);
442
443    testFirstAndNext(bi, td);
444
445    testLastAndPrevious(bi, td);
446
447    testFollowing(bi, td);
448    testPreceding(bi, td);
449    testIsBoundary(bi, td);
450    doMultipleSelectionTest(bi, td);
451}
452
453
454//
455//   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
456//                       kind of loop.
457//
458void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
459{
460    UErrorCode  status = U_ZERO_ERROR;
461    int32_t     p;
462    int32_t     lastP = -1;
463    int32_t     tag;
464
465    logln("Test first and next");
466    bi.setText(td.fDataToBreak);
467    td.clearResults();
468
469    for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
470        td.fActualBreakPositions.addElement(p, status);  // Save result.
471        tag = bi.getRuleStatus();
472        td.fActualTags.addElement(tag, status);
473        if (p <= lastP) {
474            // If the iterator is not making forward progress, stop.
475            //  No need to raise an error here, it'll be detected in the normal check of results.
476            break;
477        }
478        lastP = p;
479    }
480    td.checkResults("testFirstAndNext", this);
481}
482
483
484//
485//  TestLastAndPrevious.   Run the iterator backwards, starting with last().
486//
487void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
488{
489    UErrorCode  status = U_ZERO_ERROR;
490    int32_t     p;
491    int32_t     lastP  = 0x7ffffffe;
492    int32_t     tag;
493
494    logln("Test last and previous");
495    bi.setText(td.fDataToBreak);
496    td.clearResults();
497
498    for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
499        // Save break position.  Insert it at start of vector of results, shoving
500        //    already-saved results further towards the end.
501        td.fActualBreakPositions.insertElementAt(p, 0, status);
502        // bi.previous();   // TODO:  Why does this fix things up????
503        // bi.next();
504        tag = bi.getRuleStatus();
505        td.fActualTags.insertElementAt(tag, 0, status);
506        if (p >= lastP) {
507            // If the iterator is not making progress, stop.
508            //  No need to raise an error here, it'll be detected in the normal check of results.
509            break;
510        }
511        lastP = p;
512    }
513    td.checkResults("testLastAndPrevious", this);
514}
515
516
517void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
518{
519    UErrorCode  status = U_ZERO_ERROR;
520    int32_t     p;
521    int32_t     tag;
522    int32_t     lastP  = -2;     // A value that will never be returned as a break position.
523                                 //   cannot be -1; that is returned for DONE.
524    int         i;
525
526    logln("testFollowing():");
527    bi.setText(td.fDataToBreak);
528    td.clearResults();
529
530    // Save the starting point, since we won't get that out of following.
531    p = bi.first();
532    td.fActualBreakPositions.addElement(p, status);  // Save result.
533    tag = bi.getRuleStatus();
534    td.fActualTags.addElement(tag, status);
535
536    for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
537        p = bi.following(i);
538        if (p != lastP) {
539            if (p == RuleBasedBreakIterator::DONE) {
540                break;
541            }
542            // We've reached a new break position.  Save it.
543            td.fActualBreakPositions.addElement(p, status);  // Save result.
544            tag = bi.getRuleStatus();
545            td.fActualTags.addElement(tag, status);
546            lastP = p;
547        }
548    }
549    // The loop normally exits by means of the break in the middle.
550    // Make sure that the index was at the correct position for the break iterator to have
551    //   returned DONE.
552    if (i != td.fDataToBreak.length()) {
553        errln("testFollowing():  iterator returned DONE prematurely.");
554    }
555
556    // Full check of all results.
557    td.checkResults("testFollowing", this);
558}
559
560
561
562void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
563    UErrorCode  status = U_ZERO_ERROR;
564    int32_t     p;
565    int32_t     tag;
566    int32_t     lastP  = 0x7ffffffe;
567    int         i;
568
569    logln("testPreceding():");
570    bi.setText(td.fDataToBreak);
571    td.clearResults();
572
573    p = bi.last();
574    td.fActualBreakPositions.addElement(p, status);
575    tag = bi.getRuleStatus();
576    td.fActualTags.addElement(tag, status);
577
578    for (i = td.fDataToBreak.length(); i>=-1; i--) {
579        p = bi.preceding(i);
580        if (p != lastP) {
581            if (p == RuleBasedBreakIterator::DONE) {
582                break;
583            }
584            // We've reached a new break position.  Save it.
585            td.fActualBreakPositions.insertElementAt(p, 0, status);
586            lastP = p;
587            tag = bi.getRuleStatus();
588            td.fActualTags.insertElementAt(tag, 0, status);
589        }
590    }
591    // The loop normally exits by means of the break in the middle.
592    // Make sure that the index was at the correct position for the break iterator to have
593    //   returned DONE.
594    if (i != 0) {
595        errln("testPreceding():  iterator returned DONE prematurely.");
596    }
597
598    // Full check of all results.
599    td.checkResults("testPreceding", this);
600}
601
602
603
604void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
605    UErrorCode  status = U_ZERO_ERROR;
606    int         i;
607    int32_t     tag;
608
609    logln("testIsBoundary():");
610    bi.setText(td.fDataToBreak);
611    td.clearResults();
612
613    for (i = 0; i <= td.fDataToBreak.length(); i++) {
614        if (bi.isBoundary(i)) {
615            td.fActualBreakPositions.addElement(i, status);  // Save result.
616            tag = bi.getRuleStatus();
617            td.fActualTags.addElement(tag, status);
618        }
619    }
620    td.checkResults("testIsBoundary: ", this);
621}
622
623
624
625void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
626{
627    iterator.setText(td.fDataToBreak);
628
629    RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
630    int32_t offset = iterator.first();
631    int32_t testOffset;
632    int32_t count = 0;
633
634    logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
635
636    if (*testIterator != iterator)
637        errln("clone() or operator!= failed: two clones compared unequal");
638
639    do {
640        testOffset = testIterator->first();
641        testOffset = testIterator->next(count);
642        if (offset != testOffset)
643            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
644
645        if (offset != RuleBasedBreakIterator::DONE) {
646            count++;
647            offset = iterator.next();
648
649            if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
650                errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
651                if (count > 10000 || offset == -1) {
652                    errln("operator== failed too many times. Stopping test.");
653                    if (offset == -1) {
654                        errln("Does (RuleBasedBreakIterator::DONE == -1)?");
655                    }
656                    return;
657                }
658            }
659        }
660    } while (offset != RuleBasedBreakIterator::DONE);
661
662    // now do it backwards...
663    offset = iterator.last();
664    count = 0;
665
666    do {
667        testOffset = testIterator->last();
668        testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
669        if (offset != testOffset)
670            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
671
672        if (offset != RuleBasedBreakIterator::DONE) {
673            count--;
674            offset = iterator.previous();
675        }
676    } while (offset != RuleBasedBreakIterator::DONE);
677
678    delete testIterator;
679}
680
681
682//---------------------------------------------
683//
684//     other tests
685//
686//---------------------------------------------
687void RBBITest::TestEmptyString()
688{
689    UnicodeString text = "";
690    UErrorCode status = U_ZERO_ERROR;
691
692    BITestData x(status);
693    ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
694    RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
695    if (U_FAILURE(status))
696    {
697        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
698        return;
699    }
700    generalIteratorTest(*bi, x);
701    delete bi;
702}
703
704void RBBITest::TestGetAvailableLocales()
705{
706    int32_t locCount = 0;
707    const Locale* locList = BreakIterator::getAvailableLocales(locCount);
708
709    if (locCount == 0)
710        dataerrln("getAvailableLocales() returned an empty list!");
711    // Just make sure that it's returning good memory.
712    int32_t i;
713    for (i = 0; i < locCount; ++i) {
714        logln(locList[i].getName());
715    }
716}
717
718//Testing the BreakIterator::getDisplayName() function
719void RBBITest::TestGetDisplayName()
720{
721    UnicodeString   result;
722
723    BreakIterator::getDisplayName(Locale::getUS(), result);
724    if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
725        dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
726                + result);
727
728    BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
729    if (result != "French (France)")
730        dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
731                + result);
732}
733/**
734 * Test End Behaviour
735 * @bug 4068137
736 */
737void RBBITest::TestEndBehaviour()
738{
739    UErrorCode status = U_ZERO_ERROR;
740    UnicodeString testString("boo.");
741    BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
742    if (U_FAILURE(status))
743    {
744        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
745        return;
746    }
747    wb->setText(testString);
748
749    if (wb->first() != 0)
750        errln("Didn't get break at beginning of string.");
751    if (wb->next() != 3)
752        errln("Didn't get break before period in \"boo.\"");
753    if (wb->current() != 4 && wb->next() != 4)
754        errln("Didn't get break at end of string.");
755    delete wb;
756}
757/*
758 * @bug 4153072
759 */
760void RBBITest::TestBug4153072() {
761    UErrorCode status = U_ZERO_ERROR;
762    BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
763    if (U_FAILURE(status))
764    {
765        errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
766        return;
767    }
768    UnicodeString str("...Hello, World!...");
769    int32_t begin = 3;
770    int32_t end = str.length() - 3;
771    UBool onBoundary;
772
773    StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
774    iter->adoptText(textIterator);
775    int index;
776    // Note: with the switch to UText, there is no way to restrict the
777    //       iteration range to begin at an index other than zero.
778    //       String character iterators created with a non-zero bound are
779    //         treated by RBBI as being empty.
780    for (index = -1; index < begin + 1; ++index) {
781        onBoundary = iter->isBoundary(index);
782        if (index == 0?  !onBoundary : onBoundary) {
783            errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
784                            " and begin index = " + begin);
785        }
786    }
787    delete iter;
788}
789
790
791//
792// Test for problem reported by Ashok Matoria on 9 July 2007
793//    One.<kSoftHyphen><kSpace>Two.
794//
795//    Sentence break at start (0) and then on calling next() it breaks at
796//   'T' of "Two". Now, at this point if I do next() and
797//    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
798//
799void RBBITest::TestBug5775() {
800    UErrorCode status = U_ZERO_ERROR;
801    BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
802    TEST_ASSERT_SUCCESS(status);
803    if (U_FAILURE(status)) {
804        return;
805    }
806// Check for status first for better handling of no data errors.
807    TEST_ASSERT(bi != NULL);
808    if (bi == NULL) {
809        return;
810    }
811
812    UnicodeString s("One.\\u00ad Two.", -1, US_INV);
813    //               01234      56789
814    s = s.unescape();
815    bi->setText(s);
816    int pos = bi->next();
817    TEST_ASSERT(pos == 6);
818    pos = bi->next();
819    TEST_ASSERT(pos == 10);
820    pos = bi->previous();
821    TEST_ASSERT(pos == 6);
822    delete bi;
823}
824
825
826
827//------------------------------------------------------------------------------
828//
829//   RBBITest::Extended    Run  RBBI Tests from an external test data file
830//
831//------------------------------------------------------------------------------
832
// One test case handed from TestExtended() to executeTest().
// The struct has no constructor or destructor: the code that fills it in is
// responsible for initializing every member and for releasing whatever the
// pointers refer to.
struct TestParams {
    BreakIterator   *bi;               // Iterator under test; executeTest() does nothing when this is NULL.
    UnicodeString    dataToBreak;      // The text that is given to bi->setText().
    UVector32       *expectedBreaks;   // Indexed by offset into dataToBreak:
                                       //    0  - no break expected at this offset
                                       //   -1  - break expected, with a rule status of 0
                                       //   nnn - break expected, with rule status nnn
    UVector32       *srcLine;          // Indexed by offset into dataToBreak: line number in the
                                       //   test data file, used only for error messages.
    UVector32       *srcCol;           // Indexed by offset into dataToBreak: column in the
                                       //   test data file, used only for error messages.
};
840
841void RBBITest::executeTest(TestParams *t) {
842    int32_t    bp;
843    int32_t    prevBP;
844    int32_t    i;
845
846    if (t->bi == NULL) {
847        return;
848    }
849
850    t->bi->setText(t->dataToBreak);
851    //
852    //  Run the iterator forward
853    //
854    prevBP = -1;
855    for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
856        if (prevBP ==  bp) {
857            // Fail for lack of forward progress.
858            errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
859                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
860            break;
861        }
862
863        // Check that there were we didn't miss an expected break between the last one
864        //  and this one.
865        for (i=prevBP+1; i<bp; i++) {
866            if (t->expectedBreaks->elementAti(i) != 0) {
867                int expected[] = {0, i};
868                printStringBreaks(t->dataToBreak, expected, 2);
869                errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
870                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
871            }
872        }
873
874        // Check that the break we did find was expected
875        if (t->expectedBreaks->elementAti(bp) == 0) {
876            int expected[] = {0, bp};
877            printStringBreaks(t->dataToBreak, expected, 2);
878            errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
879                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
880        } else {
881            // The break was expected.
882            //   Check that the {nnn} tag value is correct.
883            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
884            if (expectedTagVal == -1) {
885                expectedTagVal = 0;
886            }
887            int32_t line = t->srcLine->elementAti(bp);
888            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
889            if (rs != expectedTagVal) {
890                errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
891                      "          Actual, Expected status = %4d, %4d",
892                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
893            }
894        }
895
896
897        prevBP = bp;
898    }
899
900    // Verify that there were no missed expected breaks after the last one found
901    for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
902        if (t->expectedBreaks->elementAti(i) != 0) {
903            errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
904                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
905        }
906    }
907
908    //
909    //  Run the iterator backwards, verify that the same breaks are found.
910    //
911    prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
912    for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
913        if (prevBP ==  bp) {
914            // Fail for lack of progress.
915            errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
916                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
917            break;
918        }
919
920        // Check that there were we didn't miss an expected break between the last one
921        //  and this one.  (UVector returns zeros for index out of bounds.)
922        for (i=prevBP-1; i>bp; i--) {
923            if (t->expectedBreaks->elementAti(i) != 0) {
924                errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
925                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
926            }
927        }
928
929        // Check that the break we did find was expected
930        if (t->expectedBreaks->elementAti(bp) == 0) {
931            errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
932                   bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
933        } else {
934            // The break was expected.
935            //   Check that the {nnn} tag value is correct.
936            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
937            if (expectedTagVal == -1) {
938                expectedTagVal = 0;
939            }
940            int line = t->srcLine->elementAti(bp);
941            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
942            if (rs != expectedTagVal) {
943                errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
944                      "          Actual, Expected status = %4d, %4d",
945                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
946            }
947        }
948
949        prevBP = bp;
950    }
951
952    // Verify that there were no missed breaks prior to the last one found
953    for (i=prevBP-1; i>=0; i--) {
954        if (t->expectedBreaks->elementAti(i) != 0) {
955            errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
956                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
957        }
958    }
959}
960
961
962void RBBITest::TestExtended() {
963#if !UCONFIG_NO_REGULAR_EXPRESSIONS
964    UErrorCode      status  = U_ZERO_ERROR;
965    Locale          locale("");
966
967    UnicodeString       rules;
968    TestParams          tp;
969    tp.bi             = NULL;
970    tp.expectedBreaks = new UVector32(status);
971    tp.srcLine        = new UVector32(status);
972    tp.srcCol         = new UVector32(status);
973
974    RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
975    if (U_FAILURE(status)) {
976        dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
977    }
978
979
980    //
981    //  Open and read the test data file.
982    //
983    const char *testDataDirectory = IntlTest::getSourceTestData(status);
984    char testFileName[1000];
985    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
986        errln("Can't open test data.  Path too long.");
987        return;
988    }
989    strcpy(testFileName, testDataDirectory);
990    strcat(testFileName, "rbbitst.txt");
991
992    int    len;
993    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
994    if (U_FAILURE(status)) {
995        return; /* something went wrong, error already output */
996    }
997
998
999
1000
1001    //
1002    //  Put the test data into a UnicodeString
1003    //
1004    UnicodeString testString(FALSE, testFile, len);
1005
1006    enum EParseState{
1007        PARSE_COMMENT,
1008        PARSE_TAG,
1009        PARSE_DATA,
1010        PARSE_NUM
1011    }
1012    parseState = PARSE_TAG;
1013
1014    EParseState savedState = PARSE_TAG;
1015
1016    static const UChar CH_LF        = 0x0a;
1017    static const UChar CH_CR        = 0x0d;
1018    static const UChar CH_HASH      = 0x23;
1019    /*static const UChar CH_PERIOD    = 0x2e;*/
1020    static const UChar CH_LT        = 0x3c;
1021    static const UChar CH_GT        = 0x3e;
1022    static const UChar CH_BACKSLASH = 0x5c;
1023    static const UChar CH_BULLET    = 0x2022;
1024
1025    int32_t    lineNum  = 1;
1026    int32_t    colStart = 0;
1027    int32_t    column   = 0;
1028    int32_t    charIdx  = 0;
1029
1030    int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1031
1032    for (charIdx = 0; charIdx < len; ) {
1033        status = U_ZERO_ERROR;
1034        UChar  c = testString.charAt(charIdx);
1035        charIdx++;
1036        if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1037            // treat CRLF as a unit
1038            c = CH_LF;
1039            charIdx++;
1040        }
1041        if (c == CH_LF || c == CH_CR) {
1042            lineNum++;
1043            colStart = charIdx;
1044        }
1045        column = charIdx - colStart + 1;
1046
1047        switch (parseState) {
1048        case PARSE_COMMENT:
1049            if (c == 0x0a || c == 0x0d) {
1050                parseState = savedState;
1051            }
1052            break;
1053
1054        case PARSE_TAG:
1055            {
1056            if (c == CH_HASH) {
1057                parseState = PARSE_COMMENT;
1058                savedState = PARSE_TAG;
1059                break;
1060            }
1061            if (u_isUWhiteSpace(c)) {
1062                break;
1063            }
1064            if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1065                delete tp.bi;
1066                tp.bi = BreakIterator::createWordInstance(locale,  status);
1067                charIdx += 5;
1068                break;
1069            }
1070            if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1071                delete tp.bi;
1072                tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1073                charIdx += 5;
1074                break;
1075            }
1076            if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1077                delete tp.bi;
1078                tp.bi = BreakIterator::createLineInstance(locale,  status);
1079                charIdx += 5;
1080                break;
1081            }
1082            if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1083                delete tp.bi;
1084                tp.bi = NULL;
1085                tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1086                charIdx += 5;
1087                break;
1088            }
1089            if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1090                delete tp.bi;
1091                tp.bi = BreakIterator::createTitleInstance(locale,  status);
1092                charIdx += 6;
1093                break;
1094            }
1095
1096            // <locale  loc_name>
1097            localeMatcher.reset(testString);
1098            if (localeMatcher.lookingAt(charIdx-1, status)) {
1099                UnicodeString localeName = localeMatcher.group(1, status);
1100                char localeName8[100];
1101                localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1102                locale = Locale::createFromName(localeName8);
1103                charIdx += localeMatcher.group(0, status).length();
1104                TEST_ASSERT_SUCCESS(status);
1105                break;
1106            }
1107            if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1108                parseState = PARSE_DATA;
1109                charIdx += 5;
1110                tp.dataToBreak = "";
1111                tp.expectedBreaks->removeAllElements();
1112                tp.srcCol ->removeAllElements();
1113                tp.srcLine->removeAllElements();
1114                break;
1115            }
1116
1117            errln("line %d: Tag expected in test file.", lineNum);
1118            parseState = PARSE_COMMENT;
1119            savedState = PARSE_DATA;
1120            goto end_test; // Stop the test.
1121            }
1122            break;
1123
1124        case PARSE_DATA:
1125            if (c == CH_BULLET) {
1126                int32_t  breakIdx = tp.dataToBreak.length();
1127                tp.expectedBreaks->setSize(breakIdx+1);
1128                tp.expectedBreaks->setElementAt(-1, breakIdx);
1129                tp.srcLine->setSize(breakIdx+1);
1130                tp.srcLine->setElementAt(lineNum, breakIdx);
1131                tp.srcCol ->setSize(breakIdx+1);
1132                tp.srcCol ->setElementAt(column, breakIdx);
1133                break;
1134            }
1135
1136            if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1137                // Add final entry to mappings from break location to source file position.
1138                //  Need one extra because last break position returned is after the
1139                //    last char in the data, not at the last char.
1140                tp.srcLine->addElement(lineNum, status);
1141                tp.srcCol ->addElement(column, status);
1142
1143                parseState = PARSE_TAG;
1144                charIdx += 6;
1145
1146                // RUN THE TEST!
1147                executeTest(&tp);
1148                break;
1149            }
1150
1151            if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1152                // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1153                // Get the code point from the name and insert it into the test data.
1154                //   (Damn, no API takes names in Unicode  !!!
1155                //    we've got to take it back to char *)
1156                int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1157                int32_t nameLength = nameEndIdx - (charIdx+2);
1158                char charNameBuf[200];
1159                UChar32 theChar = -1;
1160                if (nameEndIdx != -1) {
1161                    UErrorCode status = U_ZERO_ERROR;
1162                    testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1163                    charNameBuf[sizeof(charNameBuf)-1] = 0;
1164                    theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1165                    if (U_FAILURE(status)) {
1166                        theChar = -1;
1167                    }
1168                }
1169                if (theChar == -1) {
1170                    errln("Error in named character in test file at line %d, col %d",
1171                        lineNum, column);
1172                } else {
1173                    // Named code point was recognized.  Insert it
1174                    //   into the test data.
1175                    tp.dataToBreak.append(theChar);
1176                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
1177                        tp.srcLine->addElement(lineNum, status);
1178                        tp.srcCol ->addElement(column, status);
1179                    }
1180                }
1181                if (nameEndIdx > charIdx) {
1182                    charIdx = nameEndIdx+1;
1183
1184                }
1185                break;
1186            }
1187
1188
1189
1190
1191            if (testString.compare(charIdx-1, 2, "<>") == 0) {
1192                charIdx++;
1193                int32_t  breakIdx = tp.dataToBreak.length();
1194                tp.expectedBreaks->setSize(breakIdx+1);
1195                tp.expectedBreaks->setElementAt(-1, breakIdx);
1196                tp.srcLine->setSize(breakIdx+1);
1197                tp.srcLine->setElementAt(lineNum, breakIdx);
1198                tp.srcCol ->setSize(breakIdx+1);
1199                tp.srcCol ->setElementAt(column, breakIdx);
1200                break;
1201            }
1202
1203            if (c == CH_LT) {
1204                tagValue   = 0;
1205                parseState = PARSE_NUM;
1206                break;
1207            }
1208
1209            if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1210                parseState = PARSE_COMMENT;
1211                savedState = PARSE_DATA;
1212                break;
1213            }
1214
1215            if (c == CH_BACKSLASH) {
1216                // Check for \ at end of line, a line continuation.
1217                //     Advance over (discard) the newline
1218                UChar32 cp = testString.char32At(charIdx);
1219                if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1220                    // We have a CR LF
1221                    //  Need an extra increment of the input ptr to move over both of them
1222                    charIdx++;
1223                }
1224                if (cp == CH_LF || cp == CH_CR) {
1225                    lineNum++;
1226                    colStart = charIdx;
1227                    charIdx++;
1228                    break;
1229                }
1230
1231                // Let unescape handle the back slash.
1232                cp = testString.unescapeAt(charIdx);
1233                if (cp != -1) {
1234                    // Escape sequence was recognized.  Insert the char
1235                    //   into the test data.
1236                    tp.dataToBreak.append(cp);
1237                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
1238                        tp.srcLine->addElement(lineNum, status);
1239                        tp.srcCol ->addElement(column, status);
1240                    }
1241                    break;
1242                }
1243
1244
1245                // Not a recognized backslash escape sequence.
1246                // Take the next char as a literal.
1247                //  TODO:  Should this be an error?
1248                c = testString.charAt(charIdx);
1249                charIdx = testString.moveIndex32(charIdx, 1);
1250            }
1251
1252            // Normal, non-escaped data char.
1253            tp.dataToBreak.append(c);
1254
1255            // Save the mapping from offset in the data to line/column numbers in
1256            //   the original input file.  Will be used for better error messages only.
1257            //   If there's an expected break before this char, the slot in the mapping
1258            //     vector will already be set for this char; don't overwrite it.
1259            if (tp.dataToBreak.length() > tp.srcLine->size()) {
1260                tp.srcLine->addElement(lineNum, status);
1261                tp.srcCol ->addElement(column, status);
1262            }
1263            break;
1264
1265
1266        case PARSE_NUM:
1267            // We are parsing an expected numeric tag value, like <1234>,
1268            //   within a chunk of data.
1269            if (u_isUWhiteSpace(c)) {
1270                break;
1271            }
1272
1273            if (c == CH_GT) {
1274                // Finished the number.  Add the info to the expected break data,
1275                //   and switch parse state back to doing plain data.
1276                parseState = PARSE_DATA;
1277                if (tagValue == 0) {
1278                    tagValue = -1;
1279                }
1280                int32_t  breakIdx = tp.dataToBreak.length();
1281                tp.expectedBreaks->setSize(breakIdx+1);
1282                tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1283                tp.srcLine->setSize(breakIdx+1);
1284                tp.srcLine->setElementAt(lineNum, breakIdx);
1285                tp.srcCol ->setSize(breakIdx+1);
1286                tp.srcCol ->setElementAt(column, breakIdx);
1287                break;
1288            }
1289
1290            if (u_isdigit(c)) {
1291                tagValue = tagValue*10 + u_charDigitValue(c);
1292                break;
1293            }
1294
1295            errln("Syntax Error in test file at line %d, col %d",
1296                lineNum, column);
1297            parseState = PARSE_COMMENT;
1298            goto end_test; // Stop the test
1299            break;
1300        }
1301
1302
1303        if (U_FAILURE(status)) {
1304            dataerrln("ICU Error %s while parsing test file at line %d.",
1305                u_errorName(status), lineNum);
1306            status = U_ZERO_ERROR;
1307            goto end_test; // Stop the test
1308        }
1309
1310    }
1311
1312end_test:
1313    delete tp.bi;
1314    delete tp.expectedBreaks;
1315    delete tp.srcLine;
1316    delete tp.srcCol;
1317    delete [] testFile;
1318#endif
1319}
1320
1321
1322//-------------------------------------------------------------------------------
1323//
//  TestDictRules   create a break iterator from source rules that includes a
//                  dictionary range.   Regression for bug #7130.  Source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
//                  but the dictionary code, without a type, would loop.
1328//
1329//-------------------------------------------------------------------------------
1330void RBBITest::TestDictRules() {
1331    const char *rules =  "$dictionary = [a-z]; \n"
1332                         "!!forward; \n"
1333                         "$dictionary $dictionary; \n"
1334                         "!!reverse; \n"
1335                         "$dictionary $dictionary; \n";
1336    const char *text = "aa";
1337    UErrorCode status = U_ZERO_ERROR;
1338    UParseError parseError;
1339
1340    RuleBasedBreakIterator bi(rules, parseError, status);
1341    if (U_SUCCESS(status)) {
1342        UnicodeString utext = text;
1343        bi.setText(utext);
1344        int32_t position;
1345        int32_t loops;
1346        for (loops = 0; loops<10; loops++) {
1347            position = bi.next();
1348            if (position == RuleBasedBreakIterator::DONE) {
1349                break;
1350            }
1351        }
1352        TEST_ASSERT(loops == 1);
1353    } else {
1354        dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1355    }
1356}
1357
1358
1359
1360//-------------------------------------------------------------------------------
1361//
//    ReadAndConvertFile   Read a text data file, convert it to UChars, and
//    return the data in one big UChar * buffer, which the caller must delete.
1364//
1365//    parameters:
//          fileName:   the path of the file to open.  It is passed to fopen() unchanged,
//                      so callers must include any directory part themselves.
1368//          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1369//          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1370//                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1371//                      Pass NULL for the system default encoding.
1372//          status
1373//    returns:
1374//                      The file data, converted to UChar.
1375//                      The caller must delete this when done with
1376//                           delete [] theBuffer;
1377//
1378//    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1379//           Move this function to some common place.
1380//
1381//--------------------------------------------------------------------------------
1382UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1383    UChar       *retPtr  = NULL;
1384    char        *fileBuf = NULL;
1385    UConverter* conv     = NULL;
1386    FILE        *f       = NULL;
1387
1388    ulen = 0;
1389    if (U_FAILURE(status)) {
1390        return retPtr;
1391    }
1392
1393    //
1394    //  Open the file.
1395    //
1396    f = fopen(fileName, "rb");
1397    if (f == 0) {
1398        dataerrln("Error opening test data file %s\n", fileName);
1399        status = U_FILE_ACCESS_ERROR;
1400        return NULL;
1401    }
1402    //
1403    //  Read it in
1404    //
1405    int   fileSize;
1406    int   amt_read;
1407
1408    fseek( f, 0, SEEK_END);
1409    fileSize = ftell(f);
1410    fileBuf = new char[fileSize];
1411    fseek(f, 0, SEEK_SET);
1412    amt_read = fread(fileBuf, 1, fileSize, f);
1413    if (amt_read != fileSize || fileSize <= 0) {
1414        errln("Error reading test data file.");
1415        goto cleanUpAndReturn;
1416    }
1417
1418    //
1419    // Look for a Unicode Signature (BOM) on the data just read
1420    //
1421    int32_t        signatureLength;
1422    const char *   fileBufC;
1423    const char*    bomEncoding;
1424
1425    fileBufC = fileBuf;
1426    bomEncoding = ucnv_detectUnicodeSignature(
1427        fileBuf, fileSize, &signatureLength, &status);
1428    if(bomEncoding!=NULL ){
1429        fileBufC  += signatureLength;
1430        fileSize  -= signatureLength;
1431        encoding = bomEncoding;
1432    }
1433
1434    //
1435    // Open a converter to take the rule file to UTF-16
1436    //
1437    conv = ucnv_open(encoding, &status);
1438    if (U_FAILURE(status)) {
1439        goto cleanUpAndReturn;
1440    }
1441
1442    //
1443    // Convert the rules to UChar.
1444    //  Preflight first to determine required buffer size.
1445    //
1446    ulen = ucnv_toUChars(conv,
1447        NULL,           //  dest,
1448        0,              //  destCapacity,
1449        fileBufC,
1450        fileSize,
1451        &status);
1452    if (status == U_BUFFER_OVERFLOW_ERROR) {
1453        // Buffer Overflow is expected from the preflight operation.
1454        status = U_ZERO_ERROR;
1455
1456        retPtr = new UChar[ulen+1];
1457        ucnv_toUChars(conv,
1458            retPtr,       //  dest,
1459            ulen+1,
1460            fileBufC,
1461            fileSize,
1462            &status);
1463    }
1464
1465cleanUpAndReturn:
1466    fclose(f);
1467    delete []fileBuf;
1468    ucnv_close(conv);
1469    if (U_FAILURE(status)) {
1470        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1471        delete []retPtr;
1472        retPtr = 0;
1473        ulen   = 0;
1474    };
1475    return retPtr;
1476}
1477
1478
1479
1480//--------------------------------------------------------------------------------------------
1481//
1482//   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1483//
1484//-------------------------------------------------------------------------------------------
1485void RBBITest::TestUnicodeFiles() {
1486    RuleBasedBreakIterator  *bi;
1487    UErrorCode               status = U_ZERO_ERROR;
1488
1489    bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1490    TEST_ASSERT_SUCCESS(status);
1491    if (U_SUCCESS(status)) {
1492        runUnicodeTestData("GraphemeBreakTest.txt", bi);
1493    }
1494    delete bi;
1495
1496    bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1497    TEST_ASSERT_SUCCESS(status);
1498    if (U_SUCCESS(status)) {
1499        runUnicodeTestData("WordBreakTest.txt", bi);
1500    }
1501    delete bi;
1502
1503    bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1504    TEST_ASSERT_SUCCESS(status);
1505    if (U_SUCCESS(status)) {
1506        runUnicodeTestData("SentenceBreakTest.txt", bi);
1507    }
1508    delete bi;
1509
1510    bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1511    TEST_ASSERT_SUCCESS(status);
1512    if (U_SUCCESS(status)) {
1513        runUnicodeTestData("LineBreakTest.txt", bi);
1514    }
1515    delete bi;
1516}
1517
1518
1519//--------------------------------------------------------------------------------------------
1520//
1521//   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1522//
1523//-------------------------------------------------------------------------------------------
1524void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1525#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1526    // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
1527    UBool isTicket7270Fixed = isICUVersionAtLeast(52, 1);
1528    UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
1529    UErrorCode  status = U_ZERO_ERROR;
1530
1531    //
1532    //  Open and read the test data file, put it into a UnicodeString.
1533    //
1534    const char *testDataDirectory = IntlTest::getSourceTestData(status);
1535    char testFileName[1000];
1536    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1537        dataerrln("Can't open test data.  Path too long.");
1538        return;
1539    }
1540    strcpy(testFileName, testDataDirectory);
1541    strcat(testFileName, fileName);
1542
1543    logln("Opening data file %s\n", fileName);
1544
1545    int    len;
1546    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1547    if (status != U_FILE_ACCESS_ERROR) {
1548        TEST_ASSERT_SUCCESS(status);
1549        TEST_ASSERT(testFile != NULL);
1550    }
1551    if (U_FAILURE(status) || testFile == NULL) {
1552        return; /* something went wrong, error already output */
1553    }
1554    UnicodeString testFileAsString(TRUE, testFile, len);
1555
1556    //
1557    //  Parse the test data file using a regular expression.
1558    //  Each kind of token is recognized in its own capture group; what type of item was scanned
1559    //     is identified by which group had a match.
1560    //
1561    //    Caputure Group #                  1          2            3            4           5
1562    //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1563    //
1564    UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1565    RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1566    UnicodeString   testString;
1567    UVector32       breakPositions(status);
1568    int             lineNumber = 1;
1569    TEST_ASSERT_SUCCESS(status);
1570    if (U_FAILURE(status)) {
1571        return;
1572    }
1573
1574    //
1575    //  Scan through each test case, building up the string to be broken in testString,
1576    //   and the positions that should be boundaries in the breakPositions vector.
1577    //
1578    int spin = 0;
1579    while (tokenMatcher.find()) {
1580      	if(tokenMatcher.hitEnd()) {
1581          /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1582             This occurred when the text file was corrupt (wasn't marked as UTF-8)
1583             and caused an infinite loop here on EBCDIC systems!
1584          */
1585          fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1586          //	   return;
1587      	}
1588        if (tokenMatcher.start(1, status) >= 0) {
1589            // Scanned a divide sign, indicating a break position in the test data.
1590            if (testString.length()>0) {
1591                breakPositions.addElement(testString.length(), status);
1592            }
1593        }
1594        else if (tokenMatcher.start(2, status) >= 0) {
1595            // Scanned an 'x', meaning no break at this position in the test data
1596            //   Nothing to be done here.
1597            }
1598        else if (tokenMatcher.start(3, status) >= 0) {
1599            // Scanned Hex digits.  Convert them to binary, append to the character data string.
1600            const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1601            int length = hexNumber.length();
1602            if (length<=8) {
1603                char buf[10];
1604                hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1605                UChar32 c = (UChar32)strtol(buf, NULL, 16);
1606                if (c<=0x10ffff) {
1607                    testString.append(c);
1608                } else {
1609                    errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1610                       fileName, lineNumber);
1611                }
1612            } else {
1613                errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1614                       fileName, lineNumber);
1615             }
1616        }
1617        else if (tokenMatcher.start(4, status) >= 0) {
1618            // Scanned to end of a line, possibly skipping over a comment in the process.
1619            //   If the line from the file contained test data, run the test now.
1620            //
1621            if (testString.length() > 0) {
1622// TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data.
1623//             Rule 8
1624//                ZW SP* <break>
1625//             is not yet implemented.
1626if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber ||
1627                                            5202 == lineNumber ||
1628                                            5214 == lineNumber ||
1629                                            5246 == lineNumber ||
1630                                            5298 == lineNumber ||
1631                                            5302 == lineNumber ))) {
1632                checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1633}
1634            }
1635
1636            // Clear out this test case.
1637            //    The string and breakPositions vector will be refilled as the next
1638            //       test case is parsed.
1639            testString.remove();
1640            breakPositions.removeAllElements();
1641            lineNumber++;
1642        } else {
1643            // Scanner catchall.  Something unrecognized appeared on the line.
1644            char token[16];
1645            UnicodeString uToken = tokenMatcher.group(0, status);
1646            uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1647            token[sizeof(token)-1] = 0;
1648            errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1649
1650            // Clean up, in preparation for continuing with the next line.
1651            testString.remove();
1652            breakPositions.removeAllElements();
1653            lineNumber++;
1654        }
1655        TEST_ASSERT_SUCCESS(status);
1656        if (U_FAILURE(status)) {
1657            break;
1658        }
1659    }
1660
1661    delete [] testFile;
1662 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1663}
1664
1665//--------------------------------------------------------------------------------------------
1666//
1667//   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1668//                            test data files.  Do only a simple, forward-only check -
1669//                            this test is mostly to check that ICU and the Unicode
1670//                            data agree with each other.
1671//
1672//--------------------------------------------------------------------------------------------
1673void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1674                         const UnicodeString &testString,   // Text data to be broken
1675                         UVector32 *breakPositions,         // Positions where breaks should be found.
1676                         RuleBasedBreakIterator *bi) {
1677    int32_t pos;                 // Break Position in the test string
1678    int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1679    int32_t expectedPos;         // Expected break position (index into test string)
1680
1681    bi->setText(testString);
1682    pos = bi->first();
1683    pos = bi->next();
1684
1685    while (pos != BreakIterator::DONE) {
1686        if (expectedI >= breakPositions->size()) {
1687            errln("Test file \"%s\", line %d, unexpected break found at position %d",
1688                testFileName, lineNumber, pos);
1689            break;
1690        }
1691        expectedPos = breakPositions->elementAti(expectedI);
1692        if (pos < expectedPos) {
1693            errln("Test file \"%s\", line %d, unexpected break found at position %d",
1694                testFileName, lineNumber, pos);
1695            break;
1696        }
1697        if (pos > expectedPos) {
1698            errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1699                testFileName, lineNumber, expectedPos);
1700            break;
1701        }
1702        pos = bi->next();
1703        expectedI++;
1704    }
1705
1706    if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1707        errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1708            testFileName, lineNumber, breakPositions->elementAti(expectedI));
1709    }
1710}
1711
1712
1713
1714#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1715//---------------------------------------------------------------------------------------
1716//
//   class RBBIMonkeyKind
1718//
1719//      Monkey Test for Break Iteration
1720//      Abstract interface class.   Concrete derived classes independently
1721//      implement the break rules for different iterator types.
1722//
//      The Monkey Test itself doesn't know which type of break iterator it is
1724//      testing, but works purely in terms of the interface defined here.
1725//
1726//---------------------------------------------------------------------------------------
1727class RBBIMonkeyKind {
1728public:
1729    // Return a UVector of UnicodeSets, representing the character classes used
1730    //   for this type of iterator.
1731    virtual  UVector  *charClasses() = 0;
1732
1733    // Set the test text on which subsequent calls to next() will operate
1734    virtual  void      setText(const UnicodeString &s) = 0;
1735
1736    // Find the next break postion, starting from the prev break position, or from zero.
1737    // Return -1 after reaching end of string.
1738    virtual  int32_t   next(int32_t i) = 0;
1739
1740    virtual ~RBBIMonkeyKind();
1741    UErrorCode       deferredStatus;
1742
1743
1744protected:
1745    RBBIMonkeyKind();
1746
1747private:
1748};
1749
1750RBBIMonkeyKind::RBBIMonkeyKind() {
1751    deferredStatus = U_ZERO_ERROR;
1752}
1753
1754RBBIMonkeyKind::~RBBIMonkeyKind() {
1755}
1756
1757
1758//----------------------------------------------------------------------------------------
1759//
1760//   Random Numbers.  Similar to standard lib rand() and srand()
1761//                    Not using library to
1762//                      1.  Get same results on all platforms.
1763//                      2.  Get access to current seed, to more easily reproduce failures.
1764//
1765//---------------------------------------------------------------------------------------
// Current state of the generator.  Kept at file scope so the seed in effect
// can be read back or set when reproducing a failure.
static uint32_t m_seed = 1;

// Advance the linear congruential generator one step and return a value in
// the range 0..32767 taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;    // wraps modulo 2^32
    return (m_seed >> 16) & 0x7fff;
}
1773
1774
1775//------------------------------------------------------------------------------------------
1776//
1777//   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1778//                             of RBBIMonkeyKind.
1779//
1780//------------------------------------------------------------------------------------------
1781class RBBICharMonkey: public RBBIMonkeyKind {
1782public:
1783    RBBICharMonkey();
1784    virtual          ~RBBICharMonkey();
1785    virtual  UVector *charClasses();
1786    virtual  void     setText(const UnicodeString &s);
1787    virtual  int32_t  next(int32_t i);
1788private:
1789    UVector   *fSets;
1790
1791    UnicodeSet  *fCRLFSet;
1792    UnicodeSet  *fControlSet;
1793    UnicodeSet  *fExtendSet;
1794    UnicodeSet  *fRegionalIndicatorSet;
1795    UnicodeSet  *fPrependSet;
1796    UnicodeSet  *fSpacingSet;
1797    UnicodeSet  *fLSet;
1798    UnicodeSet  *fVSet;
1799    UnicodeSet  *fTSet;
1800    UnicodeSet  *fLVSet;
1801    UnicodeSet  *fLVTSet;
1802    UnicodeSet  *fHangulSet;
1803    UnicodeSet  *fAnySet;
1804
1805    const UnicodeString *fText;
1806};
1807
1808
1809RBBICharMonkey::RBBICharMonkey() {
1810    UErrorCode  status = U_ZERO_ERROR;
1811
1812    fText = NULL;
1813
1814    fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1815    fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
1816    fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
1817    fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1818    fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1819    fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1820    fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1821    fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1822    fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1823    fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1824    fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1825    fHangulSet  = new UnicodeSet();
1826    fHangulSet->addAll(*fLSet);
1827    fHangulSet->addAll(*fVSet);
1828    fHangulSet->addAll(*fTSet);
1829    fHangulSet->addAll(*fLVSet);
1830    fHangulSet->addAll(*fLVTSet);
1831    fAnySet     = new UnicodeSet(0, 0x10ffff);
1832
1833    fSets       = new UVector(status);
1834    fSets->addElement(fCRLFSet,    status);
1835    fSets->addElement(fControlSet, status);
1836    fSets->addElement(fExtendSet,  status);
1837    fSets->addElement(fRegionalIndicatorSet, status);
1838    if (!fPrependSet->isEmpty()) {
1839        fSets->addElement(fPrependSet, status);
1840    }
1841    fSets->addElement(fSpacingSet, status);
1842    fSets->addElement(fHangulSet,  status);
1843    fSets->addElement(fAnySet,     status);
1844    if (U_FAILURE(status)) {
1845        deferredStatus = status;
1846    }
1847}
1848
1849
1850void RBBICharMonkey::setText(const UnicodeString &s) {
1851    fText = &s;
1852}
1853
1854
1855
1856int32_t RBBICharMonkey::next(int32_t prevPos) {
1857    int    p0, p1, p2, p3;    // Indices of the significant code points around the
1858                              //   break position being tested.  The candidate break
1859                              //   location is before p2.
1860
1861    int     breakPos = -1;
1862
1863    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1864
1865    if (U_FAILURE(deferredStatus)) {
1866        return -1;
1867    }
1868
1869    // Previous break at end of string.  return DONE.
1870    if (prevPos >= fText->length()) {
1871        return -1;
1872    }
1873    p0 = p1 = p2 = p3 = prevPos;
1874    c3 =  fText->char32At(prevPos);
1875    c0 = c1 = c2 = 0;
1876
1877    // Loop runs once per "significant" character position in the input text.
1878    for (;;) {
1879        // Move all of the positions forward in the input string.
1880        p0 = p1;  c0 = c1;
1881        p1 = p2;  c1 = c2;
1882        p2 = p3;  c2 = c3;
1883
1884        // Advancd p3 by one codepoint
1885        p3 = fText->moveIndex32(p3, 1);
1886        c3 = fText->char32At(p3);
1887
1888        if (p1 == p2) {
1889            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1890            continue;
1891        }
1892        if (p2 == fText->length()) {
1893            // Reached end of string.  Always a break position.
1894            break;
1895        }
1896
1897        // Rule  GB3   CR x LF
1898        //     No Extend or Format characters may appear between the CR and LF,
1899        //     which requires the additional check for p2 immediately following p1.
1900        //
1901        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1902            continue;
1903        }
1904
1905        // Rule (GB4).   ( Control | CR | LF ) <break>
1906        if (fControlSet->contains(c1) ||
1907            c1 == 0x0D ||
1908            c1 == 0x0A)  {
1909            break;
1910        }
1911
1912        // Rule (GB5)    <break>  ( Control | CR | LF )
1913        //
1914        if (fControlSet->contains(c2) ||
1915            c2 == 0x0D ||
1916            c2 == 0x0A)  {
1917            break;
1918        }
1919
1920
1921        // Rule (GB6)  L x ( L | V | LV | LVT )
1922        if (fLSet->contains(c1) &&
1923               (fLSet->contains(c2)  ||
1924                fVSet->contains(c2)  ||
1925                fLVSet->contains(c2) ||
1926                fLVTSet->contains(c2))) {
1927            continue;
1928        }
1929
1930        // Rule (GB7)    ( LV | V )  x  ( V | T )
1931        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1932            (fVSet->contains(c2) || fTSet->contains(c2)))  {
1933            continue;
1934        }
1935
1936        // Rule (GB8)    ( LVT | T)  x T
1937        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1938            fTSet->contains(c2))  {
1939            continue;
1940        }
1941
1942        // Rule (GB8a)    Regional_Indicator x Regional_Indicator
1943        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1944            continue;
1945        }
1946
1947        // Rule (GB9)    Numeric x ALetter
1948        if (fExtendSet->contains(c2))  {
1949            continue;
1950        }
1951
1952        // Rule (GB9a)   x  SpacingMark
1953        if (fSpacingSet->contains(c2)) {
1954            continue;
1955        }
1956
1957        // Rule (GB9b)   Prepend x
1958        if (fPrependSet->contains(c1)) {
1959            continue;
1960        }
1961
1962        // Rule (GB10)  Any  <break>  Any
1963        break;
1964    }
1965
1966    breakPos = p2;
1967    return breakPos;
1968}
1969
1970
1971
1972UVector  *RBBICharMonkey::charClasses() {
1973    return fSets;
1974}
1975
1976
1977RBBICharMonkey::~RBBICharMonkey() {
1978    delete fSets;
1979    delete fCRLFSet;
1980    delete fControlSet;
1981    delete fExtendSet;
1982    delete fRegionalIndicatorSet;
1983    delete fPrependSet;
1984    delete fSpacingSet;
1985    delete fLSet;
1986    delete fVSet;
1987    delete fTSet;
1988    delete fLVSet;
1989    delete fLVTSet;
1990    delete fHangulSet;
1991    delete fAnySet;
1992}
1993
1994//------------------------------------------------------------------------------------------
1995//
1996//   class RBBIWordMonkey      Word Break specific implementation
1997//                             of RBBIMonkeyKind.
1998//
1999//------------------------------------------------------------------------------------------
2000class RBBIWordMonkey: public RBBIMonkeyKind {
2001public:
2002    RBBIWordMonkey();
2003    virtual          ~RBBIWordMonkey();
2004    virtual  UVector *charClasses();
2005    virtual  void     setText(const UnicodeString &s);
2006    virtual int32_t   next(int32_t i);
2007private:
2008    UVector      *fSets;
2009
2010    UnicodeSet  *fCRSet;
2011    UnicodeSet  *fLFSet;
2012    UnicodeSet  *fNewlineSet;
2013    UnicodeSet  *fKatakanaSet;
2014    UnicodeSet  *fALetterSet;
2015    // TODO(jungshik): Do we still need this change?
2016    // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
2017    UnicodeSet  *fMidNumLetSet;
2018    UnicodeSet  *fMidLetterSet;
2019    UnicodeSet  *fMidNumSet;
2020    UnicodeSet  *fNumericSet;
2021    UnicodeSet  *fFormatSet;
2022    UnicodeSet  *fOtherSet;
2023    UnicodeSet  *fExtendSet;
2024    UnicodeSet  *fExtendNumLetSet;
2025    UnicodeSet  *fRegionalIndicatorSet;
2026    UnicodeSet  *fDictionaryCjkSet;
2027
2028    RegexMatcher  *fMatcher;
2029
2030    const UnicodeString  *fText;
2031};
2032
2033
2034RBBIWordMonkey::RBBIWordMonkey()
2035{
2036    UErrorCode  status = U_ZERO_ERROR;
2037
2038    fSets            = new UVector(status);
2039
2040    fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2041    fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2042    fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2043    fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2044    // Exclude Hangul syllables from ALetterSet during testing.
2045    // Leave CJK dictionary characters out from the monkey tests!
2046#if 0
2047    fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
2048                                      "[\\p{Line_Break = Complex_Context}"
2049                                      "-\\p{Grapheme_Cluster_Break = Extend}"
2050                                      "-\\p{Grapheme_Cluster_Break = Control}"
2051                                      "]]",
2052                                      status);
2053#endif
2054    fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2055    fALetterSet->removeAll(*fDictionaryCjkSet);
2056    fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2057    fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2058    fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2059    fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2060    // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2061    // we should figure out why
2062    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2063    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2064    fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2065    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2066    fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2067
2068    fOtherSet        = new UnicodeSet();
2069    if(U_FAILURE(status)) {
2070      deferredStatus = status;
2071      return;
2072    }
2073
2074    fOtherSet->complement();
2075    fOtherSet->removeAll(*fCRSet);
2076    fOtherSet->removeAll(*fLFSet);
2077    fOtherSet->removeAll(*fNewlineSet);
2078    fOtherSet->removeAll(*fKatakanaSet);
2079    fOtherSet->removeAll(*fALetterSet);
2080    fOtherSet->removeAll(*fMidLetterSet);
2081    fOtherSet->removeAll(*fMidNumSet);
2082    fOtherSet->removeAll(*fNumericSet);
2083    fOtherSet->removeAll(*fExtendNumLetSet);
2084    fOtherSet->removeAll(*fFormatSet);
2085    fOtherSet->removeAll(*fExtendSet);
2086    fOtherSet->removeAll(*fRegionalIndicatorSet);
2087    // Inhibit dictionary characters from being tested at all.
2088    fOtherSet->removeAll(*fDictionaryCjkSet);
2089    fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2090
2091    fSets->addElement(fCRSet,        status);
2092    fSets->addElement(fLFSet,        status);
2093    fSets->addElement(fNewlineSet,   status);
2094    fSets->addElement(fALetterSet,   status);
2095    //fSets->addElement(fKatakanaSet,  status); //TODO: work out how to test katakana
2096    fSets->addElement(fMidLetterSet, status);
2097    fSets->addElement(fMidNumLetSet, status);
2098    fSets->addElement(fMidNumSet,    status);
2099    fSets->addElement(fNumericSet,   status);
2100    fSets->addElement(fFormatSet,    status);
2101    fSets->addElement(fExtendSet,    status);
2102    fSets->addElement(fOtherSet,     status);
2103    fSets->addElement(fExtendNumLetSet, status);
2104    fSets->addElement(fRegionalIndicatorSet, status);
2105
2106    if (U_FAILURE(status)) {
2107        deferredStatus = status;
2108    }
2109}
2110
2111void RBBIWordMonkey::setText(const UnicodeString &s) {
2112    fText       = &s;
2113}
2114
2115
2116int32_t RBBIWordMonkey::next(int32_t prevPos) {
2117    int    p0, p1, p2, p3;    // Indices of the significant code points around the
2118                              //   break position being tested.  The candidate break
2119                              //   location is before p2.
2120
2121    int     breakPos = -1;
2122
2123    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2124
2125    if (U_FAILURE(deferredStatus)) {
2126        return -1;
2127    }
2128
2129    // Prev break at end of string.  return DONE.
2130    if (prevPos >= fText->length()) {
2131        return -1;
2132    }
2133    p0 = p1 = p2 = p3 = prevPos;
2134    c3 =  fText->char32At(prevPos);
2135    c0 = c1 = c2 = 0;
2136
2137    // Loop runs once per "significant" character position in the input text.
2138    for (;;) {
2139        // Move all of the positions forward in the input string.
2140        p0 = p1;  c0 = c1;
2141        p1 = p2;  c1 = c2;
2142        p2 = p3;  c2 = c3;
2143
2144        // Advancd p3 by    X(Extend | Format)*   Rule 4
2145        //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2146        do {
2147            p3 = fText->moveIndex32(p3, 1);
2148            c3 = fText->char32At(p3);
2149            if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2150               break;
2151            };
2152        }
2153        while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2154
2155
2156        if (p1 == p2) {
2157            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2158            continue;
2159        }
2160        if (p2 == fText->length()) {
2161            // Reached end of string.  Always a break position.
2162            break;
2163        }
2164
2165        // Rule  (3)   CR x LF
2166        //     No Extend or Format characters may appear between the CR and LF,
2167        //     which requires the additional check for p2 immediately following p1.
2168        //
2169        if (c1==0x0D && c2==0x0A) {
2170            continue;
2171        }
2172
2173        // Rule (3a)  Break before and after newlines (including CR and LF)
2174        //
2175        if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2176            break;
2177        };
2178        if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2179            break;
2180        };
2181
2182        // Rule (5).   ALetter x ALetter
2183        if (fALetterSet->contains(c1) &&
2184            fALetterSet->contains(c2))  {
2185            continue;
2186        }
2187
2188        // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
2189        //
2190        if ( fALetterSet->contains(c1)   &&
2191             (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
2192             fALetterSet->contains(c3)) {
2193            continue;
2194        }
2195
2196
2197        // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
2198        if (fALetterSet->contains(c0) &&
2199            (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
2200            fALetterSet->contains(c2)) {
2201            continue;
2202        }
2203
2204        // Rule (8)    Numeric x Numeric
2205        if (fNumericSet->contains(c1) &&
2206            fNumericSet->contains(c2))  {
2207            continue;
2208        }
2209
2210        // Rule (9)    ALetter x Numeric
2211        if (fALetterSet->contains(c1) &&
2212            fNumericSet->contains(c2))  {
2213            continue;
2214        }
2215
2216        // Rule (10)    Numeric x ALetter
2217        if (fNumericSet->contains(c1) &&
2218            fALetterSet->contains(c2))  {
2219            continue;
2220        }
2221
2222        // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
2223        if (fNumericSet->contains(c0) &&
2224            (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
2225            fNumericSet->contains(c2)) {
2226            continue;
2227        }
2228
2229        // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
2230        if (fNumericSet->contains(c1) &&
2231            (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
2232            fNumericSet->contains(c3)) {
2233            continue;
2234        }
2235
2236        // Rule (13)  Katakana x Katakana
2237        if (fKatakanaSet->contains(c1) &&
2238            fKatakanaSet->contains(c2))  {
2239            continue;
2240        }
2241
2242        // Rule 13a
2243        if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2244             fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2245             fExtendNumLetSet->contains(c2)) {
2246                continue;
2247        }
2248
2249        // Rule 13b
2250        if (fExtendNumLetSet->contains(c1) &&
2251                (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2252                fKatakanaSet->contains(c2)))  {
2253                continue;
2254        }
2255
2256        // Rule 13c
2257        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2258            continue;
2259        }
2260
2261        // Rule 14.  Break found here.
2262        break;
2263    }
2264
2265    breakPos = p2;
2266    return breakPos;
2267}
2268
2269
2270UVector  *RBBIWordMonkey::charClasses() {
2271    return fSets;
2272}
2273
2274
2275RBBIWordMonkey::~RBBIWordMonkey() {
2276    delete fSets;
2277    delete fCRSet;
2278    delete fLFSet;
2279    delete fNewlineSet;
2280    delete fKatakanaSet;
2281    delete fALetterSet;
2282    delete fMidNumLetSet;
2283    delete fMidLetterSet;
2284    delete fMidNumSet;
2285    delete fNumericSet;
2286    delete fFormatSet;
2287    delete fExtendSet;
2288    delete fExtendNumLetSet;
2289    delete fRegionalIndicatorSet;
2290    delete fDictionaryCjkSet;
2291    delete fOtherSet;
2292}
2293
2294
2295
2296
2297//------------------------------------------------------------------------------------------
2298//
2299//   class RBBISentMonkey      Sentence Break specific implementation
2300//                             of RBBIMonkeyKind.
2301//
2302//------------------------------------------------------------------------------------------
2303class RBBISentMonkey: public RBBIMonkeyKind {
2304public:
2305    RBBISentMonkey();
2306    virtual          ~RBBISentMonkey();
2307    virtual  UVector *charClasses();
2308    virtual  void     setText(const UnicodeString &s);
2309    virtual int32_t   next(int32_t i);
2310private:
2311    int               moveBack(int posFrom);
2312    int               moveForward(int posFrom);
2313    UChar32           cAt(int pos);
2314
2315    UVector      *fSets;
2316
2317    UnicodeSet  *fSepSet;
2318    UnicodeSet  *fFormatSet;
2319    UnicodeSet  *fSpSet;
2320    UnicodeSet  *fLowerSet;
2321    UnicodeSet  *fUpperSet;
2322    UnicodeSet  *fOLetterSet;
2323    UnicodeSet  *fNumericSet;
2324    UnicodeSet  *fATermSet;
2325    UnicodeSet  *fSContinueSet;
2326    UnicodeSet  *fSTermSet;
2327    UnicodeSet  *fCloseSet;
2328    UnicodeSet  *fOtherSet;
2329    UnicodeSet  *fExtendSet;
2330
2331    const UnicodeString  *fText;
2332
2333};
2334
2335RBBISentMonkey::RBBISentMonkey()
2336{
2337    UErrorCode  status = U_ZERO_ERROR;
2338
2339    fSets            = new UVector(status);
2340
2341    //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2342    //                       set and made into character classes of their own.  For the monkey impl,
2343    //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2344    fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2345    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2346    fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2347    fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2348    fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2349    fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2350    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2351    fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2352    fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2353    fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2354    fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2355    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2356    fOtherSet        = new UnicodeSet();
2357
2358    if(U_FAILURE(status)) {
2359      deferredStatus = status;
2360      return;
2361    }
2362
2363    fOtherSet->complement();
2364    fOtherSet->removeAll(*fSepSet);
2365    fOtherSet->removeAll(*fFormatSet);
2366    fOtherSet->removeAll(*fSpSet);
2367    fOtherSet->removeAll(*fLowerSet);
2368    fOtherSet->removeAll(*fUpperSet);
2369    fOtherSet->removeAll(*fOLetterSet);
2370    fOtherSet->removeAll(*fNumericSet);
2371    fOtherSet->removeAll(*fATermSet);
2372    fOtherSet->removeAll(*fSContinueSet);
2373    fOtherSet->removeAll(*fSTermSet);
2374    fOtherSet->removeAll(*fCloseSet);
2375    fOtherSet->removeAll(*fExtendSet);
2376
2377    fSets->addElement(fSepSet,       status);
2378    fSets->addElement(fFormatSet,    status);
2379    fSets->addElement(fSpSet,        status);
2380    fSets->addElement(fLowerSet,     status);
2381    fSets->addElement(fUpperSet,     status);
2382    fSets->addElement(fOLetterSet,   status);
2383    fSets->addElement(fNumericSet,   status);
2384    fSets->addElement(fATermSet,     status);
2385    fSets->addElement(fSContinueSet, status);
2386    fSets->addElement(fSTermSet,     status);
2387    fSets->addElement(fCloseSet,     status);
2388    fSets->addElement(fOtherSet,     status);
2389    fSets->addElement(fExtendSet,    status);
2390
2391    if (U_FAILURE(status)) {
2392        deferredStatus = status;
2393    }
2394}
2395
2396
2397
2398void RBBISentMonkey::setText(const UnicodeString &s) {
2399    fText       = &s;
2400}
2401
2402UVector  *RBBISentMonkey::charClasses() {
2403    return fSets;
2404}
2405
2406
2407//  moveBack()   Find the "significant" code point preceding the index i.
2408//               Skips over ($Extend | $Format)* .
2409//
2410int RBBISentMonkey::moveBack(int i) {
2411    if (i <= 0) {
2412        return -1;
2413    }
2414    UChar32   c;
2415    int32_t   j = i;
2416    do {
2417        j = fText->moveIndex32(j, -1);
2418        c = fText->char32At(j);
2419    }
2420    while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2421    return j;
2422
2423 }
2424
2425
2426int RBBISentMonkey::moveForward(int i) {
2427    if (i>=fText->length()) {
2428        return fText->length();
2429    }
2430    UChar32   c;
2431    int32_t   j = i;
2432    do {
2433        j = fText->moveIndex32(j, 1);
2434        c = cAt(j);
2435    }
2436    while (fFormatSet->contains(c) || fExtendSet->contains(c));
2437    return j;
2438}
2439
2440UChar32 RBBISentMonkey::cAt(int pos) {
2441    if (pos<0 || pos>=fText->length()) {
2442        return -1;
2443    } else {
2444        return fText->char32At(pos);
2445    }
2446}
2447
//  next()   Find the sentence break that follows prevPos by applying the
//           sentence break rules coded below, one candidate position at a time.
//
//           Returns -1 if deferredStatus holds a failure, or if prevPos is
//           already at or beyond the end of the text.  Otherwise returns the
//           index of the break that was found (the end of the text at the latest).
//
int32_t RBBISentMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 c;

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    // Prime the sliding window: only p3/c3 hold real data at this point, the
    //   other slots are filled in as the window shifts at the top of the loop.
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;

    // Loop runs once per "significant" character position in the input text.
    //   "continue" means "no break before p2, try the next position";
    //   "break" means "p2 is the break position".
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format)*   Rule 4
        p3 = moveForward(p3);
        c3 = cAt(p3);

        // Rule (3)  CR x LF
        //           The p2==(p1+1) test requires the LF to follow the CR directly.
        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
            continue;
        }

        // Rule (4).   Sep  <break>
        if (fSepSet->contains(c1)) {
            p2 = p1+1;   // Separators don't combine with Extend or Format.
            break;
        }

        if (p2 >= fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        if (p2 == prevPos) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }

        // Rule (6).   ATerm x Numeric
        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (7).  Upper ATerm  x  Upper
        if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
            continue;
        }

        // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
        //           Note:  STerm | ATerm are added to the negated part of the expression by a
        //                  note to the Unicode 5.0 documents.
        //           Scan back from p1 over Sp* then Close*.  moveBack() returns -1 at the
        //           start of the text and cAt(-1) is -1, which stops both scans.
        int p8 = p1;
        while (fSpSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        while (fCloseSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        if (fATermSet->contains(cAt(p8))) {
            // Left context matched.  Now scan forward from p2 to the first character
            //   from the listed classes (or the end of the text, where cAt() gives -1).
            p8=p2;
            for (;;) {
                c = cAt(p8);
                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
                    fLowerSet->contains(c) || fSepSet->contains(c) ||
                    fATermSet->contains(c) || fSTermSet->contains(c))  {
                    break;
                }
                p8 = moveForward(p8);
            }
            // No break only if the scan stopped on a Lower.
            if (fLowerSet->contains(cAt(p8))) {
                continue;
            }
        }

        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
            p8 = p1;
            while (fSpSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            while (fCloseSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            c = cAt(p8);
            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
                continue;
            }
        }

        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
        int p9 = p1;
        while (fCloseSet->contains(cAt(p9))) {
            p9 = moveBack(p9);
        }
        c = cAt(p9);
        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
        int p10 = p1;
        while (fSpSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        while (fCloseSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
        //            The optional separator is stepped over at most once (an "if", not a loop).
        int p11 = p1;
        if (fSepSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fSpSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fCloseSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
            break;
        }

        //  Rule (12)  Any x Any
        continue;
    }
    // Every exit from the loop leaves the break position in p2.
    breakPos = p2;
    return breakPos;
}
2600
2601RBBISentMonkey::~RBBISentMonkey() {
2602    delete fSets;
2603    delete fSepSet;
2604    delete fFormatSet;
2605    delete fSpSet;
2606    delete fLowerSet;
2607    delete fUpperSet;
2608    delete fOLetterSet;
2609    delete fNumericSet;
2610    delete fATermSet;
2611    delete fSContinueSet;
2612    delete fSTermSet;
2613    delete fCloseSet;
2614    delete fOtherSet;
2615    delete fExtendSet;
2616}
2617
2618
2619
2620//-------------------------------------------------------------------------------------------
2621//
2622//  RBBILineMonkey
2623//
2624//-------------------------------------------------------------------------------------------
2625
2626class RBBILineMonkey: public RBBIMonkeyKind {
2627public:
2628    RBBILineMonkey();
2629    virtual          ~RBBILineMonkey();
2630    virtual  UVector *charClasses();
2631    virtual  void     setText(const UnicodeString &s);
2632    virtual  int32_t  next(int32_t i);
2633    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2634private:
2635    UVector      *fSets;
2636
2637    UnicodeSet  *fBK;
2638    UnicodeSet  *fCR;
2639    UnicodeSet  *fLF;
2640    UnicodeSet  *fCM;
2641    UnicodeSet  *fNL;
2642    UnicodeSet  *fSG;
2643    UnicodeSet  *fWJ;
2644    UnicodeSet  *fZW;
2645    UnicodeSet  *fGL;
2646    UnicodeSet  *fCB;
2647    UnicodeSet  *fSP;
2648    UnicodeSet  *fB2;
2649    UnicodeSet  *fBA;
2650    UnicodeSet  *fBB;
2651    UnicodeSet  *fHY;
2652    UnicodeSet  *fH2;
2653    UnicodeSet  *fH3;
2654    UnicodeSet  *fCL;
2655    UnicodeSet  *fCP;
2656    UnicodeSet  *fEX;
2657    UnicodeSet  *fIN;
2658    UnicodeSet  *fJL;
2659    UnicodeSet  *fJV;
2660    UnicodeSet  *fJT;
2661    UnicodeSet  *fNS;
2662    UnicodeSet  *fOP;
2663    UnicodeSet  *fQU;
2664    UnicodeSet  *fIS;
2665    UnicodeSet  *fNU;
2666    UnicodeSet  *fPO;
2667    UnicodeSet  *fPR;
2668    UnicodeSet  *fSY;
2669    UnicodeSet  *fAI;
2670    UnicodeSet  *fAL;
2671    UnicodeSet  *fCJ;
2672    UnicodeSet  *fHL;
2673    UnicodeSet  *fID;
2674    UnicodeSet  *fRI;
2675    UnicodeSet  *fSA;
2676    UnicodeSet  *fXX;
2677
2678    BreakIterator  *fCharBI;
2679
2680    const UnicodeString  *fText;
2681    int32_t              *fOrigPositions;
2682
2683    RegexMatcher         *fNumberMatcher;
2684    RegexMatcher         *fLB11Matcher;
2685};
2686
2687
2688RBBILineMonkey::RBBILineMonkey()
2689{
2690    UErrorCode  status = U_ZERO_ERROR;
2691
2692    fSets  = new UVector(status);
2693
2694    fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2695    fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2696    fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2697    fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2698    fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2699    fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2700    fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2701    fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2702    fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2703    fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2704    fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2705    fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2706    fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2707    fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2708    fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2709    fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2710    fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2711    fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2712    fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2713    fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2714    fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2715    fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2716    fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2717    fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2718    fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2719    fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2720    fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2721    fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2722    fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2723    fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2724    fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2725    fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2726    fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2727    fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2728    fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2729    fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2730    fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2731    fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2732    fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2733    fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2734
2735    if (U_FAILURE(status)) {
2736        deferredStatus = status;
2737        fCharBI = NULL;
2738        fNumberMatcher = NULL;
2739        return;
2740    }
2741
2742    fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2743    fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2744    fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
2745    fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2746
2747    fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2748
2749    fSets->addElement(fBK, status);
2750    fSets->addElement(fCR, status);
2751    fSets->addElement(fLF, status);
2752    fSets->addElement(fCM, status);
2753    fSets->addElement(fNL, status);
2754    fSets->addElement(fWJ, status);
2755    fSets->addElement(fZW, status);
2756    fSets->addElement(fGL, status);
2757    fSets->addElement(fCB, status);
2758    fSets->addElement(fSP, status);
2759    fSets->addElement(fB2, status);
2760    fSets->addElement(fBA, status);
2761    fSets->addElement(fBB, status);
2762    fSets->addElement(fHY, status);
2763    fSets->addElement(fH2, status);
2764    fSets->addElement(fH3, status);
2765    fSets->addElement(fCL, status);
2766    fSets->addElement(fCP, status);
2767    fSets->addElement(fEX, status);
2768    fSets->addElement(fIN, status);
2769    fSets->addElement(fJL, status);
2770    fSets->addElement(fJT, status);
2771    fSets->addElement(fJV, status);
2772    fSets->addElement(fNS, status);
2773    fSets->addElement(fOP, status);
2774    fSets->addElement(fQU, status);
2775    fSets->addElement(fIS, status);
2776    fSets->addElement(fNU, status);
2777    fSets->addElement(fPO, status);
2778    fSets->addElement(fPR, status);
2779    fSets->addElement(fSY, status);
2780    fSets->addElement(fAI, status);
2781    fSets->addElement(fAL, status);
2782    fSets->addElement(fHL, status);
2783    fSets->addElement(fID, status);
2784    fSets->addElement(fWJ, status);
2785    fSets->addElement(fRI, status);
2786    fSets->addElement(fSA, status);
2787    fSets->addElement(fSG, status);
2788
2789    const char *rules =
2790            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2791            "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2792            "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2793            "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2794            "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
2795            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
2796
2797    fNumberMatcher = new RegexMatcher(
2798        UnicodeString(rules, -1, US_INV), 0, status);
2799
2800    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2801
2802    if (U_FAILURE(status)) {
2803        deferredStatus = status;
2804    }
2805}
2806
2807
2808void RBBILineMonkey::setText(const UnicodeString &s) {
2809    fText       = &s;
2810    fCharBI->setText(s);
2811    fNumberMatcher->reset(s);
2812}
2813
2814//
2815//  rule9Adjust
2816//     Line Break TR rules 9 and 10 implementation.
//     This deals with combining marks and other sequences
//     that must be treated as if they were something other than what they actually are.
2819//
2820//     This is factored out into a separate function because it must be applied twice for
2821//     each potential break, once to the chars before the position being checked, then
2822//     again to the text following the possible break.
2823//
2824void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2825    if (pos == -1) {
2826        // Invalid initial position.  Happens during the warmup iteration of the
2827        //   main loop in next().
2828        return;
2829    }
2830
2831    int32_t  nPos = *nextPos;
2832
2833    // LB 9  Keep combining sequences together.
2834    //  advance over any CM class chars.  Note that Line Break CM is different
2835    //  from the normal Grapheme Extend property.
2836    if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2837          *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2838        for (;;) {
2839            *nextChar = fText->char32At(nPos);
2840            if (!fCM->contains(*nextChar)) {
2841                break;
2842            }
2843            nPos = fText->moveIndex32(nPos, 1);
2844        }
2845    }
2846
2847
2848    // LB 9 Treat X CM* as if it were x.
2849    //       No explicit action required.
2850
2851    // LB 10  Treat any remaining combining mark as AL
2852    if (fCM->contains(*posChar)) {
2853        *posChar = 0x41;   // thisChar = 'A';
2854    }
2855
2856    // Push the updated nextPos and nextChar back to our caller.
2857    // This only makes a difference if posChar got bigger by consuming a
2858    // combining sequence.
2859    *nextPos  = nPos;
2860    *nextChar = fText->char32At(nPos);
2861}
2862
2863
2864
//
//  next()    Find the line break that follows startPos by applying the LB
//            rules coded below to successive pairs of characters.
//
//            Returns -1 if deferredStatus holds a failure, or if startPos is
//            already at or beyond the end of the text.  Otherwise returns the
//            index of the break that was found (the end of the text at the latest).
//
int32_t RBBILineMonkey::next(int32_t startPos) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t    pos;       //  Index of the char following a potential break position
    UChar32    thisChar;  //  Character at above position "pos"

    int32_t    prevPos;   //  Index of the char preceding a potential break position
    UChar32    prevChar;  //  Character at above position.  Note that prevChar
                          //   and thisChar may not be adjacent because combining
                          //   characters between them will be ignored.

    int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
    UChar32    prevCharX2;

    int32_t    nextPos;   //  Index of the next character following pos.
                          //     Usually skips over combining marks.
    int32_t    nextCPPos; //  Index of the code point following "pos."
                          //     May point to a combining mark.
    int32_t    tPos;      //  temp value.
    UChar32    c;

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    if (startPos >= fText->length()) {
        return -1;
    }


    // Initial values for loop.  Loop will run the first time without finding breaks,
    //                           while the invalid values shift out and the "this" and
    //                           "prev" positions are filled in with good values.
    pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
    thisChar = prevChar  = prevCharX2 = 0;
    nextPos  = nextCPPos = startPos;


    // Loop runs once per position in the test text, until a break position
    //  is found.
    //  "continue" means "no break before pos, try the next position";
    //  "break" means "pos is the break position".
    for (;;) {
        prevPosX2 = prevPos;
        prevCharX2 = prevChar;

        prevPos   = pos;
        prevChar  = thisChar;

        pos       = nextPos;
        thisChar  = fText->char32At(pos);

        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;

        // Rule LB2 - Break at end of text.
        if (pos >= fText->length()) {
            break;
        }

        // Rule LB 9 - adjust for combining sequences.
        //             We do this one out-of-order because the adjustment does not change anything
        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
        //             be applied.
        //             The first call may move pos/thisChar past combining marks that belong
        //             to prevChar; the second does the same for nextPos relative to thisChar.
        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
        c = fText->char32At(nextPos);
        rule9Adjust(pos,     &thisChar, &nextPos, &c);

        // If the loop is still warming up - if we haven't shifted the initial
        //   -1 positions out of prevPos yet - loop back to advance the
        //    position in the input without any further looking for breaks.
        if (prevPos == -1) {
            continue;
        }

        // LB 4  Always break after hard line breaks,
        if (fBK->contains(prevChar)) {
            break;
        }

        // LB 5  Break after CR, LF, NL, but not inside CR LF
        if (prevChar == 0x0d && thisChar == 0x0a) {
            continue;
        }
        if (prevChar == 0x0d ||
            prevChar == 0x0a ||
            prevChar == 0x85)  {
            break;
        }

        // LB 6  Don't break before hard line breaks
        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
            fBK->contains(thisChar)) {
                continue;
        }


        // LB 7  Don't break before spaces or zero-width space.
        if (fSP->contains(thisChar)) {
            continue;
        }

        if (fZW->contains(thisChar)) {
            continue;
        }

        // LB 8  Break after zero width space
        if (fZW->contains(prevChar)) {
            break;
        }

        // LB 9, 10  Already done, at top of loop.
        //


        // LB 11  Do not break before or after WORD JOINER and related characters.
        //    x  WJ
        //    WJ  x
        //
        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
            continue;
        }

        // LB 12
        //    GL  x
        if (fGL->contains(prevChar)) {
            continue;
        }

        // LB 12a
        //    [^SP BA HY] x GL
        if (!(fSP->contains(prevChar) ||
              fBA->contains(prevChar) ||
              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
            continue;
        }



        // LB 13  Don't break before closings.
        //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
        //        fall into LB 17 and the more general number regular expression.
        //
        if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
            (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
                                         fEX->contains(thisChar)  ||
            (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
            (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
            continue;
        }

        // LB 14 Don't break after OP SP*
        //       Scan backwards, checking for this sequence.
        //       The OP char could include combining marks, so we actually check for
        //           OP CM* SP*
        //       Another Twist: The Rule 67 fixes may have changed a SP CM
        //       sequence into a ID char, so before scanning back through spaces,
        //       verify that prevChar is indeed a space.  The prevChar variable
        //       may differ from fText[prevPos]
        tPos = prevPos;
        if (fSP->contains(prevChar)) {
            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
        }
        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
            tPos=fText->moveIndex32(tPos, -1);
        }
        if (fOP->contains(fText->char32At(tPos))) {
            continue;
        }


        // LB 15    QU SP* x OP
        if (fOP->contains(thisChar)) {
            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
            // Note: this local tPos shadows the function-scope tPos declared above.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fQU->contains(fText->char32At(tPos))) {
                continue;
            }
        }



        // LB 16   (CL | CP) SP* x NS
        //    Scan backwards for SP* CM* (CL | CP)
        if (fNS->contains(thisChar)) {
            // Note: this local tPos shadows the function-scope tPos declared above.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 17        B2 SP* x B2
        if (fB2->contains(thisChar)) {
            //  Scan backwards, checking for the B2 CM* SP* sequence.
            tPos = prevPos;
            if (fSP->contains(prevChar)) {
                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                    tPos=fText->moveIndex32(tPos, -1);
                }
            }
            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
            if (fB2->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 18    break after space
        if (fSP->contains(prevChar)) {
            break;
        }

        // LB 19
        //    x   QU
        //    QU  x
        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
            continue;
        }

        // LB 20  Break around a CB
        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
            break;
        }

        // LB 21
        //    x   (BA | HY | NS)
        //    BB  x
        if (fBA->contains(thisChar) ||
            fHY->contains(thisChar) ||
            fNS->contains(thisChar) ||
            fBB->contains(prevChar) )   {
            continue;
        }

        // LB 21a
        //   HL (HY | BA) x
        if (fHL->contains(prevCharX2) &&
                (fHY->contains(prevChar) || fBA->contains(prevChar))) {
            continue;
        }

        // LB 22
        //    (AL | HL | ID | IN | NU) x IN
        if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
            (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
            (fID->contains(prevChar) && fIN->contains(thisChar)) ||
            (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
            (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
            continue;
        }


        // LB 23    ID x PO
        //          AL x NU
        //          HL x NU
        //          NU x AL
        //          NU x HL
        if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
            (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
            (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
            (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
            (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
            continue;
        }

        // LB 24  Do not break between prefix and letters or ideographs.
        //        PR x ID
        //        PR x (AL | HL)
        //        PO x (AL | HL)
        if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
            (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
            (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
            continue;
        }



        // LB 25    Numbers
        //          Try to match the number regular expression starting at prevPos.
        if (fNumberMatcher->lookingAt(prevPos, status)) {
            if (U_FAILURE(status)) {
                // Regex error: leave the loop, which returns the current pos.
                break;
            }
            // Matched a number.  But could have been just a single digit, which would
            //    not represent a "no break here" between prevChar and thisChar
            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
            if (numEndIdx > pos) {
                // Number match includes at least our two chars being checked
                if (numEndIdx > nextPos) {
                    // Number match includes additional chars.  Update pos and nextPos
                    //   so that next loop iteration will continue at the end of the number,
                    //   checking for breaks between last char in number & whatever follows.
                    pos = nextPos = numEndIdx;
                    // Back pos up to the last character of the number that is not in CM.
                    do {
                        pos = fText->moveIndex32(pos, -1);
                        thisChar = fText->char32At(pos);
                    } while (fCM->contains(thisChar));
                }
                continue;
            }
        }


        // LB 26 Do not break a Korean syllable.
        //    JL x (JL | JV | H2 | H3)
        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
                                        fJV->contains(thisChar) ||
                                        fH2->contains(thisChar) ||
                                        fH3->contains(thisChar))) {
                                            continue;
                                        }

        //    (JV | H2) x (JV | JT)
        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
                continue;
        }

        //    (JT | H3) x JT
        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
            fJT->contains(thisChar)) {
                continue;
        }

        // LB 27 Treat a Korean Syllable Block the same as ID.
        //    (JL | JV | JT | H2 | H3) x IN
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fIN->contains(thisChar)) {
                continue;
            }
        //    (JL | JV | JT | H2 | H3) x PO
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fPO->contains(thisChar)) {
                continue;
            }
        //    PR x (JL | JV | JT | H2 | H3)
        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
                continue;
            }



        // LB 28  Do not break between alphabetics ("at").
        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            continue;
        }

        // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
        if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            continue;
        }

        // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
        //          (AL | HL | NU) x OP
        //          CP x (AL | HL | NU)
        if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
            continue;
        }
        if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
            continue;
        }

        // LB30a  Do not break between regional indicators.
        //        RI x RI
        if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
            continue;
        }

        // LB 31    Break everywhere else
        break;

    }

    // Every exit from the loop leaves the break position in pos.
    return pos;
}
3247
3248
3249UVector  *RBBILineMonkey::charClasses() {
3250    return fSets;
3251}
3252
3253
3254RBBILineMonkey::~RBBILineMonkey() {
3255    delete fSets;
3256
3257    delete fBK;
3258    delete fCR;
3259    delete fLF;
3260    delete fCM;
3261    delete fNL;
3262    delete fWJ;
3263    delete fZW;
3264    delete fGL;
3265    delete fCB;
3266    delete fSP;
3267    delete fB2;
3268    delete fBA;
3269    delete fBB;
3270    delete fHY;
3271    delete fH2;
3272    delete fH3;
3273    delete fCL;
3274    delete fCP;
3275    delete fEX;
3276    delete fIN;
3277    delete fJL;
3278    delete fJV;
3279    delete fJT;
3280    delete fNS;
3281    delete fOP;
3282    delete fQU;
3283    delete fIS;
3284    delete fNU;
3285    delete fPO;
3286    delete fPR;
3287    delete fSY;
3288    delete fAI;
3289    delete fAL;
3290    delete fCJ;
3291    delete fHL;
3292    delete fID;
3293    delete fRI;
3294    delete fSA;
3295    delete fSG;
3296    delete fXX;
3297
3298    delete fCharBI;
3299    delete fNumberMatcher;
3300}
3301
3302
//-------------------------------------------------------------------------------------------
//
//   TestMonkey
//
//     params
//       seed=nnnnn        Random number starting seed.
//                         Setting the seed allows errors to be reproduced.
//       loop=nnn          Looping count.  Controls running time.
//                         -1:  run forever.
//                          0 or greater:  run length.
//
//       type = char | word | line | sent | title
//                         Kind of break iterator to test.  When omitted, the
//                         char, word, line and sent tests are all run.
//                         "title" is accepted by the option parser, but TestMonkey
//                         has no title test to run for it.
//
//       utext             Hand the test text to the break iterator through a UText
//                         instead of directly as a UnicodeString.
//
//-------------------------------------------------------------------------------------------
3317
3318static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3319    int32_t val = defaultVal;
3320    name.append(" *= *(-?\\d+)");
3321    UErrorCode status = U_ZERO_ERROR;
3322    RegexMatcher m(name, params, 0, status);
3323    if (m.find()) {
3324        // The param exists.  Convert the string to an int.
3325        char valString[100];
3326        int32_t paramLength = m.end(1, status) - m.start(1, status);
3327        if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3328            paramLength = (int32_t)(sizeof(valString)-2);
3329        }
3330        params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3331        val = strtol(valString,  NULL, 10);
3332
3333        // Delete this parameter from the params string.
3334        m.reset();
3335        params = m.replaceFirst("", status);
3336    }
3337    U_ASSERT(U_SUCCESS(status));
3338    return val;
3339}
3340#endif
3341
3342#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3343static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3344                                    BreakIterator *bi,
3345                                    int expected[],
3346                                    int expectedcount)
3347{
3348    int count = 0;
3349    int i = 0;
3350    int forward[50];
3351    bi->setText(ustr);
3352    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3353        forward[count] = i;
3354        if (count < expectedcount && expected[count] != i) {
3355            test->errln("break forward test failed: expected %d but got %d",
3356                        expected[count], i);
3357            break;
3358        }
3359        count ++;
3360    }
3361    if (count != expectedcount) {
3362        printStringBreaks(ustr, expected, expectedcount);
3363        test->errln("break forward test failed: missed %d match",
3364                    expectedcount - count);
3365        return;
3366    }
3367    // testing boundaries
3368    for (i = 1; i < expectedcount; i ++) {
3369        int j = expected[i - 1];
3370        if (!bi->isBoundary(j)) {
3371            printStringBreaks(ustr, expected, expectedcount);
3372            test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3373            return;
3374        }
3375        for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3376            if (bi->isBoundary(j)) {
3377                printStringBreaks(ustr, expected, expectedcount);
3378                test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3379                return;
3380            }
3381        }
3382    }
3383
3384    for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3385        count --;
3386        if (forward[count] != i) {
3387            printStringBreaks(ustr, expected, expectedcount);
3388            test->errln("happy break test previous() failed: expected %d but got %d",
3389                        forward[count], i);
3390            break;
3391        }
3392    }
3393    if (count != 0) {
3394        printStringBreaks(ustr, expected, expectedcount);
3395        test->errln("break test previous() failed: missed a match");
3396        return;
3397    }
3398
3399    // testing preceding
3400    for (i = 0; i < expectedcount - 1; i ++) {
3401        // int j = expected[i] + 1;
3402        int j = ustr.moveIndex32(expected[i], 1);
3403        for (; j <= expected[i + 1]; j ++) {
3404            if (bi->preceding(j) != expected[i]) {
3405                printStringBreaks(ustr, expected, expectedcount);
3406                test->errln("preceding(): Not expecting boundary at position %d", j);
3407                return;
3408            }
3409        }
3410    }
3411}
3412#endif
3413
3414void RBBITest::TestWordBreaks(void)
3415{
3416#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3417
3418    Locale        locale("en");
3419    UErrorCode    status = U_ZERO_ERROR;
3420    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3421    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3422    // Replaced any C+J characters in a row with a random sequence of characters
3423    // of the same length to make our C+J segmentation not get in the way.
3424    static const char *strlist[] =
3425    {
3426    "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3427    "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3428    "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3429    "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3430    "\\uac00\\u3588\\u009c\\u0953\\u194b",
3431    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3432    "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3433    "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3434    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3435    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3436    "\\u2027\\U000e0067\\u0a47\\u00b7",
3437    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3438    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3439    "\\u0589\\U000e006e\\u0a42\\U000104a5",
3440    "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3441    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3442    "\\u0027\\u11af\\U000e0057\\u0602",
3443    "\\U0001d7f2\\U000e007\\u0004\\u0589",
3444    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3445    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3446    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3447    "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3448    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3449    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3450    "\\u0233\\U000e0020\\u0a69\\u0d6a",
3451    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3452    "\\u18f4\\U000e0049\\u20e7\\u2027",
3453    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3454    "\\ua183\\u102d\\u0bec\\u003a",
3455    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3456    "\\u003a\\u0e57\\u0fad\\u002e",
3457    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3458    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3459    "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3460    "\\u003a\\u0664\\u00b7\\u1fba",
3461    "\\u003b\\u0027\\u00b7\\u47a3",
3462    "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3463    "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3464    "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3465    };
3466    int loop;
3467    if (U_FAILURE(status)) {
3468        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3469        return;
3470    }
3471    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3472        // printf("looping %d\n", loop);
3473        UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3474        // RBBICharMonkey monkey;
3475        RBBIWordMonkey monkey;
3476
3477        int expected[50];
3478        int expectedcount = 0;
3479
3480        monkey.setText(ustr);
3481        int i;
3482        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3483            expected[expectedcount ++] = i;
3484        }
3485
3486        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3487    }
3488    delete bi;
3489#endif
3490}
3491
3492void RBBITest::TestWordBoundary(void)
3493{
3494    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3495    Locale        locale("en");
3496    UErrorCode    status = U_ZERO_ERROR;
3497    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3498    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3499    UChar         str[50];
3500    static const char *strlist[] =
3501    {
3502    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3503    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3504    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3505    "\\u2027\\U000e0067\\u0a47\\u00b7",
3506    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3507    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3508    "\\u0589\\U000e006e\\u0a42\\U000104a5",
3509    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3510    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3511    "\\u0027\\u11af\\U000e0057\\u0602",
3512    "\\U0001d7f2\\U000e007\\u0004\\u0589",
3513    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3514    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3515    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3516    "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3517    "\\U000e0065\\u302c\\u09ee\\U000e0068",
3518    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3519    "\\u0233\\U000e0020\\u0a69\\u0d6a",
3520    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3521    "\\u58f4\\U000e0049\\u20e7\\u2027",
3522    "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3523    "\\ua183\\u102d\\u0bec\\u003a",
3524    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3525    "\\u003a\\u0e57\\u0fad\\u002e",
3526    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3527    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3528    "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3529    "\\u003a\\u0664\\u00b7\\u1fba",
3530    "\\u003b\\u0027\\u00b7\\u47a3",
3531    };
3532    int loop;
3533    if (U_FAILURE(status)) {
3534        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3535        return;
3536    }
3537    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3538        // printf("looping %d\n", loop);
3539        u_unescape(strlist[loop], str, 20);
3540        UnicodeString ustr(str);
3541        int forward[50];
3542        int count = 0;
3543
3544        bi->setText(ustr);
3545        int prev = 0;
3546        int i;
3547        for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3548            forward[count ++] = i;
3549            if (i > prev) {
3550                int j;
3551                for (j = prev + 1; j < i; j ++) {
3552                    if (bi->isBoundary(j)) {
3553                        printStringBreaks(ustr, forward, count);
3554                        errln("happy boundary test failed: expected %d not a boundary",
3555                               j);
3556                        return;
3557                    }
3558                }
3559            }
3560            if (!bi->isBoundary(i)) {
3561                printStringBreaks(ustr, forward, count);
3562                errln("happy boundary test failed: expected %d a boundary",
3563                       i);
3564                return;
3565            }
3566            prev = i;
3567        }
3568    }
3569    delete bi;
3570}
3571
3572void RBBITest::TestLineBreaks(void)
3573{
3574#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3575    Locale        locale("en");
3576    UErrorCode    status = U_ZERO_ERROR;
3577    BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3578    const int32_t  STRSIZE = 50;
3579    UChar         str[STRSIZE];
3580    static const char *strlist[] =
3581    {
3582     "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3583     "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3584             "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3585     "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3586             "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3587     "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3588     "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3589     "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3590     "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3591     "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3592     "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3593     "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3594     "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3595     "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3596     "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3597     "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3598     "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3599     "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3600     "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3601     "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3602     "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3603     "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3604     "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3605     "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3606     "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3607     "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3608     "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3609     "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3610     "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3611     "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3612     "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3613     "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3614     "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3615     "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3616     "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3617     "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3618     "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3619     "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3620     "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3621     "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3622     "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3623     "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3624         "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3625         "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3626         "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3627     "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3628         "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3629    };
3630    int loop;
3631    TEST_ASSERT_SUCCESS(status);
3632    if (U_FAILURE(status)) {
3633        return;
3634    }
3635    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3636        // printf("looping %d\n", loop);
3637        int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3638        if (t >= STRSIZE) {
3639            TEST_ASSERT(FALSE);
3640            continue;
3641        }
3642
3643
3644        UnicodeString ustr(str);
3645        RBBILineMonkey monkey;
3646        if (U_FAILURE(monkey.deferredStatus)) {
3647            continue;
3648        }
3649
3650        const int EXPECTEDSIZE = 50;
3651        int expected[EXPECTEDSIZE];
3652        int expectedcount = 0;
3653
3654        monkey.setText(ustr);
3655        int i;
3656        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3657            if (expectedcount >= EXPECTEDSIZE) {
3658                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3659                return;
3660            }
3661            expected[expectedcount ++] = i;
3662        }
3663
3664        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3665    }
3666    delete bi;
3667#endif
3668}
3669
3670void RBBITest::TestSentBreaks(void)
3671{
3672#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3673    Locale        locale("en");
3674    UErrorCode    status = U_ZERO_ERROR;
3675    BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3676    UChar         str[200];
3677    static const char *strlist[] =
3678    {
3679     "Now\ris\nthe\r\ntime\n\rfor\r\r",
3680     "This\n",
3681     "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3682     "\"Sentence ending with a quote.\" Bye.",
3683     "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3684     "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3685     "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3686     "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3687     "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3688     "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3689     "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3690             "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3691             "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3692             "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3693     "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3694             "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3695             "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3696             "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3697             "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3698             "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3699    };
3700    int loop;
3701    if (U_FAILURE(status)) {
3702        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3703        return;
3704    }
3705    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3706        u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3707        UnicodeString ustr(str);
3708
3709        RBBISentMonkey monkey;
3710        if (U_FAILURE(monkey.deferredStatus)) {
3711            continue;
3712        }
3713
3714        const int EXPECTEDSIZE = 50;
3715        int expected[EXPECTEDSIZE];
3716        int expectedcount = 0;
3717
3718        monkey.setText(ustr);
3719        int i;
3720        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3721            if (expectedcount >= EXPECTEDSIZE) {
3722                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3723                return;
3724            }
3725            expected[expectedcount ++] = i;
3726        }
3727
3728        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3729    }
3730    delete bi;
3731#endif
3732}
3733
3734void RBBITest::TestMonkey(char *params) {
3735#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3736
3737    UErrorCode     status    = U_ZERO_ERROR;
3738    int32_t        loopCount = 500;
3739    int32_t        seed      = 1;
3740    UnicodeString  breakType = "all";
3741    Locale         locale("en");
3742    UBool          useUText  = FALSE;
3743
3744    if (quick == FALSE) {
3745        loopCount = 10000;
3746    }
3747
3748    if (params) {
3749        UnicodeString p(params);
3750        loopCount = getIntParam("loop", p, loopCount);
3751        seed      = getIntParam("seed", p, seed);
3752
3753        RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3754        if (m.find()) {
3755            breakType = m.group(1, status);
3756            m.reset();
3757            p = m.replaceFirst("", status);
3758        }
3759
3760        RegexMatcher u(" *utext", p, 0, status);
3761        if (u.find()) {
3762            useUText = TRUE;
3763            u.reset();
3764            p = u.replaceFirst("", status);
3765        }
3766
3767
3768        // m.reset(p);
3769        if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3770            // Each option is stripped out of the option string as it is processed.
3771            // All options have been checked.  The option string should have been completely emptied..
3772            char buf[100];
3773            p.extract(buf, sizeof(buf), NULL, status);
3774            buf[sizeof(buf)-1] = 0;
3775            errln("Unrecognized or extra parameter:  %s\n", buf);
3776            return;
3777        }
3778
3779    }
3780
3781    if (breakType == "char" || breakType == "all") {
3782        RBBICharMonkey  m;
3783        BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3784        if (U_SUCCESS(status)) {
3785            RunMonkey(bi, m, "char", seed, loopCount, useUText);
3786            if (breakType == "all" && useUText==FALSE) {
3787                // Also run a quick test with UText when "all" is specified
3788                RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3789            }
3790        }
3791        else {
3792            errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3793        }
3794        delete bi;
3795    }
3796
3797    if (breakType == "word" || breakType == "all") {
3798        logln("Word Break Monkey Test");
3799        RBBIWordMonkey  m;
3800        BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3801        if (U_SUCCESS(status)) {
3802            RunMonkey(bi, m, "word", seed, loopCount, useUText);
3803        }
3804        else {
3805            errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3806        }
3807        delete bi;
3808    }
3809
3810    if (breakType == "line" || breakType == "all") {
3811        logln("Line Break Monkey Test");
3812        RBBILineMonkey  m;
3813        BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3814        if (loopCount >= 10) {
3815            loopCount = loopCount / 5;   // Line break runs slower than the others.
3816        }
3817        if (U_SUCCESS(status)) {
3818            RunMonkey(bi, m, "line", seed, loopCount, useUText);
3819        }
3820        else {
3821            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3822        }
3823        delete bi;
3824    }
3825
3826    if (breakType == "sent" || breakType == "all"  ) {
3827        logln("Sentence Break Monkey Test");
3828        RBBISentMonkey  m;
3829        BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3830        if (loopCount >= 10) {
3831            loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3832        }
3833        if (U_SUCCESS(status)) {
3834            RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3835        }
3836        else {
3837            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3838        }
3839        delete bi;
3840    }
3841
3842#endif
3843}
3844
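//
//  Run a RBBI monkey test.  Common routine, for all break iterator types.
//    Parameters:
//       bi      - the break iterator to use
//       mk      - MonkeyKind, abstraction for obtaining expected results
//       name    - Name of test (char, word, etc.) for use in error messages
//       seed    - Seed for starting random number generator (parameter from user)
//       numIterations
//               - Number of random test strings to generate and check.
//                 -1 means keep running forever.
//       useUText
//               - If TRUE, the test text is given to the break iterator through
//                 a UText; otherwise it is set directly as a UnicodeString.
//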
3854void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3855                         int32_t numIterations, UBool useUText) {
3856
3857#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3858
3859    const int32_t    TESTSTRINGLEN = 500;
3860    UnicodeString    testText;
3861    int32_t          numCharClasses;
3862    UVector          *chClasses;
3863    int              expected[TESTSTRINGLEN*2 + 1];
3864    int              expectedCount = 0;
3865    char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3866    char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3867    char             reverseBreaks[TESTSTRINGLEN*2+1];
3868    char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3869    char             followingBreaks[TESTSTRINGLEN*2+1];
3870    char             precedingBreaks[TESTSTRINGLEN*2+1];
3871    int              i;
3872    int              loopCount = 0;
3873
3874    m_seed = seed;
3875
3876    numCharClasses = mk.charClasses()->size();
3877    chClasses      = mk.charClasses();
3878
3879    // Check for errors that occured during the construction of the MonkeyKind object.
3880    //  Can't report them where they occured because errln() is a method coming from intlTest,
3881    //  and is not visible outside of RBBITest :-(
3882    if (U_FAILURE(mk.deferredStatus)) {
3883        errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3884        return;
3885    }
3886
3887    // Verify that the character classes all have at least one member.
3888    for (i=0; i<numCharClasses; i++) {
3889        UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3890        if (s == NULL || s->size() == 0) {
3891            errln("Character Class #%d is null or of zero size.", i);
3892            return;
3893        }
3894    }
3895
3896    while (loopCount < numIterations || numIterations == -1) {
3897        if (numIterations == -1 && loopCount % 10 == 0) {
3898            // If test is running in an infinite loop, display a periodic tic so
3899            //   we can tell that it is making progress.
3900            fprintf(stderr, ".");
3901        }
3902        // Save current random number seed, so that we can recreate the random numbers
3903        //   for this loop iteration in event of an error.
3904        seed = m_seed;
3905
3906        // Populate a test string with data.
3907        testText.truncate(0);
3908        for (i=0; i<TESTSTRINGLEN; i++) {
3909            int32_t  aClassNum = m_rand() % numCharClasses;
3910            UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3911            int32_t   charIdx = m_rand() % classSet->size();
3912            UChar32   c = classSet->charAt(charIdx);
3913            if (c < 0) {   // TODO:  deal with sets containing strings.
3914                errln("c < 0");
3915                break;
3916            }
3917            testText.append(c);
3918        }
3919
3920        // Calculate the expected results for this test string.
3921        mk.setText(testText);
3922        memset(expectedBreaks, 0, sizeof(expectedBreaks));
3923        expectedBreaks[0] = 1;
3924        int32_t breakPos = 0;
3925        expectedCount = 0;
3926        for (;;) {
3927            breakPos = mk.next(breakPos);
3928            if (breakPos == -1) {
3929                break;
3930            }
3931            if (breakPos > testText.length()) {
3932                errln("breakPos > testText.length()");
3933            }
3934            expectedBreaks[breakPos] = 1;
3935            U_ASSERT(expectedCount<testText.length());
3936            expected[expectedCount ++] = breakPos;
3937        }
3938
3939        // Find the break positions using forward iteration
3940        memset(forwardBreaks, 0, sizeof(forwardBreaks));
3941        if (useUText) {
3942            UErrorCode status = U_ZERO_ERROR;
3943            UText *testUText = utext_openReplaceable(NULL, &testText, &status);
3944            // testUText = utext_openUnicodeString(testUText, &testText, &status);
3945            bi->setText(testUText, status);
3946            TEST_ASSERT_SUCCESS(status);
3947            utext_close(testUText);   // The break iterator does a shallow clone of the UText
3948                                      //  This UText can be closed immediately, so long as the
3949                                      //  testText string continues to exist.
3950        } else {
3951            bi->setText(testText);
3952        }
3953
3954        for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
3955            if (i < 0 || i > testText.length()) {
3956                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3957                break;
3958            }
3959            forwardBreaks[i] = 1;
3960        }
3961
3962        // Find the break positions using reverse iteration
3963        memset(reverseBreaks, 0, sizeof(reverseBreaks));
3964        for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
3965            if (i < 0 || i > testText.length()) {
3966                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3967                break;
3968            }
3969            reverseBreaks[i] = 1;
3970        }
3971
3972        // Find the break positions using isBoundary() tests.
3973        memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
3974        U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
3975        for (i=0; i<=testText.length(); i++) {
3976            isBoundaryBreaks[i] = bi->isBoundary(i);
3977        }
3978
3979
3980        // Find the break positions using the following() function.
3981        // printf(".");
3982        memset(followingBreaks, 0, sizeof(followingBreaks));
3983        int32_t   lastBreakPos = 0;
3984        followingBreaks[0] = 1;
3985        for (i=0; i<testText.length(); i++) {
3986            breakPos = bi->following(i);
3987            if (breakPos <= i ||
3988                breakPos < lastBreakPos ||
3989                breakPos > testText.length() ||
3990                (breakPos > lastBreakPos && lastBreakPos > i)) {
3991                errln("%s break monkey test: "
3992                    "Out of range value returned by BreakIterator::following().\n"
3993                        "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
3994                         name, seed, i, breakPos, lastBreakPos);
3995                break;
3996            }
3997            followingBreaks[breakPos] = 1;
3998            lastBreakPos = breakPos;
3999        }
4000
4001        // Find the break positions using the preceding() function.
4002        memset(precedingBreaks, 0, sizeof(precedingBreaks));
4003        lastBreakPos = testText.length();
4004        precedingBreaks[testText.length()] = 1;
4005        for (i=testText.length(); i>0; i--) {
4006            breakPos = bi->preceding(i);
4007            if (breakPos >= i ||
4008                breakPos > lastBreakPos ||
4009                (breakPos < 0 && testText.getChar32Start(i)>0) ||
4010                (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4011                errln("%s break monkey test: "
4012                    "Out of range value returned by BreakIterator::preceding().\n"
4013                    "index=%d;  prev returned %d; lastBreak=%d" ,
4014                    name,  i, breakPos, lastBreakPos);
4015                if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4016                    precedingBreaks[i] = 2;   // Forces an error.
4017                }
4018            } else {
4019                if (breakPos >= 0) {
4020                    precedingBreaks[breakPos] = 1;
4021                }
4022                lastBreakPos = breakPos;
4023            }
4024        }
4025
4026        // Compare the expected and actual results.
4027        for (i=0; i<=testText.length(); i++) {
4028            const char *errorType = NULL;
4029            if  (forwardBreaks[i] != expectedBreaks[i]) {
4030                errorType = "next()";
4031            } else if (reverseBreaks[i] != forwardBreaks[i]) {
4032                errorType = "previous()";
4033            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4034                errorType = "isBoundary()";
4035            } else if (followingBreaks[i] != expectedBreaks[i]) {
4036                errorType = "following()";
4037            } else if (precedingBreaks[i] != expectedBreaks[i]) {
4038                errorType = "preceding()";
4039            }
4040
4041
4042            if (errorType != NULL) {
4043                // Format a range of the test text that includes the failure as
4044                //  a data item that can be included in the rbbi test data file.
4045
4046                // Start of the range is the last point where expected and actual results
4047                //   both agreed that there was a break position.
4048                int startContext = i;
4049                int32_t count = 0;
4050                for (;;) {
4051                    if (startContext==0) { break; }
4052                    startContext --;
4053                    if (expectedBreaks[startContext] != 0) {
4054                        if (count == 2) break;
4055                        count ++;
4056                    }
4057                }
4058
4059                // End of range is two expected breaks past the start position.
4060                int endContext = i + 1;
4061                int ci;
4062                for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4063                    for (;;) {
4064                        if (endContext >= testText.length()) {break;}
4065                        if (expectedBreaks[endContext-1] != 0) {
4066                            if (count == 0) break;
4067                            count --;
4068                        }
4069                        endContext ++;
4070                    }
4071                }
4072
4073                // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4074                UnicodeString errorText = "<data>";
4075                /***if (strcmp(errorType, "next()") == 0) {
4076                    startContext = 0;
4077                    endContext = testText.length();
4078
4079                    printStringBreaks(testText, expected, expectedCount);
4080                }***/
4081
4082                for (ci=startContext; ci<endContext;) {
4083                    UnicodeString hexChars("0123456789abcdef");
4084                    UChar32  c;
4085                    int      bn;
4086                    c = testText.char32At(ci);
4087                    if (ci == i) {
4088                        // This is the location of the error.
4089                        errorText.append("<?>");
4090                    } else if (expectedBreaks[ci] != 0) {
4091                        // This a non-error expected break position.
4092                        errorText.append("\\");
4093                    }
4094                    if (c < 0x10000) {
4095                        errorText.append("\\u");
4096                        for (bn=12; bn>=0; bn-=4) {
4097                            errorText.append(hexChars.charAt((c>>bn)&0xf));
4098                        }
4099                    } else {
4100                        errorText.append("\\U");
4101                        for (bn=28; bn>=0; bn-=4) {
4102                            errorText.append(hexChars.charAt((c>>bn)&0xf));
4103                        }
4104                    }
4105                    ci = testText.moveIndex32(ci, 1);
4106                }
4107                errorText.append("\\");
4108                errorText.append("</data>\n");
4109
4110                // Output the error
4111                char  charErrorTxt[500];
4112                UErrorCode status = U_ZERO_ERROR;
4113                errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4114                charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4115                const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4116
4117                errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4118                    name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4119                    errorType, seed, i, charErrorTxt);
4120                break;
4121            }
4122        }
4123
4124        loopCount++;
4125    }
4126#endif
4127}
4128
4129
4130//  Bug 5532.  UTF-8 based UText fails in dictionary code.
4131//             This test checks the initial patch,
4132//             which is to just keep it from crashing.  Correct word boundaries
4133//             await a proper fix to the dictionary code.
4134//
4135void RBBITest::TestBug5532(void)  {
4136   // Text includes a mixture of Thai and Latin.
4137   const unsigned char utf8Data[] = {
4138           0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4139           0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4140           0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4141           0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4142           0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4143           0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4144           0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4145           0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4146           0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4147           0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4148           0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4149
4150    UErrorCode status = U_ZERO_ERROR;
4151    UText utext=UTEXT_INITIALIZER;
4152    utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4153    TEST_ASSERT_SUCCESS(status);
4154
4155    BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4156    TEST_ASSERT_SUCCESS(status);
4157    if (U_SUCCESS(status)) {
4158        bi->setText(&utext, status);
4159        TEST_ASSERT_SUCCESS(status);
4160
4161        int32_t breakCount = 0;
4162        int32_t previousBreak = -1;
4163        for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4164            // For now, just make sure that the break iterator doesn't hang.
4165            TEST_ASSERT(previousBreak < bi->current());
4166            previousBreak = bi->current();
4167        }
4168        TEST_ASSERT(breakCount > 0);
4169    }
4170    delete bi;
4171    utext_close(&utext);
4172}
4173
4174
4175//
4176//  TestDebug    -  A place-holder test for debugging purposes.
4177//                  For putting in fragments of other tests that can be invoked
4178//                  for tracing  without a lot of unwanted extra stuff happening.
4179//
4180void RBBITest::TestDebug(void) {
4181#if 0
4182    UErrorCode   status = U_ZERO_ERROR;
4183    int pos = 0;
4184    int ruleStatus = 0;
4185
4186    RuleBasedBreakIterator* bi =
4187       // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4188       // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4189       (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4190    UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4191    // UnicodeString s("Aaa.  Bcd");
4192    s = s.unescape();
4193    bi->setText(s);
4194    UBool r = bi->isBoundary(8);
4195    printf("%s", r?"true":"false");
4196    return;
4197    pos = bi->last();
4198    do {
4199        // ruleStatus = bi->getRuleStatus();
4200        printf("%d\t%d\n", pos, ruleStatus);
4201        pos = bi->previous();
4202    } while (pos != BreakIterator::DONE);
4203#endif
4204}
4205
4206void RBBITest::TestProperties() {
4207    UErrorCode errorCode = U_ZERO_ERROR;
4208    UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4209    if (!prependSet.isEmpty()) {
4210        errln(
4211            "[:GCB=Prepend:] is not empty any more. "
4212            "Uncomment relevant lines in source/data/brkitr/char.txt and "
4213            "change this test to the opposite condition.");
4214    }
4215}
4216
4217#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
4218