1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2015, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6/************************************************************************
7*   Date        Name        Description
8*   12/15/99    Madhu        Creation.
9*   01/12/2000  Madhu        Updated for changed API and added new tests
10************************************************************************/
11
12#include "utypeinfo.h"  // for 'typeid' to work
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/utypes.h"
19#include "unicode/brkiter.h"
20#include "unicode/rbbi.h"
21#include "unicode/uchar.h"
22#include "unicode/utf16.h"
23#include "unicode/ucnv.h"
24#include "unicode/schriter.h"
25#include "unicode/uniset.h"
26#if !UCONFIG_NO_REGULAR_EXPRESSIONS
27#include "unicode/regex.h"
28#endif
29#include "unicode/ustring.h"
30#include "unicode/utext.h"
31#include "intltest.h"
32#include "rbbitst.h"
33#include <string.h>
34#include "charstr.h"
35#include "uvector.h"
36#include "uvectr32.h"
37#include <stdio.h>
38#include <stdlib.h>
39#include "unicode/numfmt.h"
40#include "unicode/uscript.h"
41#include "cmemory.h"
42
// TEST_ASSERT(x)
//     Reports a failure through errln(), tagged with the current source file and
//     line, when the condition x is false.  errln() must be callable at the point
//     where the macro is expanded.
#define TEST_ASSERT(x) {                                                   \
    if (!(x)) {                                                            \
        errln("Failure in file %s, line %d", __FILE__, __LINE__);          \
    }                                                                      \
}

// TEST_ASSERT_SUCCESS(errcode)
//     Reports a failure through errcheckln(), including the name of the error code,
//     when errcode is a failing UErrorCode.
#define TEST_ASSERT_SUCCESS(errcode) {                                     \
    if (U_FAILURE(errcode)) {                                              \
        errcheckln(errcode,                                                \
                   "Failure in file %s, line %d, status = \"%s\"",         \
                   __FILE__, __LINE__, u_errorName(errcode));              \
    }                                                                      \
}
48
49
50//---------------------------------------------
51// runIndexedTest
52//---------------------------------------------
53
54
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitest.txt.  In most cases it can;
//         that is much less work than writing a new test, the diagnostic output in the
//         event of failures is good, and the test data file is shared with ICU4J, so
//         eventually the test will run there as well, without additional effort.
60
61void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
62{
63    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
64
65    switch (index) {
66#if !UCONFIG_NO_FILE_IO
67        case 0: name = "TestBug4153072";
68            if(exec) TestBug4153072();                         break;
69#else
70        case 0: name = "skip";
71            break;
72#endif
73
74        case 1: name = "skip";
75            break;
76        case 2: name = "TestStatusReturn";
77            if(exec) TestStatusReturn();                       break;
78
79#if !UCONFIG_NO_FILE_IO
80        case 3: name = "TestUnicodeFiles";
81            if(exec) TestUnicodeFiles();                       break;
82        case 4: name = "TestEmptyString";
83            if(exec) TestEmptyString();                        break;
84#else
85        case 3: case 4: name = "skip";
86            break;
87#endif
88
89        case 5: name = "TestGetAvailableLocales";
90            if(exec) TestGetAvailableLocales();                break;
91
92        case 6: name = "TestGetDisplayName";
93            if(exec) TestGetDisplayName();                     break;
94
95#if !UCONFIG_NO_FILE_IO
96        case 7: name = "TestEndBehaviour";
97            if(exec) TestEndBehaviour();                       break;
98        case 8: case 9: case 10: name = "skip";
99             break;
100        case 11: name = "TestWordBreaks";
101             if(exec) TestWordBreaks();                        break;
102        case 12: name = "TestWordBoundary";
103             if(exec) TestWordBoundary();                      break;
104        case 13: name = "TestLineBreaks";
105             if(exec) TestLineBreaks();                        break;
106        case 14: name = "TestSentBreaks";
107             if(exec) TestSentBreaks();                        break;
108        case 15: name = "TestExtended";
109             if(exec) TestExtended();                          break;
110#else
111        case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
112             break;
113#endif
114
115#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
116        case 16:
117            name = "TestMonkey"; if(exec)  TestMonkey(params); break;
118#else
119        case 16:
120             name = "skip";                                    break;
121#endif
122
123#if !UCONFIG_NO_FILE_IO
124        case 17: name = "TestBug3818";
125            if(exec) TestBug3818();                            break;
126#else
127        case 17: name = "skip";
128            break;
129#endif
130
131        case 18: name = "skip";
132            break;
133        case 19: name = "TestDebug";
134            if(exec) TestDebug();                              break;
135        case 20: name = "skip";
136            break;
137
138#if !UCONFIG_NO_FILE_IO
139        case 21: name = "TestBug5775";
140            if (exec) TestBug5775();                           break;
141#else
142        case 21: name = "skip";
143            break;
144#endif
145
146        case 22: name = "TestBug9983";
147            if (exec) TestBug9983();                           break;
148        case 23: name = "TestDictRules";
149            if (exec) TestDictRules();                         break;
150        case 24: name = "TestBug5532";
151            if (exec) TestBug5532();                           break;
152        default: name = ""; break; //needed to end loop
153    }
154}
155
156
157//---------------------------------------------------------------------------
158//
159//   class BITestData   Holds a set of Break iterator test data and results
160//                      Includes
161//                         - the string data to be broken
162//                         - a vector of the expected break positions.
163//                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
165//                         - The expected break tag values.
166//                         - Vectors of actual break positions and tag values.
167//                         - Functions for comparing actual with expected and
168//                            reporting errors.
169//
170//----------------------------------------------------------------------------
171class BITestData {
172public:
173    UnicodeString    fDataToBreak;
174    UVector          fExpectedBreakPositions;
175    UVector          fExpectedTags;
176    UVector          fLineNum;
177    UVector          fActualBreakPositions;   // Test Results.
178    UVector          fActualTags;
179
180    BITestData(UErrorCode &status);
181    void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
182    void             checkResults(const char *heading, RBBITest *test);
183    void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
184    void             clearResults();
185};
186
187//
188// Constructor.
189//
190BITestData::BITestData(UErrorCode &status)
191: fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
192  fActualTags(status)
193{
194}
195
196//
197// addDataChunk.   Add a section (non-breaking) piece if data to the test data.
198//                 The macro form collects the line number, which is helpful
199//                 when tracking down failures.
200//
201//                 A null data item is inserted at the start of each test's data
202//                  to put the starting zero into the data list.  The position saved for
203//                  each non-null item is its ending position.
204//
205#define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
206void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
207    if (U_FAILURE(status)) {return;}
208    if (data != NULL) {
209        fDataToBreak.append(CharsToUnicodeString(data));
210    }
211    fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
212    fExpectedTags.addElement(tag, status);
213    fLineNum.addElement(lineNum, status);
214}
215
216
217//
218//  checkResults.   Compare the actual and expected break positions, report any differences.
219//
220void BITestData::checkResults(const char *heading, RBBITest *test) {
221    int32_t   expectedIndex = 0;
222    int32_t   actualIndex = 0;
223
224    for (;;) {
225        // If we've run through both the expected and actual results vectors, we're done.
226        //   break out of the loop.
227        if (expectedIndex >= fExpectedBreakPositions.size() &&
228            actualIndex   >= fActualBreakPositions.size()) {
229            break;
230        }
231
232
233        if (expectedIndex >= fExpectedBreakPositions.size()) {
234            err(heading, test, expectedIndex-1, actualIndex);
235            actualIndex++;
236            continue;
237        }
238
239        if (actualIndex >= fActualBreakPositions.size()) {
240            err(heading, test, expectedIndex, actualIndex-1);
241            expectedIndex++;
242            continue;
243        }
244
245        if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
246            err(heading, test, expectedIndex, actualIndex);
247            // Try to resync the positions of the indices, to avoid a rash of spurious erros.
248            if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
249                actualIndex++;
250            } else {
251                expectedIndex++;
252            }
253            continue;
254        }
255
256        if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
257            test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
258                heading, fLineNum.elementAt(expectedIndex),
259                fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
260        }
261
262        actualIndex++;
263        expectedIndex++;
264    }
265}
266
267//
268//  err   -  An error was found.  Report it, along with information about where the
269//                                incorrectly broken test data appeared in the source file.
270//
271void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
272{
273    int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
274    int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
275    int32_t   o        = 0;
276    int32_t   line     = fLineNum.elementAti(expectedIdx);
277    if (expectedIdx > 0) {
278        // The line numbers are off by one because a premature break occurs somewhere
279        //    within the previous item, rather than at the start of the current (expected) item.
280        //    We want to report the offset of the unexpected break from the start of
281        //      this previous item.
282        o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
283    }
284    if (actual < expected) {
285        test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
286    } else {
287        test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
288    }
289}
290
291
292void BITestData::clearResults() {
293    fActualBreakPositions.removeAllElements();
294    fActualTags.removeAllElements();
295}
296
297
298//--------------------------------------------------------------------------------------
299//
300//    RBBITest    constructor and destructor
301//
302//--------------------------------------------------------------------------------------
303
304RBBITest::RBBITest() {
305}
306
307
308RBBITest::~RBBITest() {
309}
310
311//-----------------------------------------------------------------------------------
312//
313//   Test for status {tag} return value from break rules.
314//        TODO:  a more thorough test.
315//
316//-----------------------------------------------------------------------------------
317void RBBITest::TestStatusReturn() {
318     UnicodeString rulesString1("$Letters = [:L:];\n"
319                                  "$Numbers = [:N:];\n"
320                                  "$Letters+{1};\n"
321                                  "$Numbers+{2};\n"
322                                  "Help\\ {4}/me\\!;\n"
323                                  "[^$Letters $Numbers];\n"
324                                  "!.*;\n", -1, US_INV);
325     UnicodeString testString1  = "abc123..abc Help me Help me!";
326                                // 01234567890123456789012345678
327     int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
328     int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
329
330     UErrorCode status=U_ZERO_ERROR;
331     UParseError    parseError;
332
333     BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
334     if(U_FAILURE(status)) {
335         dataerrln("FAIL : in construction - %s", u_errorName(status));
336     } else {
337         int32_t  pos;
338         int32_t  i = 0;
339         bi->setText(testString1);
340         for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
341             if (pos != bounds1[i]) {
342                 errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
343                 break;
344             }
345
346             int tag = bi->getRuleStatus();
347             if (tag != brkStatus[i]) {
348                 errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
349                 break;
350             }
351             i++;
352         }
353     }
354     delete bi;
355}
356
357
358static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
359    UErrorCode status = U_ZERO_ERROR;
360    char name[100];
361    printf("code    alpha extend alphanum type word sent line name\n");
362    int nextExpectedIndex = 0;
363    utext_setNativeIndex(tstr, 0);
364    for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
365        if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
366            printf("------------------------------------------------ %d\n", j);
367            ++nextExpectedIndex;
368        }
369
370        UChar32 c = utext_next32(tstr);
371        u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
372        printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
373                           u_isUAlphabetic(c),
374                           u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
375                           u_isalnum(c),
376                           u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
377                                                  u_charType(c),
378                                                  U_SHORT_PROPERTY_NAME),
379                           u_getPropertyValueName(UCHAR_WORD_BREAK,
380                                                  u_getIntPropertyValue(c,
381                                                          UCHAR_WORD_BREAK),
382                                                  U_SHORT_PROPERTY_NAME),
383                           u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
384                                   u_getIntPropertyValue(c,
385                                           UCHAR_SENTENCE_BREAK),
386                                   U_SHORT_PROPERTY_NAME),
387                           u_getPropertyValueName(UCHAR_LINE_BREAK,
388                                   u_getIntPropertyValue(c,
389                                           UCHAR_LINE_BREAK),
390                                   U_SHORT_PROPERTY_NAME),
391                           name);
392    }
393}
394
395
396static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
397   UErrorCode status = U_ZERO_ERROR;
398   UText *tstr = NULL;
399   tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
400   if (U_FAILURE(status)) {
401       printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
402       return;
403    }
404   printStringBreaks(tstr, expected, expectedCount);
405   utext_close(tstr);
406}
407
408
409void RBBITest::TestBug3818() {
410    UErrorCode  status = U_ZERO_ERROR;
411
412    // Four Thai words...
413    static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
414                                           0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
415    UnicodeString  thaiStr(thaiWordData);
416
417    BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
418    if (U_FAILURE(status) || bi == NULL) {
419        errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
420        return;
421    }
422    bi->setText(thaiStr);
423
424    int32_t  startOfSecondWord = bi->following(1);
425    if (startOfSecondWord != 4) {
426        errln("Fail at file %s, line %d expected start of word at 4, got %d",
427            __FILE__, __LINE__, startOfSecondWord);
428    }
429    startOfSecondWord = bi->following(0);
430    if (startOfSecondWord != 4) {
431        errln("Fail at file %s, line %d expected start of word at 4, got %d",
432            __FILE__, __LINE__, startOfSecondWord);
433    }
434    delete bi;
435}
436
437//----------------------------------------------------------------------------
438//
439// generalIteratorTest      Given a break iterator and a set of test data,
440//                          Run the tests and report the results.
441//
442//----------------------------------------------------------------------------
443void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
444{
445
446    bi.setText(td.fDataToBreak);
447
448    testFirstAndNext(bi, td);
449
450    testLastAndPrevious(bi, td);
451
452    testFollowing(bi, td);
453    testPreceding(bi, td);
454    testIsBoundary(bi, td);
455    doMultipleSelectionTest(bi, td);
456}
457
458
459//
460//   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
461//                       kind of loop.
462//
463void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
464{
465    UErrorCode  status = U_ZERO_ERROR;
466    int32_t     p;
467    int32_t     lastP = -1;
468    int32_t     tag;
469
470    logln("Test first and next");
471    bi.setText(td.fDataToBreak);
472    td.clearResults();
473
474    for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
475        td.fActualBreakPositions.addElement(p, status);  // Save result.
476        tag = bi.getRuleStatus();
477        td.fActualTags.addElement(tag, status);
478        if (p <= lastP) {
479            // If the iterator is not making forward progress, stop.
480            //  No need to raise an error here, it'll be detected in the normal check of results.
481            break;
482        }
483        lastP = p;
484    }
485    td.checkResults("testFirstAndNext", this);
486}
487
488
489//
490//  TestLastAndPrevious.   Run the iterator backwards, starting with last().
491//
492void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
493{
494    UErrorCode  status = U_ZERO_ERROR;
495    int32_t     p;
496    int32_t     lastP  = 0x7ffffffe;
497    int32_t     tag;
498
499    logln("Test last and previous");
500    bi.setText(td.fDataToBreak);
501    td.clearResults();
502
503    for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
504        // Save break position.  Insert it at start of vector of results, shoving
505        //    already-saved results further towards the end.
506        td.fActualBreakPositions.insertElementAt(p, 0, status);
507        // bi.previous();   // TODO:  Why does this fix things up????
508        // bi.next();
509        tag = bi.getRuleStatus();
510        td.fActualTags.insertElementAt(tag, 0, status);
511        if (p >= lastP) {
512            // If the iterator is not making progress, stop.
513            //  No need to raise an error here, it'll be detected in the normal check of results.
514            break;
515        }
516        lastP = p;
517    }
518    td.checkResults("testLastAndPrevious", this);
519}
520
521
522void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
523{
524    UErrorCode  status = U_ZERO_ERROR;
525    int32_t     p;
526    int32_t     tag;
527    int32_t     lastP  = -2;     // A value that will never be returned as a break position.
528                                 //   cannot be -1; that is returned for DONE.
529    int         i;
530
531    logln("testFollowing():");
532    bi.setText(td.fDataToBreak);
533    td.clearResults();
534
535    // Save the starting point, since we won't get that out of following.
536    p = bi.first();
537    td.fActualBreakPositions.addElement(p, status);  // Save result.
538    tag = bi.getRuleStatus();
539    td.fActualTags.addElement(tag, status);
540
541    for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
542        p = bi.following(i);
543        if (p != lastP) {
544            if (p == RuleBasedBreakIterator::DONE) {
545                break;
546            }
547            // We've reached a new break position.  Save it.
548            td.fActualBreakPositions.addElement(p, status);  // Save result.
549            tag = bi.getRuleStatus();
550            td.fActualTags.addElement(tag, status);
551            lastP = p;
552        }
553    }
554    // The loop normally exits by means of the break in the middle.
555    // Make sure that the index was at the correct position for the break iterator to have
556    //   returned DONE.
557    if (i != td.fDataToBreak.length()) {
558        errln("testFollowing():  iterator returned DONE prematurely.");
559    }
560
561    // Full check of all results.
562    td.checkResults("testFollowing", this);
563}
564
565
566
567void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
568    UErrorCode  status = U_ZERO_ERROR;
569    int32_t     p;
570    int32_t     tag;
571    int32_t     lastP  = 0x7ffffffe;
572    int         i;
573
574    logln("testPreceding():");
575    bi.setText(td.fDataToBreak);
576    td.clearResults();
577
578    p = bi.last();
579    td.fActualBreakPositions.addElement(p, status);
580    tag = bi.getRuleStatus();
581    td.fActualTags.addElement(tag, status);
582
583    for (i = td.fDataToBreak.length(); i>=-1; i--) {
584        p = bi.preceding(i);
585        if (p != lastP) {
586            if (p == RuleBasedBreakIterator::DONE) {
587                break;
588            }
589            // We've reached a new break position.  Save it.
590            td.fActualBreakPositions.insertElementAt(p, 0, status);
591            lastP = p;
592            tag = bi.getRuleStatus();
593            td.fActualTags.insertElementAt(tag, 0, status);
594        }
595    }
596    // The loop normally exits by means of the break in the middle.
597    // Make sure that the index was at the correct position for the break iterator to have
598    //   returned DONE.
599    if (i != 0) {
600        errln("testPreceding():  iterator returned DONE prematurely.");
601    }
602
603    // Full check of all results.
604    td.checkResults("testPreceding", this);
605}
606
607
608
609void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
610    UErrorCode  status = U_ZERO_ERROR;
611    int         i;
612    int32_t     tag;
613
614    logln("testIsBoundary():");
615    bi.setText(td.fDataToBreak);
616    td.clearResults();
617
618    for (i = 0; i <= td.fDataToBreak.length(); i++) {
619        if (bi.isBoundary(i)) {
620            td.fActualBreakPositions.addElement(i, status);  // Save result.
621            tag = bi.getRuleStatus();
622            td.fActualTags.addElement(tag, status);
623        }
624    }
625    td.checkResults("testIsBoundary: ", this);
626}
627
628
629
630void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
631{
632    iterator.setText(td.fDataToBreak);
633
634    RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
635    int32_t offset = iterator.first();
636    int32_t testOffset;
637    int32_t count = 0;
638
639    logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
640
641    if (*testIterator != iterator)
642        errln("clone() or operator!= failed: two clones compared unequal");
643
644    do {
645        testOffset = testIterator->first();
646        testOffset = testIterator->next(count);
647        if (offset != testOffset)
648            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
649
650        if (offset != RuleBasedBreakIterator::DONE) {
651            count++;
652            offset = iterator.next();
653
654            if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
655                errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
656                if (count > 10000 || offset == -1) {
657                    errln("operator== failed too many times. Stopping test.");
658                    if (offset == -1) {
659                        errln("Does (RuleBasedBreakIterator::DONE == -1)?");
660                    }
661                    return;
662                }
663            }
664        }
665    } while (offset != RuleBasedBreakIterator::DONE);
666
667    // now do it backwards...
668    offset = iterator.last();
669    count = 0;
670
671    do {
672        testOffset = testIterator->last();
673        testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
674        if (offset != testOffset)
675            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
676
677        if (offset != RuleBasedBreakIterator::DONE) {
678            count--;
679            offset = iterator.previous();
680        }
681    } while (offset != RuleBasedBreakIterator::DONE);
682
683    delete testIterator;
684}
685
686
687//---------------------------------------------
688//
689//     other tests
690//
691//---------------------------------------------
692void RBBITest::TestEmptyString()
693{
694    UnicodeString text = "";
695    UErrorCode status = U_ZERO_ERROR;
696
697    BITestData x(status);
698    ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
699    RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
700    if (U_FAILURE(status))
701    {
702        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
703        return;
704    }
705    generalIteratorTest(*bi, x);
706    delete bi;
707}
708
709void RBBITest::TestGetAvailableLocales()
710{
711    int32_t locCount = 0;
712    const Locale* locList = BreakIterator::getAvailableLocales(locCount);
713
714    if (locCount == 0)
715        dataerrln("getAvailableLocales() returned an empty list!");
716    // Just make sure that it's returning good memory.
717    int32_t i;
718    for (i = 0; i < locCount; ++i) {
719        logln(locList[i].getName());
720    }
721}
722
723//Testing the BreakIterator::getDisplayName() function
724void RBBITest::TestGetDisplayName()
725{
726    UnicodeString   result;
727
728    BreakIterator::getDisplayName(Locale::getUS(), result);
729    if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
730        dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
731                + result);
732
733    BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
734    if (result != "French (France)")
735        dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
736                + result);
737}
738/**
739 * Test End Behaviour
740 * @bug 4068137
741 */
742void RBBITest::TestEndBehaviour()
743{
744    UErrorCode status = U_ZERO_ERROR;
745    UnicodeString testString("boo.");
746    BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
747    if (U_FAILURE(status))
748    {
749        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
750        return;
751    }
752    wb->setText(testString);
753
754    if (wb->first() != 0)
755        errln("Didn't get break at beginning of string.");
756    if (wb->next() != 3)
757        errln("Didn't get break before period in \"boo.\"");
758    if (wb->current() != 4 && wb->next() != 4)
759        errln("Didn't get break at end of string.");
760    delete wb;
761}
762/*
763 * @bug 4153072
764 */
765void RBBITest::TestBug4153072() {
766    UErrorCode status = U_ZERO_ERROR;
767    BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
768    if (U_FAILURE(status))
769    {
770        errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
771        return;
772    }
773    UnicodeString str("...Hello, World!...");
774    int32_t begin = 3;
775    int32_t end = str.length() - 3;
776    UBool onBoundary;
777
778    StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
779    iter->adoptText(textIterator);
780    int index;
781    // Note: with the switch to UText, there is no way to restrict the
782    //       iteration range to begin at an index other than zero.
783    //       String character iterators created with a non-zero bound are
784    //         treated by RBBI as being empty.
785    for (index = -1; index < begin + 1; ++index) {
786        onBoundary = iter->isBoundary(index);
787        if (index == 0?  !onBoundary : onBoundary) {
788            errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
789                            " and begin index = " + begin);
790        }
791    }
792    delete iter;
793}
794
795
796//
797// Test for problem reported by Ashok Matoria on 9 July 2007
798//    One.<kSoftHyphen><kSpace>Two.
799//
800//    Sentence break at start (0) and then on calling next() it breaks at
801//   'T' of "Two". Now, at this point if I do next() and
802//    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
803//
804void RBBITest::TestBug5775() {
805    UErrorCode status = U_ZERO_ERROR;
806    BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
807    TEST_ASSERT_SUCCESS(status);
808    if (U_FAILURE(status)) {
809        return;
810    }
811// Check for status first for better handling of no data errors.
812    TEST_ASSERT(bi != NULL);
813    if (bi == NULL) {
814        return;
815    }
816
817    UnicodeString s("One.\\u00ad Two.", -1, US_INV);
818    //               01234      56789
819    s = s.unescape();
820    bi->setText(s);
821    int pos = bi->next();
822    TEST_ASSERT(pos == 6);
823    pos = bi->next();
824    TEST_ASSERT(pos == 10);
825    pos = bi->previous();
826    TEST_ASSERT(pos == 6);
827    delete bi;
828}
829
830
831
832//------------------------------------------------------------------------------
833//
834//   RBBITest::Extended    Run  RBBI Tests from an external test data file
835//
836//------------------------------------------------------------------------------
837
838struct TestParams {
839    BreakIterator   *bi;                   // Break iterator is set while parsing test source.
840                                           //   Changed out whenever test data changes break type.
841
842    UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
843    UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
844    UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
845    UVector32       *srcCol;
846
847    UText           *textToBreak;          // UText, could be UTF8 or UTF16.
848    UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
849    CharString       utf8String;           // UTF-8 form of text to break.
850
851    TestParams(UErrorCode &status) : dataToBreak() {
852        bi               = NULL;
853        expectedBreaks   = new UVector32(status);
854        srcLine          = new UVector32(status);
855        srcCol           = new UVector32(status);
856        textToBreak      = NULL;
857        textMap          = new UVector32(status);
858    }
859
860    ~TestParams() {
861        delete bi;
862        delete expectedBreaks;
863        delete srcLine;
864        delete srcCol;
865        utext_close(textToBreak);
866        delete textMap;
867    }
868
869    int32_t getSrcLine(int32_t bp);
870    int32_t getExpectedBreak(int32_t bp);
871    int32_t getSrcCol(int32_t bp);
872
873    void setUTF16(UErrorCode &status);
874    void setUTF8(UErrorCode &status);
875};
876
877// Append a UnicodeString to a CharString with UTF-8 encoding.
878// Substitute any invalid chars.
879//   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
880static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
881    if (U_FAILURE(status)) {
882        return;
883    }
884    int32_t utf8Length;
885    u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
886                       src.getBuffer(), src.length(),   // UTF-16 data
887                       0xfffd, NULL,                    // Substitution char, number of subs.
888                       &status);
889    if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
890        return;
891    }
892    status = U_ZERO_ERROR;
893    int32_t capacity;
894    char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
895    u_strToUTF8WithSub(buffer, utf8Length, NULL,
896                       src.getBuffer(), src.length(),
897                       0xfffd, NULL, &status);
898    dest.append(buffer, utf8Length, status);
899}
900
901
902void TestParams::setUTF16(UErrorCode &status) {
903    textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
904    textMap->removeAllElements();
905    for (int32_t i=0; i<dataToBreak.length(); i++) {
906        if (i == dataToBreak.getChar32Start(i)) {
907            textMap->addElement(i, status);
908        } else {
909            textMap->addElement(-1, status);
910        }
911    }
912    textMap->addElement(dataToBreak.length(), status);
913    U_ASSERT(dataToBreak.length() + 1 == textMap->size());
914}
915
916
917void TestParams::setUTF8(UErrorCode &status) {
918    if (U_FAILURE(status)) {
919        return;
920    }
921    utf8String.clear();
922    CharStringAppend(utf8String, dataToBreak, status);
923    textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
924    if (U_FAILURE(status)) {
925        return;
926    }
927
928    textMap->removeAllElements();
929    int32_t utf16Index = 0;
930    for (;;) {
931        textMap->addElement(utf16Index, status);
932        UChar32 c32 = utext_current32(textToBreak);
933        if (c32 < 0) {
934            break;
935        }
936        utf16Index += U16_LENGTH(c32);
937        utext_next32(textToBreak);
938        while (textMap->size() < utext_getNativeIndex(textToBreak)) {
939            textMap->addElement(-1, status);
940        }
941    }
942    U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
943}
944
945
946int32_t TestParams::getSrcLine(int bp) {
947    if (bp >= textMap->size()) {
948        bp = textMap->size() - 1;
949    }
950    int32_t i = 0;
951    for(; bp >= 0 ; --bp) {
952        // Move to a character boundary if we are not on one already.
953        i = textMap->elementAti(bp);
954        if (i >= 0) {
955            break;
956        }
957    }
958    return srcLine->elementAti(i);
959}
960
961
962int32_t TestParams::getExpectedBreak(int bp) {
963    if (bp >= textMap->size()) {
964        return 0;
965    }
966    int32_t i = textMap->elementAti(bp);
967    int32_t retVal = 0;
968    if (i >= 0) {
969        retVal = expectedBreaks->elementAti(i);
970    }
971    return retVal;
972}
973
974
975int32_t TestParams::getSrcCol(int bp) {
976    if (bp >= textMap->size()) {
977        bp = textMap->size() - 1;
978    }
979    int32_t i = 0;
980    for(; bp >= 0; --bp) {
981        // Move bp to a character boundary if we are not on one already.
982        i = textMap->elementAti(bp);
983        if (i >= 0) {
984            break;
985        }
986    }
987    return srcCol->elementAti(i);
988}
989
990
991void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
992    int32_t    bp;
993    int32_t    prevBP;
994    int32_t    i;
995
996    TEST_ASSERT_SUCCESS(status);
997    if (U_FAILURE(status)) {
998        return;
999    }
1000
1001    if (t->bi == NULL) {
1002        return;
1003    }
1004
1005    t->bi->setText(t->textToBreak, status);
1006    //
1007    //  Run the iterator forward
1008    //
1009    prevBP = -1;
1010    for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1011        if (prevBP ==  bp) {
1012            // Fail for lack of forward progress.
1013            errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
1014                bp, t->getSrcLine(bp), t->getSrcCol(bp));
1015            break;
1016        }
1017
1018        // Check that there we didn't miss an expected break between the last one
1019        //  and this one.
1020        for (i=prevBP+1; i<bp; i++) {
1021            if (t->getExpectedBreak(i) != 0) {
1022                int expected[] = {0, i};
1023                printStringBreaks(t->dataToBreak, expected, 2);
1024                errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1025                      i, t->getSrcLine(i), t->getSrcCol(i));
1026            }
1027        }
1028
1029        // Check that the break we did find was expected
1030        if (t->getExpectedBreak(bp) == 0) {
1031            int expected[] = {0, bp};
1032            printStringBreaks(t->textToBreak, expected, 2);
1033            errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1034                bp, t->getSrcLine(bp), t->getSrcCol(bp));
1035        } else {
1036            // The break was expected.
1037            //   Check that the {nnn} tag value is correct.
1038            int32_t expectedTagVal = t->getExpectedBreak(bp);
1039            if (expectedTagVal == -1) {
1040                expectedTagVal = 0;
1041            }
1042            int32_t line = t->getSrcLine(bp);
1043            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1044            if (rs != expectedTagVal) {
1045                errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1046                      "          Actual, Expected status = %4d, %4d",
1047                    bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1048            }
1049        }
1050
1051        prevBP = bp;
1052    }
1053
1054    // Verify that there were no missed expected breaks after the last one found
1055    for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1056        if (t->getExpectedBreak(i) != 0) {
1057            errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1058                      i, t->getSrcLine(i), t->getSrcCol(i));
1059        }
1060    }
1061
1062    //
1063    //  Run the iterator backwards, verify that the same breaks are found.
1064    //
1065    prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
1066    for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1067        if (prevBP ==  bp) {
1068            // Fail for lack of progress.
1069            errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1070                bp, t->getSrcLine(bp), t->getSrcCol(bp));
1071            break;
1072        }
1073
1074        // Check that we didn't miss an expected break between the last one
1075        //  and this one.  (UVector returns zeros for index out of bounds.)
1076        for (i=prevBP-1; i>bp; i--) {
1077            if (t->getExpectedBreak(i) != 0) {
1078                errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1079                      i, t->getSrcLine(i), t->getSrcCol(i));
1080            }
1081        }
1082
1083        // Check that the break we did find was expected
1084        if (t->getExpectedBreak(bp) == 0) {
1085            errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1086                   bp, t->getSrcLine(bp), t->getSrcCol(bp));
1087        } else {
1088            // The break was expected.
1089            //   Check that the {nnn} tag value is correct.
1090            int32_t expectedTagVal = t->getExpectedBreak(bp);
1091            if (expectedTagVal == -1) {
1092                expectedTagVal = 0;
1093            }
1094            int line = t->getSrcLine(bp);
1095            int32_t rs = t->bi->getRuleStatus();
1096            if (rs != expectedTagVal) {
1097                errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1098                      "          Actual, Expected status = %4d, %4d",
1099                    bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1100            }
1101        }
1102
1103        prevBP = bp;
1104    }
1105
1106    // Verify that there were no missed breaks prior to the last one found
1107    for (i=prevBP-1; i>=0; i--) {
1108        if (t->getExpectedBreak(i) != 0) {
1109            errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1110                      i, t->getSrcLine(i), t->getSrcCol(i));
1111        }
1112    }
1113
1114    // Check isBoundary()
1115    for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1116        UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
1117        UBool boundaryFound    = t->bi->isBoundary(i);
1118        if (boundaryExpected != boundaryFound) {
1119            errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1120                  "        Expected, Actual= %s, %s",
1121                  i, t->getSrcLine(i), t->getSrcCol(i),
1122                  boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1123        }
1124    }
1125
1126    // Check following()
1127    for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1128        int32_t actualBreak = t->bi->following(i);
1129        int32_t expectedBreak = BreakIterator::DONE;
1130        for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1131            if (t->getExpectedBreak(j) != 0) {
1132                expectedBreak = j;
1133                break;
1134            }
1135        }
1136        if (expectedBreak != actualBreak) {
1137            errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1138                  "        Expected, Actual= %d, %d",
1139                  i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1140        }
1141    }
1142
1143    // Check preceding()
1144    for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
1145        int32_t actualBreak = t->bi->preceding(i);
1146        int32_t expectedBreak = BreakIterator::DONE;
1147
1148        // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1149        // preceding(trailing byte) will return the index of some preceding code point,
1150        // not the lead byte of the current code point, even though that has a smaller index.
1151        // Therefore, start looking at the expected break data not at i-1, but at
1152        // the start of code point index - 1.
1153        utext_setNativeIndex(t->textToBreak, i);
1154        int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1155        for (; j >= 0; j--) {
1156            if (t->getExpectedBreak(j) != 0) {
1157                expectedBreak = j;
1158                break;
1159            }
1160        }
1161        if (expectedBreak != actualBreak) {
1162            errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1163                  "        Expected, Actual= %d, %d",
1164                  i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1165        }
1166    }
1167}
1168
1169
1170void RBBITest::TestExtended() {
1171#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1172    UErrorCode      status  = U_ZERO_ERROR;
1173    Locale          locale("");
1174
1175    UnicodeString       rules;
1176    TestParams          tp(status);
1177
1178    RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@=-]*) *>"), 0, status);
1179    if (U_FAILURE(status)) {
1180        dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1181    }
1182
1183
1184    //
1185    //  Open and read the test data file.
1186    //
1187    const char *testDataDirectory = IntlTest::getSourceTestData(status);
1188    char testFileName[1000];
1189    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1190        errln("Can't open test data.  Path too long.");
1191        return;
1192    }
1193    strcpy(testFileName, testDataDirectory);
1194    strcat(testFileName, "rbbitst.txt");
1195
1196    int    len;
1197    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1198    if (U_FAILURE(status)) {
1199        return; /* something went wrong, error already output */
1200    }
1201
1202
1203
1204
1205    //
1206    //  Put the test data into a UnicodeString
1207    //
1208    UnicodeString testString(FALSE, testFile, len);
1209
1210    enum EParseState{
1211        PARSE_COMMENT,
1212        PARSE_TAG,
1213        PARSE_DATA,
1214        PARSE_NUM
1215    }
1216    parseState = PARSE_TAG;
1217
1218    EParseState savedState = PARSE_TAG;
1219
1220    static const UChar CH_LF        = 0x0a;
1221    static const UChar CH_CR        = 0x0d;
1222    static const UChar CH_HASH      = 0x23;
1223    /*static const UChar CH_PERIOD    = 0x2e;*/
1224    static const UChar CH_LT        = 0x3c;
1225    static const UChar CH_GT        = 0x3e;
1226    static const UChar CH_BACKSLASH = 0x5c;
1227    static const UChar CH_BULLET    = 0x2022;
1228
1229    int32_t    lineNum  = 1;
1230    int32_t    colStart = 0;
1231    int32_t    column   = 0;
1232    int32_t    charIdx  = 0;
1233
1234    int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1235
1236    for (charIdx = 0; charIdx < len; ) {
1237        status = U_ZERO_ERROR;
1238        UChar  c = testString.charAt(charIdx);
1239        charIdx++;
1240        if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1241            // treat CRLF as a unit
1242            c = CH_LF;
1243            charIdx++;
1244        }
1245        if (c == CH_LF || c == CH_CR) {
1246            lineNum++;
1247            colStart = charIdx;
1248        }
1249        column = charIdx - colStart + 1;
1250
1251        switch (parseState) {
1252        case PARSE_COMMENT:
1253            if (c == 0x0a || c == 0x0d) {
1254                parseState = savedState;
1255            }
1256            break;
1257
1258        case PARSE_TAG:
1259            {
1260            if (c == CH_HASH) {
1261                parseState = PARSE_COMMENT;
1262                savedState = PARSE_TAG;
1263                break;
1264            }
1265            if (u_isUWhiteSpace(c)) {
1266                break;
1267            }
1268            if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1269                delete tp.bi;
1270                tp.bi = BreakIterator::createWordInstance(locale,  status);
1271                charIdx += 5;
1272                break;
1273            }
1274            if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1275                delete tp.bi;
1276                tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1277                charIdx += 5;
1278                break;
1279            }
1280            if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1281                delete tp.bi;
1282                tp.bi = BreakIterator::createLineInstance(locale,  status);
1283                charIdx += 5;
1284                break;
1285            }
1286            if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1287                delete tp.bi;
1288                tp.bi = NULL;
1289                tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1290                charIdx += 5;
1291                break;
1292            }
1293            if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1294                delete tp.bi;
1295                tp.bi = BreakIterator::createTitleInstance(locale,  status);
1296                charIdx += 6;
1297                break;
1298            }
1299
1300            // <locale  loc_name>
1301            localeMatcher.reset(testString);
1302            if (localeMatcher.lookingAt(charIdx-1, status)) {
1303                UnicodeString localeName = localeMatcher.group(1, status);
1304                char localeName8[100];
1305                localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1306                locale = Locale::createFromName(localeName8);
1307                charIdx += localeMatcher.group(0, status).length() - 1;
1308                TEST_ASSERT_SUCCESS(status);
1309                break;
1310            }
1311            if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1312                parseState = PARSE_DATA;
1313                charIdx += 5;
1314                tp.dataToBreak = "";
1315                tp.expectedBreaks->removeAllElements();
1316                tp.srcCol ->removeAllElements();
1317                tp.srcLine->removeAllElements();
1318                break;
1319            }
1320
1321            errln("line %d: Tag expected in test file.", lineNum);
1322            parseState = PARSE_COMMENT;
1323            savedState = PARSE_DATA;
1324            goto end_test; // Stop the test.
1325            }
1326            break;
1327
1328        case PARSE_DATA:
1329            if (c == CH_BULLET) {
1330                int32_t  breakIdx = tp.dataToBreak.length();
1331                tp.expectedBreaks->setSize(breakIdx+1);
1332                tp.expectedBreaks->setElementAt(-1, breakIdx);
1333                tp.srcLine->setSize(breakIdx+1);
1334                tp.srcLine->setElementAt(lineNum, breakIdx);
1335                tp.srcCol ->setSize(breakIdx+1);
1336                tp.srcCol ->setElementAt(column, breakIdx);
1337                break;
1338            }
1339
1340            if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1341                // Add final entry to mappings from break location to source file position.
1342                //  Need one extra because last break position returned is after the
1343                //    last char in the data, not at the last char.
1344                tp.srcLine->addElement(lineNum, status);
1345                tp.srcCol ->addElement(column, status);
1346
1347                parseState = PARSE_TAG;
1348                charIdx += 6;
1349
1350                // RUN THE TEST!
1351                status = U_ZERO_ERROR;
1352                tp.setUTF16(status);
1353                executeTest(&tp, status);
1354                TEST_ASSERT_SUCCESS(status);
1355
1356                // Run again, this time with UTF-8 text wrapped in a UText.
1357                status = U_ZERO_ERROR;
1358                tp.setUTF8(status);
1359                TEST_ASSERT_SUCCESS(status);
1360                executeTest(&tp, status);
1361                break;
1362            }
1363
1364            if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1365                // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1366                // Get the code point from the name and insert it into the test data.
1367                //   (Damn, no API takes names in Unicode  !!!
1368                //    we've got to take it back to char *)
1369                int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1370                int32_t nameLength = nameEndIdx - (charIdx+2);
1371                char charNameBuf[200];
1372                UChar32 theChar = -1;
1373                if (nameEndIdx != -1) {
1374                    UErrorCode status = U_ZERO_ERROR;
1375                    testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1376                    charNameBuf[sizeof(charNameBuf)-1] = 0;
1377                    theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1378                    if (U_FAILURE(status)) {
1379                        theChar = -1;
1380                    }
1381                }
1382                if (theChar == -1) {
1383                    errln("Error in named character in test file at line %d, col %d",
1384                        lineNum, column);
1385                } else {
1386                    // Named code point was recognized.  Insert it
1387                    //   into the test data.
1388                    tp.dataToBreak.append(theChar);
1389                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
1390                        tp.srcLine->addElement(lineNum, status);
1391                        tp.srcCol ->addElement(column, status);
1392                    }
1393                }
1394                if (nameEndIdx > charIdx) {
1395                    charIdx = nameEndIdx+1;
1396
1397                }
1398                break;
1399            }
1400
1401
1402
1403
1404            if (testString.compare(charIdx-1, 2, "<>") == 0) {
1405                charIdx++;
1406                int32_t  breakIdx = tp.dataToBreak.length();
1407                tp.expectedBreaks->setSize(breakIdx+1);
1408                tp.expectedBreaks->setElementAt(-1, breakIdx);
1409                tp.srcLine->setSize(breakIdx+1);
1410                tp.srcLine->setElementAt(lineNum, breakIdx);
1411                tp.srcCol ->setSize(breakIdx+1);
1412                tp.srcCol ->setElementAt(column, breakIdx);
1413                break;
1414            }
1415
1416            if (c == CH_LT) {
1417                tagValue   = 0;
1418                parseState = PARSE_NUM;
1419                break;
1420            }
1421
1422            if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1423                parseState = PARSE_COMMENT;
1424                savedState = PARSE_DATA;
1425                break;
1426            }
1427
1428            if (c == CH_BACKSLASH) {
1429                // Check for \ at end of line, a line continuation.
1430                //     Advance over (discard) the newline
1431                UChar32 cp = testString.char32At(charIdx);
1432                if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1433                    // We have a CR LF
1434                    //  Need an extra increment of the input ptr to move over both of them
1435                    charIdx++;
1436                }
1437                if (cp == CH_LF || cp == CH_CR) {
1438                    lineNum++;
1439                    colStart = charIdx;
1440                    charIdx++;
1441                    break;
1442                }
1443
1444                // Let unescape handle the back slash.
1445                cp = testString.unescapeAt(charIdx);
1446                if (cp != -1) {
1447                    // Escape sequence was recognized.  Insert the char
1448                    //   into the test data.
1449                    tp.dataToBreak.append(cp);
1450                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
1451                        tp.srcLine->addElement(lineNum, status);
1452                        tp.srcCol ->addElement(column, status);
1453                    }
1454                    break;
1455                }
1456
1457
1458                // Not a recognized backslash escape sequence.
1459                // Take the next char as a literal.
1460                //  TODO:  Should this be an error?
1461                c = testString.charAt(charIdx);
1462                charIdx = testString.moveIndex32(charIdx, 1);
1463            }
1464
1465            // Normal, non-escaped data char.
1466            tp.dataToBreak.append(c);
1467
1468            // Save the mapping from offset in the data to line/column numbers in
1469            //   the original input file.  Will be used for better error messages only.
1470            //   If there's an expected break before this char, the slot in the mapping
1471            //     vector will already be set for this char; don't overwrite it.
1472            if (tp.dataToBreak.length() > tp.srcLine->size()) {
1473                tp.srcLine->addElement(lineNum, status);
1474                tp.srcCol ->addElement(column, status);
1475            }
1476            break;
1477
1478
1479        case PARSE_NUM:
1480            // We are parsing an expected numeric tag value, like <1234>,
1481            //   within a chunk of data.
1482            if (u_isUWhiteSpace(c)) {
1483                break;
1484            }
1485
1486            if (c == CH_GT) {
1487                // Finished the number.  Add the info to the expected break data,
1488                //   and switch parse state back to doing plain data.
1489                parseState = PARSE_DATA;
1490                if (tagValue == 0) {
1491                    tagValue = -1;
1492                }
1493                int32_t  breakIdx = tp.dataToBreak.length();
1494                tp.expectedBreaks->setSize(breakIdx+1);
1495                tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1496                tp.srcLine->setSize(breakIdx+1);
1497                tp.srcLine->setElementAt(lineNum, breakIdx);
1498                tp.srcCol ->setSize(breakIdx+1);
1499                tp.srcCol ->setElementAt(column, breakIdx);
1500                break;
1501            }
1502
1503            if (u_isdigit(c)) {
1504                tagValue = tagValue*10 + u_charDigitValue(c);
1505                break;
1506            }
1507
1508            errln("Syntax Error in test file at line %d, col %d",
1509                lineNum, column);
1510            parseState = PARSE_COMMENT;
1511            goto end_test; // Stop the test
1512            break;
1513        }
1514
1515
1516        if (U_FAILURE(status)) {
1517            dataerrln("ICU Error %s while parsing test file at line %d.",
1518                u_errorName(status), lineNum);
1519            status = U_ZERO_ERROR;
1520            goto end_test; // Stop the test
1521        }
1522
1523    }
1524
1525end_test:
1526    delete [] testFile;
1527#endif
1528}
1529
1530
1531//-------------------------------------------------------------------------------
1532//
//  TestDictRules   create a break iterator from source rules that includes a
//                  dictionary range.   Regression for bug #7130.  Source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
//                  but the dictionary code, without a type, would loop.
1537//
1538//-------------------------------------------------------------------------------
1539void RBBITest::TestDictRules() {
1540    const char *rules =  "$dictionary = [a-z]; \n"
1541                         "!!forward; \n"
1542                         "$dictionary $dictionary; \n"
1543                         "!!reverse; \n"
1544                         "$dictionary $dictionary; \n";
1545    const char *text = "aa";
1546    UErrorCode status = U_ZERO_ERROR;
1547    UParseError parseError;
1548
1549    RuleBasedBreakIterator bi(rules, parseError, status);
1550    if (U_SUCCESS(status)) {
1551        UnicodeString utext = text;
1552        bi.setText(utext);
1553        int32_t position;
1554        int32_t loops;
1555        for (loops = 0; loops<10; loops++) {
1556            position = bi.next();
1557            if (position == RuleBasedBreakIterator::DONE) {
1558                break;
1559            }
1560        }
1561        TEST_ASSERT(loops == 1);
1562    } else {
1563        dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1564    }
1565}
1566
1567
1568
1569//-------------------------------------------------------------------------------
1570//
//    ReadAndConvertFile   Read a text data file, convert it to UChars, and
//    return the data in one big UChar * buffer, which the caller must delete.
//
//    parameters:
//          fileName:   the path of the file to read.  It is passed to fopen() as given,
//                      so the caller supplies any directory part.
1577//          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1578//          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1579//                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1580//                      Pass NULL for the system default encoding.
1581//          status
1582//    returns:
1583//                      The file data, converted to UChar.
1584//                      The caller must delete this when done with
1585//                           delete [] theBuffer;
1586//
1587//    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1588//           Move this function to some common place.
1589//
1590//--------------------------------------------------------------------------------
1591UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1592    UChar       *retPtr  = NULL;
1593    char        *fileBuf = NULL;
1594    UConverter* conv     = NULL;
1595    FILE        *f       = NULL;
1596
1597    ulen = 0;
1598    if (U_FAILURE(status)) {
1599        return retPtr;
1600    }
1601
1602    //
1603    //  Open the file.
1604    //
1605    f = fopen(fileName, "rb");
1606    if (f == 0) {
1607        dataerrln("Error opening test data file %s\n", fileName);
1608        status = U_FILE_ACCESS_ERROR;
1609        return NULL;
1610    }
1611    //
1612    //  Read it in
1613    //
1614    int   fileSize;
1615    int   amt_read;
1616
1617    fseek( f, 0, SEEK_END);
1618    fileSize = ftell(f);
1619    fileBuf = new char[fileSize];
1620    fseek(f, 0, SEEK_SET);
1621    amt_read = fread(fileBuf, 1, fileSize, f);
1622    if (amt_read != fileSize || fileSize <= 0) {
1623        errln("Error reading test data file.");
1624        goto cleanUpAndReturn;
1625    }
1626
1627    //
1628    // Look for a Unicode Signature (BOM) on the data just read
1629    //
1630    int32_t        signatureLength;
1631    const char *   fileBufC;
1632    const char*    bomEncoding;
1633
1634    fileBufC = fileBuf;
1635    bomEncoding = ucnv_detectUnicodeSignature(
1636        fileBuf, fileSize, &signatureLength, &status);
1637    if(bomEncoding!=NULL ){
1638        fileBufC  += signatureLength;
1639        fileSize  -= signatureLength;
1640        encoding = bomEncoding;
1641    }
1642
1643    //
1644    // Open a converter to take the rule file to UTF-16
1645    //
1646    conv = ucnv_open(encoding, &status);
1647    if (U_FAILURE(status)) {
1648        goto cleanUpAndReturn;
1649    }
1650
1651    //
1652    // Convert the rules to UChar.
1653    //  Preflight first to determine required buffer size.
1654    //
1655    ulen = ucnv_toUChars(conv,
1656        NULL,           //  dest,
1657        0,              //  destCapacity,
1658        fileBufC,
1659        fileSize,
1660        &status);
1661    if (status == U_BUFFER_OVERFLOW_ERROR) {
1662        // Buffer Overflow is expected from the preflight operation.
1663        status = U_ZERO_ERROR;
1664
1665        retPtr = new UChar[ulen+1];
1666        ucnv_toUChars(conv,
1667            retPtr,       //  dest,
1668            ulen+1,
1669            fileBufC,
1670            fileSize,
1671            &status);
1672    }
1673
1674cleanUpAndReturn:
1675    fclose(f);
1676    delete []fileBuf;
1677    ucnv_close(conv);
1678    if (U_FAILURE(status)) {
1679        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1680        delete []retPtr;
1681        retPtr = 0;
1682        ulen   = 0;
1683    };
1684    return retPtr;
1685}
1686
1687
1688
1689//--------------------------------------------------------------------------------------------
1690//
1691//   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1692//
1693//-------------------------------------------------------------------------------------------
1694void RBBITest::TestUnicodeFiles() {
1695    RuleBasedBreakIterator  *bi;
1696    UErrorCode               status = U_ZERO_ERROR;
1697
1698    bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1699    TEST_ASSERT_SUCCESS(status);
1700    if (U_SUCCESS(status)) {
1701        runUnicodeTestData("GraphemeBreakTest.txt", bi);
1702    }
1703    delete bi;
1704
1705    bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1706    TEST_ASSERT_SUCCESS(status);
1707    if (U_SUCCESS(status)) {
1708        runUnicodeTestData("WordBreakTest.txt", bi);
1709    }
1710    delete bi;
1711
1712    bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1713    TEST_ASSERT_SUCCESS(status);
1714    if (U_SUCCESS(status)) {
1715        runUnicodeTestData("SentenceBreakTest.txt", bi);
1716    }
1717    delete bi;
1718
1719    bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1720    TEST_ASSERT_SUCCESS(status);
1721    if (U_SUCCESS(status)) {
1722        runUnicodeTestData("LineBreakTest.txt", bi);
1723    }
1724    delete bi;
1725}
1726
1727
1728// Check for test cases from the Unicode test data files that are known to fail
1729// and should be skipped because ICU is not yet able to fully implement the spec.
1730// See ticket #7270.
1731
1732UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1733    static const UChar badTestCases[][4] = {                     // Line Numbers from Unicode 7.0.0 file.
1734        {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000},   // Line 5198
1735        {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000},   // Line 5202
1736        {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000},   // Line 5214
1737        {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000},   // Line 5246
1738        {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000},   // Line 5298
1739        {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000}    // Line 5302
1740    };
1741    if (strcmp(fileName, "LineBreakTest.txt") != 0) {
1742        return FALSE;
1743    }
1744
1745    for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
1746        if (testCase == UnicodeString(badTestCases[i])) {
1747            return logKnownIssue("7270");
1748        }
1749    }
1750    return FALSE;
1751}
1752
1753
1754//--------------------------------------------------------------------------------------------
1755//
1756//   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1757//
1758//-------------------------------------------------------------------------------------------
1759void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1760#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1761    UErrorCode  status = U_ZERO_ERROR;
1762
1763    //
1764    //  Open and read the test data file, put it into a UnicodeString.
1765    //
1766    const char *testDataDirectory = IntlTest::getSourceTestData(status);
1767    char testFileName[1000];
1768    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1769        dataerrln("Can't open test data.  Path too long.");
1770        return;
1771    }
1772    strcpy(testFileName, testDataDirectory);
1773    strcat(testFileName, fileName);
1774
1775    logln("Opening data file %s\n", fileName);
1776
1777    int    len;
1778    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1779    if (status != U_FILE_ACCESS_ERROR) {
1780        TEST_ASSERT_SUCCESS(status);
1781        TEST_ASSERT(testFile != NULL);
1782    }
1783    if (U_FAILURE(status) || testFile == NULL) {
1784        return; /* something went wrong, error already output */
1785    }
1786    UnicodeString testFileAsString(TRUE, testFile, len);
1787
1788    //
1789    //  Parse the test data file using a regular expression.
1790    //  Each kind of token is recognized in its own capture group; what type of item was scanned
1791    //     is identified by which group had a match.
1792    //
1793    //    Caputure Group #                  1          2            3            4           5
1794    //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1795    //
1796    UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1797    RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1798    UnicodeString   testString;
1799    UVector32       breakPositions(status);
1800    int             lineNumber = 1;
1801    TEST_ASSERT_SUCCESS(status);
1802    if (U_FAILURE(status)) {
1803        return;
1804    }
1805
1806    //
1807    //  Scan through each test case, building up the string to be broken in testString,
1808    //   and the positions that should be boundaries in the breakPositions vector.
1809    //
1810    int spin = 0;
1811    while (tokenMatcher.find()) {
1812      	if(tokenMatcher.hitEnd()) {
1813          /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1814             This occurred when the text file was corrupt (wasn't marked as UTF-8)
1815             and caused an infinite loop here on EBCDIC systems!
1816          */
1817          fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1818          //	   return;
1819      	}
1820        if (tokenMatcher.start(1, status) >= 0) {
1821            // Scanned a divide sign, indicating a break position in the test data.
1822            if (testString.length()>0) {
1823                breakPositions.addElement(testString.length(), status);
1824            }
1825        }
1826        else if (tokenMatcher.start(2, status) >= 0) {
1827            // Scanned an 'x', meaning no break at this position in the test data
1828            //   Nothing to be done here.
1829            }
1830        else if (tokenMatcher.start(3, status) >= 0) {
1831            // Scanned Hex digits.  Convert them to binary, append to the character data string.
1832            const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1833            int length = hexNumber.length();
1834            if (length<=8) {
1835                char buf[10];
1836                hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1837                UChar32 c = (UChar32)strtol(buf, NULL, 16);
1838                if (c<=0x10ffff) {
1839                    testString.append(c);
1840                } else {
1841                    errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1842                       fileName, lineNumber);
1843                }
1844            } else {
1845                errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1846                       fileName, lineNumber);
1847             }
1848        }
1849        else if (tokenMatcher.start(4, status) >= 0) {
1850            // Scanned to end of a line, possibly skipping over a comment in the process.
1851            //   If the line from the file contained test data, run the test now.
1852            if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1853                checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1854            }
1855
1856            // Clear out this test case.
1857            //    The string and breakPositions vector will be refilled as the next
1858            //       test case is parsed.
1859            testString.remove();
1860            breakPositions.removeAllElements();
1861            lineNumber++;
1862        } else {
1863            // Scanner catchall.  Something unrecognized appeared on the line.
1864            char token[16];
1865            UnicodeString uToken = tokenMatcher.group(0, status);
1866            uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1867            token[sizeof(token)-1] = 0;
1868            errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1869
1870            // Clean up, in preparation for continuing with the next line.
1871            testString.remove();
1872            breakPositions.removeAllElements();
1873            lineNumber++;
1874        }
1875        TEST_ASSERT_SUCCESS(status);
1876        if (U_FAILURE(status)) {
1877            break;
1878        }
1879    }
1880
1881    delete [] testFile;
1882 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1883}
1884
1885//--------------------------------------------------------------------------------------------
1886//
1887//   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1888//                            test data files.  Do only a simple, forward-only check -
1889//                            this test is mostly to check that ICU and the Unicode
1890//                            data agree with each other.
1891//
1892//--------------------------------------------------------------------------------------------
1893void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1894                         const UnicodeString &testString,   // Text data to be broken
1895                         UVector32 *breakPositions,         // Positions where breaks should be found.
1896                         RuleBasedBreakIterator *bi) {
1897    int32_t pos;                 // Break Position in the test string
1898    int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1899    int32_t expectedPos;         // Expected break position (index into test string)
1900
1901    bi->setText(testString);
1902    pos = bi->first();
1903    pos = bi->next();
1904
1905    while (pos != BreakIterator::DONE) {
1906        if (expectedI >= breakPositions->size()) {
1907            errln("Test file \"%s\", line %d, unexpected break found at position %d",
1908                testFileName, lineNumber, pos);
1909            break;
1910        }
1911        expectedPos = breakPositions->elementAti(expectedI);
1912        if (pos < expectedPos) {
1913            errln("Test file \"%s\", line %d, unexpected break found at position %d",
1914                testFileName, lineNumber, pos);
1915            break;
1916        }
1917        if (pos > expectedPos) {
1918            errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1919                testFileName, lineNumber, expectedPos);
1920            break;
1921        }
1922        pos = bi->next();
1923        expectedI++;
1924    }
1925
1926    if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1927        errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1928            testFileName, lineNumber, breakPositions->elementAti(expectedI));
1929    }
1930}
1931
1932
1933
1934#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1935//---------------------------------------------------------------------------------------
1936//
1937//   classs RBBIMonkeyKind
1938//
1939//      Monkey Test for Break Iteration
1940//      Abstract interface class.   Concrete derived classes independently
1941//      implement the break rules for different iterator types.
1942//
1943//      The Monkey Test itself uses doesn't know which type of break iterator it is
1944//      testing, but works purely in terms of the interface defined here.
1945//
1946//---------------------------------------------------------------------------------------
1947class RBBIMonkeyKind {
1948public:
1949    // Return a UVector of UnicodeSets, representing the character classes used
1950    //   for this type of iterator.
1951    virtual  UVector  *charClasses() = 0;
1952
1953    // Set the test text on which subsequent calls to next() will operate
1954    virtual  void      setText(const UnicodeString &s) = 0;
1955
1956    // Find the next break postion, starting from the prev break position, or from zero.
1957    // Return -1 after reaching end of string.
1958    virtual  int32_t   next(int32_t i) = 0;
1959
1960    virtual ~RBBIMonkeyKind();
1961    UErrorCode       deferredStatus;
1962
1963
1964protected:
1965    RBBIMonkeyKind();
1966
1967private:
1968};
1969
1970RBBIMonkeyKind::RBBIMonkeyKind() {
1971    deferredStatus = U_ZERO_ERROR;
1972}
1973
1974RBBIMonkeyKind::~RBBIMonkeyKind() {
1975}
1976
1977
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand()
//                    Not using library to
//                      1.  Get same results on all platforms.
//                      2.  Get access to current seed, to more easily reproduce failures.
//
//   m_seed   is the generator state.  Assign to it to seed the generator, read it
//            to capture the state needed to reproduce a sequence.
//   m_rand() advances the state by one step and returns a value in 0 .. 32767.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    // Linear congruential step; the arithmetic wraps modulo 2^32.
    m_seed = m_seed * kMultiplier + kIncrement;

    // The result is bits 16 through 30 of the new state.
    return (m_seed >> 16) & 0x7fff;
}
1993
1994
1995//------------------------------------------------------------------------------------------
1996//
1997//   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1998//                             of RBBIMonkeyKind.
1999//
2000//------------------------------------------------------------------------------------------
2001class RBBICharMonkey: public RBBIMonkeyKind {
2002public:
2003    RBBICharMonkey();
2004    virtual          ~RBBICharMonkey();
2005    virtual  UVector *charClasses();
2006    virtual  void     setText(const UnicodeString &s);
2007    virtual  int32_t  next(int32_t i);
2008private:
2009    UVector   *fSets;
2010
2011    UnicodeSet  *fCRLFSet;
2012    UnicodeSet  *fControlSet;
2013    UnicodeSet  *fExtendSet;
2014    UnicodeSet  *fRegionalIndicatorSet;
2015    UnicodeSet  *fPrependSet;
2016    UnicodeSet  *fSpacingSet;
2017    UnicodeSet  *fLSet;
2018    UnicodeSet  *fVSet;
2019    UnicodeSet  *fTSet;
2020    UnicodeSet  *fLVSet;
2021    UnicodeSet  *fLVTSet;
2022    UnicodeSet  *fHangulSet;
2023    UnicodeSet  *fAnySet;
2024
2025    const UnicodeString *fText;
2026};
2027
2028
2029RBBICharMonkey::RBBICharMonkey() {
2030    UErrorCode  status = U_ZERO_ERROR;
2031
2032    fText = NULL;
2033
2034    fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2035    fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
2036    fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
2037    fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
2038    fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2039    fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2040    fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2041    fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2042    fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2043    fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2044    fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2045    fHangulSet  = new UnicodeSet();
2046    fHangulSet->addAll(*fLSet);
2047    fHangulSet->addAll(*fVSet);
2048    fHangulSet->addAll(*fTSet);
2049    fHangulSet->addAll(*fLVSet);
2050    fHangulSet->addAll(*fLVTSet);
2051    fAnySet     = new UnicodeSet(0, 0x10ffff);
2052
2053    fSets       = new UVector(status);
2054    fSets->addElement(fCRLFSet,    status);
2055    fSets->addElement(fControlSet, status);
2056    fSets->addElement(fExtendSet,  status);
2057    fSets->addElement(fRegionalIndicatorSet, status);
2058    if (!fPrependSet->isEmpty()) {
2059        fSets->addElement(fPrependSet, status);
2060    }
2061    fSets->addElement(fSpacingSet, status);
2062    fSets->addElement(fHangulSet,  status);
2063    fSets->addElement(fAnySet,     status);
2064    if (U_FAILURE(status)) {
2065        deferredStatus = status;
2066    }
2067}
2068
2069
2070void RBBICharMonkey::setText(const UnicodeString &s) {
2071    fText = &s;
2072}
2073
2074
2075
2076int32_t RBBICharMonkey::next(int32_t prevPos) {
2077    int    p0, p1, p2, p3;    // Indices of the significant code points around the
2078                              //   break position being tested.  The candidate break
2079                              //   location is before p2.
2080
2081    int     breakPos = -1;
2082
2083    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2084
2085    if (U_FAILURE(deferredStatus)) {
2086        return -1;
2087    }
2088
2089    // Previous break at end of string.  return DONE.
2090    if (prevPos >= fText->length()) {
2091        return -1;
2092    }
2093    p0 = p1 = p2 = p3 = prevPos;
2094    c3 =  fText->char32At(prevPos);
2095    c0 = c1 = c2 = 0;
2096    (void)p0;   // suppress set but not used warning.
2097    (void)c0;
2098
2099    // Loop runs once per "significant" character position in the input text.
2100    for (;;) {
2101        // Move all of the positions forward in the input string.
2102        p0 = p1;  c0 = c1;
2103        p1 = p2;  c1 = c2;
2104        p2 = p3;  c2 = c3;
2105
2106        // Advancd p3 by one codepoint
2107        p3 = fText->moveIndex32(p3, 1);
2108        c3 = fText->char32At(p3);
2109
2110        if (p1 == p2) {
2111            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2112            continue;
2113        }
2114        if (p2 == fText->length()) {
2115            // Reached end of string.  Always a break position.
2116            break;
2117        }
2118
2119        // Rule  GB3   CR x LF
2120        //     No Extend or Format characters may appear between the CR and LF,
2121        //     which requires the additional check for p2 immediately following p1.
2122        //
2123        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2124            continue;
2125        }
2126
2127        // Rule (GB4).   ( Control | CR | LF ) <break>
2128        if (fControlSet->contains(c1) ||
2129            c1 == 0x0D ||
2130            c1 == 0x0A)  {
2131            break;
2132        }
2133
2134        // Rule (GB5)    <break>  ( Control | CR | LF )
2135        //
2136        if (fControlSet->contains(c2) ||
2137            c2 == 0x0D ||
2138            c2 == 0x0A)  {
2139            break;
2140        }
2141
2142
2143        // Rule (GB6)  L x ( L | V | LV | LVT )
2144        if (fLSet->contains(c1) &&
2145               (fLSet->contains(c2)  ||
2146                fVSet->contains(c2)  ||
2147                fLVSet->contains(c2) ||
2148                fLVTSet->contains(c2))) {
2149            continue;
2150        }
2151
2152        // Rule (GB7)    ( LV | V )  x  ( V | T )
2153        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2154            (fVSet->contains(c2) || fTSet->contains(c2)))  {
2155            continue;
2156        }
2157
2158        // Rule (GB8)    ( LVT | T)  x T
2159        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2160            fTSet->contains(c2))  {
2161            continue;
2162        }
2163
2164        // Rule (GB8a)    Regional_Indicator x Regional_Indicator
2165        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2166            continue;
2167        }
2168
2169        // Rule (GB9)    Numeric x ALetter
2170        if (fExtendSet->contains(c2))  {
2171            continue;
2172        }
2173
2174        // Rule (GB9a)   x  SpacingMark
2175        if (fSpacingSet->contains(c2)) {
2176            continue;
2177        }
2178
2179        // Rule (GB9b)   Prepend x
2180        if (fPrependSet->contains(c1)) {
2181            continue;
2182        }
2183
2184        // Rule (GB10)  Any  <break>  Any
2185        break;
2186    }
2187
2188    breakPos = p2;
2189    return breakPos;
2190}
2191
2192
2193
2194UVector  *RBBICharMonkey::charClasses() {
2195    return fSets;
2196}
2197
2198
2199RBBICharMonkey::~RBBICharMonkey() {
2200    delete fSets;
2201    delete fCRLFSet;
2202    delete fControlSet;
2203    delete fExtendSet;
2204    delete fRegionalIndicatorSet;
2205    delete fPrependSet;
2206    delete fSpacingSet;
2207    delete fLSet;
2208    delete fVSet;
2209    delete fTSet;
2210    delete fLVSet;
2211    delete fLVTSet;
2212    delete fHangulSet;
2213    delete fAnySet;
2214}
2215
2216//------------------------------------------------------------------------------------------
2217//
2218//   class RBBIWordMonkey      Word Break specific implementation
2219//                             of RBBIMonkeyKind.
2220//
2221//------------------------------------------------------------------------------------------
2222class RBBIWordMonkey: public RBBIMonkeyKind {
2223public:
2224    RBBIWordMonkey();
2225    virtual          ~RBBIWordMonkey();
2226    virtual  UVector *charClasses();
2227    virtual  void     setText(const UnicodeString &s);
2228    virtual int32_t   next(int32_t i);
2229private:
2230    UVector      *fSets;
2231
2232    UnicodeSet  *fCRSet;
2233    UnicodeSet  *fLFSet;
2234    UnicodeSet  *fNewlineSet;
2235    UnicodeSet  *fRegionalIndicatorSet;
2236    UnicodeSet  *fKatakanaSet;
2237    UnicodeSet  *fHebrew_LetterSet;
2238    UnicodeSet  *fALetterSet;
2239    // TODO(jungshik): Do we still need this change?
2240    // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
2241    UnicodeSet  *fSingle_QuoteSet;
2242    UnicodeSet  *fDouble_QuoteSet;
2243    UnicodeSet  *fMidNumLetSet;
2244    UnicodeSet  *fMidLetterSet;
2245    UnicodeSet  *fMidNumSet;
2246    UnicodeSet  *fNumericSet;
2247    UnicodeSet  *fFormatSet;
2248    UnicodeSet  *fOtherSet;
2249    UnicodeSet  *fExtendSet;
2250    UnicodeSet  *fExtendNumLetSet;
2251    UnicodeSet  *fDictionaryCjkSet;
2252
2253    const UnicodeString  *fText;
2254};
2255
2256
2257RBBIWordMonkey::RBBIWordMonkey()
2258{
2259    UErrorCode  status = U_ZERO_ERROR;
2260
2261    fSets            = new UVector(status);
2262
2263    fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2264    fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2265    fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2266    fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2267    // Exclude Hangul syllables from ALetterSet during testing.
2268    // Leave CJK dictionary characters out from the monkey tests!
2269#if 0
2270    fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
2271                                      "[\\p{Line_Break = Complex_Context}"
2272                                      "-\\p{Grapheme_Cluster_Break = Extend}"
2273                                      "-\\p{Grapheme_Cluster_Break = Control}"
2274                                      "]]",
2275                                      status);
2276#endif
2277    fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2278    fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2279    fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2280    fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2281    fALetterSet->removeAll(*fDictionaryCjkSet);
2282    fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
2283    fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
2284    fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2285    fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2286    fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2287    // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2288    // we should figure out why
2289    fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2290    fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2291    fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2292    fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2293
2294    fOtherSet        = new UnicodeSet();
2295    if(U_FAILURE(status)) {
2296      deferredStatus = status;
2297      return;
2298    }
2299
2300    fOtherSet->complement();
2301    fOtherSet->removeAll(*fCRSet);
2302    fOtherSet->removeAll(*fLFSet);
2303    fOtherSet->removeAll(*fNewlineSet);
2304    fOtherSet->removeAll(*fKatakanaSet);
2305    fOtherSet->removeAll(*fHebrew_LetterSet);
2306    fOtherSet->removeAll(*fALetterSet);
2307    fOtherSet->removeAll(*fSingle_QuoteSet);
2308    fOtherSet->removeAll(*fDouble_QuoteSet);
2309    fOtherSet->removeAll(*fMidLetterSet);
2310    fOtherSet->removeAll(*fMidNumSet);
2311    fOtherSet->removeAll(*fNumericSet);
2312    fOtherSet->removeAll(*fExtendNumLetSet);
2313    fOtherSet->removeAll(*fFormatSet);
2314    fOtherSet->removeAll(*fExtendSet);
2315    fOtherSet->removeAll(*fRegionalIndicatorSet);
2316    // Inhibit dictionary characters from being tested at all.
2317    fOtherSet->removeAll(*fDictionaryCjkSet);
2318    fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2319
2320    fSets->addElement(fCRSet,                status);
2321    fSets->addElement(fLFSet,                status);
2322    fSets->addElement(fNewlineSet,           status);
2323    fSets->addElement(fRegionalIndicatorSet, status);
2324    fSets->addElement(fHebrew_LetterSet,     status);
2325    fSets->addElement(fALetterSet,           status);
2326    fSets->addElement(fSingle_QuoteSet,      status);
2327    fSets->addElement(fDouble_QuoteSet,      status);
2328    //fSets->addElement(fKatakanaSet,          status); //TODO: work out how to test katakana
2329    fSets->addElement(fMidLetterSet,         status);
2330    fSets->addElement(fMidNumLetSet,         status);
2331    fSets->addElement(fMidNumSet,            status);
2332    fSets->addElement(fNumericSet,           status);
2333    fSets->addElement(fFormatSet,            status);
2334    fSets->addElement(fExtendSet,            status);
2335    fSets->addElement(fOtherSet,             status);
2336    fSets->addElement(fExtendNumLetSet,      status);
2337
2338    if (U_FAILURE(status)) {
2339        deferredStatus = status;
2340    }
2341}
2342
2343void RBBIWordMonkey::setText(const UnicodeString &s) {
2344    fText       = &s;
2345}
2346
2347
2348int32_t RBBIWordMonkey::next(int32_t prevPos) {
2349    int    p0, p1, p2, p3;    // Indices of the significant code points around the
2350                              //   break position being tested.  The candidate break
2351                              //   location is before p2.
2352
2353    int     breakPos = -1;
2354
2355    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2356
2357    if (U_FAILURE(deferredStatus)) {
2358        return -1;
2359    }
2360
2361    // Prev break at end of string.  return DONE.
2362    if (prevPos >= fText->length()) {
2363        return -1;
2364    }
2365    p0 = p1 = p2 = p3 = prevPos;
2366    c3 =  fText->char32At(prevPos);
2367    c0 = c1 = c2 = 0;
2368    (void)p0;       // Suppress set but not used warning.
2369
2370    // Loop runs once per "significant" character position in the input text.
2371    for (;;) {
2372        // Move all of the positions forward in the input string.
2373        p0 = p1;  c0 = c1;
2374        p1 = p2;  c1 = c2;
2375        p2 = p3;  c2 = c3;
2376
2377        // Advancd p3 by    X(Extend | Format)*   Rule 4
2378        //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2379        do {
2380            p3 = fText->moveIndex32(p3, 1);
2381            c3 = fText->char32At(p3);
2382            if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2383               break;
2384            };
2385        }
2386        while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2387
2388
2389        if (p1 == p2) {
2390            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2391            continue;
2392        }
2393        if (p2 == fText->length()) {
2394            // Reached end of string.  Always a break position.
2395            break;
2396        }
2397
2398        // Rule  (3)   CR x LF
2399        //     No Extend or Format characters may appear between the CR and LF,
2400        //     which requires the additional check for p2 immediately following p1.
2401        //
2402        if (c1==0x0D && c2==0x0A) {
2403            continue;
2404        }
2405
2406        // Rule (3a)  Break before and after newlines (including CR and LF)
2407        //
2408        if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2409            break;
2410        };
2411        if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2412            break;
2413        };
2414
2415        // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2416        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2417            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2418            continue;
2419        }
2420
2421        // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2422        //
2423        if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2424             (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2425             (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2426            continue;
2427        }
2428
2429        // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
2430        if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2431            (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2432            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2433            continue;
2434        }
2435
2436        // Rule (7a)     Hebrew_Letter x Single_Quote
2437        if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2438            continue;
2439        }
2440
2441        // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
2442        if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2443            continue;
2444        }
2445
2446        // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
2447        if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2448            continue;
2449        }
2450
2451        // Rule (8)    Numeric x Numeric
2452        if (fNumericSet->contains(c1) &&
2453            fNumericSet->contains(c2))  {
2454            continue;
2455        }
2456
2457        // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
2458        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2459            fNumericSet->contains(c2))  {
2460            continue;
2461        }
2462
2463        // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
2464        if (fNumericSet->contains(c1) &&
2465            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2466            continue;
2467        }
2468
2469        // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
2470        if (fNumericSet->contains(c0) &&
2471            (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2472            fNumericSet->contains(c2)) {
2473            continue;
2474        }
2475
2476        // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2477        if (fNumericSet->contains(c1) &&
2478            (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2479            fNumericSet->contains(c3)) {
2480            continue;
2481        }
2482
2483        // Rule (13)  Katakana x Katakana
2484        if (fKatakanaSet->contains(c1) &&
2485            fKatakanaSet->contains(c2))  {
2486            continue;
2487        }
2488
2489        // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2490        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2491             fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2492             fExtendNumLetSet->contains(c2)) {
2493                continue;
2494        }
2495
2496        // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2497        if (fExtendNumLetSet->contains(c1) &&
2498                (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2499                 fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2500            continue;
2501        }
2502
2503        // Rule 13c
2504        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2505            continue;
2506        }
2507
2508        // Rule 14.  Break found here.
2509        break;
2510    }
2511
2512    breakPos = p2;
2513    return breakPos;
2514}
2515
2516
2517UVector  *RBBIWordMonkey::charClasses() {
2518    return fSets;
2519}
2520
2521
2522RBBIWordMonkey::~RBBIWordMonkey() {
2523    delete fSets;
2524    delete fCRSet;
2525    delete fLFSet;
2526    delete fNewlineSet;
2527    delete fKatakanaSet;
2528    delete fHebrew_LetterSet;
2529    delete fALetterSet;
2530    delete fSingle_QuoteSet;
2531    delete fDouble_QuoteSet;
2532    delete fMidNumLetSet;
2533    delete fMidLetterSet;
2534    delete fMidNumSet;
2535    delete fNumericSet;
2536    delete fFormatSet;
2537    delete fExtendSet;
2538    delete fExtendNumLetSet;
2539    delete fRegionalIndicatorSet;
2540    delete fDictionaryCjkSet;
2541    delete fOtherSet;
2542}
2543
2544
2545
2546
2547//------------------------------------------------------------------------------------------
2548//
2549//   class RBBISentMonkey      Sentence Break specific implementation
2550//                             of RBBIMonkeyKind.
2551//
2552//------------------------------------------------------------------------------------------
2553class RBBISentMonkey: public RBBIMonkeyKind {
2554public:
2555    RBBISentMonkey();
2556    virtual          ~RBBISentMonkey();
2557    virtual  UVector *charClasses();
2558    virtual  void     setText(const UnicodeString &s);
2559    virtual int32_t   next(int32_t i);
2560private:
2561    int               moveBack(int posFrom);
2562    int               moveForward(int posFrom);
2563    UChar32           cAt(int pos);
2564
2565    UVector      *fSets;
2566
2567    UnicodeSet  *fSepSet;
2568    UnicodeSet  *fFormatSet;
2569    UnicodeSet  *fSpSet;
2570    UnicodeSet  *fLowerSet;
2571    UnicodeSet  *fUpperSet;
2572    UnicodeSet  *fOLetterSet;
2573    UnicodeSet  *fNumericSet;
2574    UnicodeSet  *fATermSet;
2575    UnicodeSet  *fSContinueSet;
2576    UnicodeSet  *fSTermSet;
2577    UnicodeSet  *fCloseSet;
2578    UnicodeSet  *fOtherSet;
2579    UnicodeSet  *fExtendSet;
2580
2581    const UnicodeString  *fText;
2582
2583};
2584
2585RBBISentMonkey::RBBISentMonkey()
2586{
2587    UErrorCode  status = U_ZERO_ERROR;
2588
2589    fSets            = new UVector(status);
2590
2591    //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2592    //                       set and made into character classes of their own.  For the monkey impl,
2593    //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2594    fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2595    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2596    fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2597    fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2598    fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2599    fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2600    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2601    fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2602    fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2603    fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2604    fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2605    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2606    fOtherSet        = new UnicodeSet();
2607
2608    if(U_FAILURE(status)) {
2609      deferredStatus = status;
2610      return;
2611    }
2612
2613    fOtherSet->complement();
2614    fOtherSet->removeAll(*fSepSet);
2615    fOtherSet->removeAll(*fFormatSet);
2616    fOtherSet->removeAll(*fSpSet);
2617    fOtherSet->removeAll(*fLowerSet);
2618    fOtherSet->removeAll(*fUpperSet);
2619    fOtherSet->removeAll(*fOLetterSet);
2620    fOtherSet->removeAll(*fNumericSet);
2621    fOtherSet->removeAll(*fATermSet);
2622    fOtherSet->removeAll(*fSContinueSet);
2623    fOtherSet->removeAll(*fSTermSet);
2624    fOtherSet->removeAll(*fCloseSet);
2625    fOtherSet->removeAll(*fExtendSet);
2626
2627    fSets->addElement(fSepSet,       status);
2628    fSets->addElement(fFormatSet,    status);
2629    fSets->addElement(fSpSet,        status);
2630    fSets->addElement(fLowerSet,     status);
2631    fSets->addElement(fUpperSet,     status);
2632    fSets->addElement(fOLetterSet,   status);
2633    fSets->addElement(fNumericSet,   status);
2634    fSets->addElement(fATermSet,     status);
2635    fSets->addElement(fSContinueSet, status);
2636    fSets->addElement(fSTermSet,     status);
2637    fSets->addElement(fCloseSet,     status);
2638    fSets->addElement(fOtherSet,     status);
2639    fSets->addElement(fExtendSet,    status);
2640
2641    if (U_FAILURE(status)) {
2642        deferredStatus = status;
2643    }
2644}
2645
2646
2647
2648void RBBISentMonkey::setText(const UnicodeString &s) {
2649    fText       = &s;
2650}
2651
2652UVector  *RBBISentMonkey::charClasses() {
2653    return fSets;
2654}
2655
2656
2657//  moveBack()   Find the "significant" code point preceding the index i.
2658//               Skips over ($Extend | $Format)* .
2659//
2660int RBBISentMonkey::moveBack(int i) {
2661    if (i <= 0) {
2662        return -1;
2663    }
2664    UChar32   c;
2665    int32_t   j = i;
2666    do {
2667        j = fText->moveIndex32(j, -1);
2668        c = fText->char32At(j);
2669    }
2670    while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2671    return j;
2672
2673 }
2674
2675
2676int RBBISentMonkey::moveForward(int i) {
2677    if (i>=fText->length()) {
2678        return fText->length();
2679    }
2680    UChar32   c;
2681    int32_t   j = i;
2682    do {
2683        j = fText->moveIndex32(j, 1);
2684        c = cAt(j);
2685    }
2686    while (fFormatSet->contains(c) || fExtendSet->contains(c));
2687    return j;
2688}
2689
2690UChar32 RBBISentMonkey::cAt(int pos) {
2691    if (pos<0 || pos>=fText->length()) {
2692        return -1;
2693    } else {
2694        return fText->char32At(pos);
2695    }
2696}
2697
2698int32_t RBBISentMonkey::next(int32_t prevPos) {
2699    int    p0, p1, p2, p3;    // Indices of the significant code points around the
2700                              //   break position being tested.  The candidate break
2701                              //   location is before p2.
2702
2703    int     breakPos = -1;
2704
2705    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2706    UChar32 c;
2707
2708    if (U_FAILURE(deferredStatus)) {
2709        return -1;
2710    }
2711
2712    // Prev break at end of string.  return DONE.
2713    if (prevPos >= fText->length()) {
2714        return -1;
2715    }
2716    p0 = p1 = p2 = p3 = prevPos;
2717    c3 =  fText->char32At(prevPos);
2718    c0 = c1 = c2 = 0;
2719    (void)p0;     // Suppress set but not used warning.
2720
2721    // Loop runs once per "significant" character position in the input text.
2722    for (;;) {
2723        // Move all of the positions forward in the input string.
2724        p0 = p1;  c0 = c1;
2725        p1 = p2;  c1 = c2;
2726        p2 = p3;  c2 = c3;
2727
2728        // Advancd p3 by    X(Extend | Format)*   Rule 4
2729        p3 = moveForward(p3);
2730        c3 = cAt(p3);
2731
2732        // Rule (3)  CR x LF
2733        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2734            continue;
2735        }
2736
2737        // Rule (4).   Sep  <break>
2738        if (fSepSet->contains(c1)) {
2739            p2 = p1+1;   // Separators don't combine with Extend or Format.
2740            break;
2741        }
2742
2743        if (p2 >= fText->length()) {
2744            // Reached end of string.  Always a break position.
2745            break;
2746        }
2747
2748        if (p2 == prevPos) {
2749            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2750            continue;
2751        }
2752
2753        // Rule (6).   ATerm x Numeric
2754        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2755            continue;
2756        }
2757
2758        // Rule (7).  Upper ATerm  x  Uppper
2759        if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2760            continue;
2761        }
2762
2763        // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2764        //           Note:  STerm | ATerm are added to the negated part of the expression by a
2765        //                  note to the Unicode 5.0 documents.
2766        int p8 = p1;
2767        while (fSpSet->contains(cAt(p8))) {
2768            p8 = moveBack(p8);
2769        }
2770        while (fCloseSet->contains(cAt(p8))) {
2771            p8 = moveBack(p8);
2772        }
2773        if (fATermSet->contains(cAt(p8))) {
2774            p8=p2;
2775            for (;;) {
2776                c = cAt(p8);
2777                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2778                    fLowerSet->contains(c) || fSepSet->contains(c) ||
2779                    fATermSet->contains(c) || fSTermSet->contains(c))  {
2780                    break;
2781                }
2782                p8 = moveForward(p8);
2783            }
2784            if (fLowerSet->contains(cAt(p8))) {
2785                continue;
2786            }
2787        }
2788
2789        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2790        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2791            p8 = p1;
2792            while (fSpSet->contains(cAt(p8))) {
2793                p8 = moveBack(p8);
2794            }
2795            while (fCloseSet->contains(cAt(p8))) {
2796                p8 = moveBack(p8);
2797            }
2798            c = cAt(p8);
2799            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2800                continue;
2801            }
2802        }
2803
2804        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
2805        int p9 = p1;
2806        while (fCloseSet->contains(cAt(p9))) {
2807            p9 = moveBack(p9);
2808        }
2809        c = cAt(p9);
2810        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2811            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2812                continue;
2813            }
2814        }
2815
2816        // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2817        int p10 = p1;
2818        while (fSpSet->contains(cAt(p10))) {
2819            p10 = moveBack(p10);
2820        }
2821        while (fCloseSet->contains(cAt(p10))) {
2822            p10 = moveBack(p10);
2823        }
2824        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2825            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2826                continue;
2827            }
2828        }
2829
2830        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2831        int p11 = p1;
2832        if (fSepSet->contains(cAt(p11))) {
2833            p11 = moveBack(p11);
2834        }
2835        while (fSpSet->contains(cAt(p11))) {
2836            p11 = moveBack(p11);
2837        }
2838        while (fCloseSet->contains(cAt(p11))) {
2839            p11 = moveBack(p11);
2840        }
2841        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2842            break;
2843        }
2844
2845        //  Rule (12)  Any x Any
2846        continue;
2847    }
2848    breakPos = p2;
2849    return breakPos;
2850}
2851
2852RBBISentMonkey::~RBBISentMonkey() {
2853    delete fSets;
2854    delete fSepSet;
2855    delete fFormatSet;
2856    delete fSpSet;
2857    delete fLowerSet;
2858    delete fUpperSet;
2859    delete fOLetterSet;
2860    delete fNumericSet;
2861    delete fATermSet;
2862    delete fSContinueSet;
2863    delete fSTermSet;
2864    delete fCloseSet;
2865    delete fOtherSet;
2866    delete fExtendSet;
2867}
2868
2869
2870
2871//-------------------------------------------------------------------------------------------
2872//
2873//  RBBILineMonkey
2874//
2875//-------------------------------------------------------------------------------------------
2876
2877class RBBILineMonkey: public RBBIMonkeyKind {
2878public:
2879    RBBILineMonkey();
2880    virtual          ~RBBILineMonkey();
2881    virtual  UVector *charClasses();
2882    virtual  void     setText(const UnicodeString &s);
2883    virtual  int32_t  next(int32_t i);
2884    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2885private:
2886    UVector      *fSets;
2887
2888    UnicodeSet  *fBK;
2889    UnicodeSet  *fCR;
2890    UnicodeSet  *fLF;
2891    UnicodeSet  *fCM;
2892    UnicodeSet  *fNL;
2893    UnicodeSet  *fSG;
2894    UnicodeSet  *fWJ;
2895    UnicodeSet  *fZW;
2896    UnicodeSet  *fGL;
2897    UnicodeSet  *fCB;
2898    UnicodeSet  *fSP;
2899    UnicodeSet  *fB2;
2900    UnicodeSet  *fBA;
2901    UnicodeSet  *fBB;
2902    UnicodeSet  *fHY;
2903    UnicodeSet  *fH2;
2904    UnicodeSet  *fH3;
2905    UnicodeSet  *fCL;
2906    UnicodeSet  *fCP;
2907    UnicodeSet  *fEX;
2908    UnicodeSet  *fIN;
2909    UnicodeSet  *fJL;
2910    UnicodeSet  *fJV;
2911    UnicodeSet  *fJT;
2912    UnicodeSet  *fNS;
2913    UnicodeSet  *fOP;
2914    UnicodeSet  *fQU;
2915    UnicodeSet  *fIS;
2916    UnicodeSet  *fNU;
2917    UnicodeSet  *fPO;
2918    UnicodeSet  *fPR;
2919    UnicodeSet  *fSY;
2920    UnicodeSet  *fAI;
2921    UnicodeSet  *fAL;
2922    UnicodeSet  *fCJ;
2923    UnicodeSet  *fHL;
2924    UnicodeSet  *fID;
2925    UnicodeSet  *fRI;
2926    UnicodeSet  *fSA;
2927    UnicodeSet  *fXX;
2928
2929    BreakIterator        *fCharBI;
2930    const UnicodeString  *fText;
2931    RegexMatcher         *fNumberMatcher;
2932};
2933
2934
2935RBBILineMonkey::RBBILineMonkey()
2936{
2937    UErrorCode  status = U_ZERO_ERROR;
2938
2939    fSets  = new UVector(status);
2940
2941    fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2942    fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2943    fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2944    fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2945    fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2946    fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2947    fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2948    fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2949    fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2950    fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2951    fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2952    fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2953    fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2954    fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2955    fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2956    fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2957    fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2958    fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2959    fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2960    fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2961    fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2962    fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2963    fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2964    fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2965    fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2966    fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2967    fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2968    fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2969    fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2970    fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2971    fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2972    fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2973    fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2974    fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2975    fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2976    fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2977    fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2978    fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2979    fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2980    fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2981
2982    if (U_FAILURE(status)) {
2983        deferredStatus = status;
2984        fCharBI = NULL;
2985        fNumberMatcher = NULL;
2986        return;
2987    }
2988
2989    fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2990    fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2991    fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
2992    fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2993
2994    fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2995
2996    fSets->addElement(fBK, status);
2997    fSets->addElement(fCR, status);
2998    fSets->addElement(fLF, status);
2999    fSets->addElement(fCM, status);
3000    fSets->addElement(fNL, status);
3001    fSets->addElement(fWJ, status);
3002    fSets->addElement(fZW, status);
3003    fSets->addElement(fGL, status);
3004    fSets->addElement(fCB, status);
3005    fSets->addElement(fSP, status);
3006    fSets->addElement(fB2, status);
3007    fSets->addElement(fBA, status);
3008    fSets->addElement(fBB, status);
3009    fSets->addElement(fHY, status);
3010    fSets->addElement(fH2, status);
3011    fSets->addElement(fH3, status);
3012    fSets->addElement(fCL, status);
3013    fSets->addElement(fCP, status);
3014    fSets->addElement(fEX, status);
3015    fSets->addElement(fIN, status);
3016    fSets->addElement(fJL, status);
3017    fSets->addElement(fJT, status);
3018    fSets->addElement(fJV, status);
3019    fSets->addElement(fNS, status);
3020    fSets->addElement(fOP, status);
3021    fSets->addElement(fQU, status);
3022    fSets->addElement(fIS, status);
3023    fSets->addElement(fNU, status);
3024    fSets->addElement(fPO, status);
3025    fSets->addElement(fPR, status);
3026    fSets->addElement(fSY, status);
3027    fSets->addElement(fAI, status);
3028    fSets->addElement(fAL, status);
3029    fSets->addElement(fHL, status);
3030    fSets->addElement(fID, status);
3031    fSets->addElement(fWJ, status);
3032    fSets->addElement(fRI, status);
3033    fSets->addElement(fSA, status);
3034    fSets->addElement(fSG, status);
3035
3036    const char *rules =
3037            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3038            "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3039            "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3040            "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3041            "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3042            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3043
3044    fNumberMatcher = new RegexMatcher(
3045        UnicodeString(rules, -1, US_INV), 0, status);
3046
3047    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3048
3049    if (U_FAILURE(status)) {
3050        deferredStatus = status;
3051    }
3052}
3053
3054
3055void RBBILineMonkey::setText(const UnicodeString &s) {
3056    fText       = &s;
3057    fCharBI->setText(s);
3058    fNumberMatcher->reset(s);
3059}
3060
3061//
3062//  rule9Adjust
3063//     Line Break TR rules 9 and 10 implementation.
3064//     This deals with combining marks and other sequences that
3065//     that must be treated as if they were something other than what they actually are.
3066//
3067//     This is factored out into a separate function because it must be applied twice for
3068//     each potential break, once to the chars before the position being checked, then
3069//     again to the text following the possible break.
3070//
3071void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3072    if (pos == -1) {
3073        // Invalid initial position.  Happens during the warmup iteration of the
3074        //   main loop in next().
3075        return;
3076    }
3077
3078    int32_t  nPos = *nextPos;
3079
3080    // LB 9  Keep combining sequences together.
3081    //  advance over any CM class chars.  Note that Line Break CM is different
3082    //  from the normal Grapheme Extend property.
3083    if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3084          *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3085        for (;;) {
3086            *nextChar = fText->char32At(nPos);
3087            if (!fCM->contains(*nextChar)) {
3088                break;
3089            }
3090            nPos = fText->moveIndex32(nPos, 1);
3091        }
3092    }
3093
3094
3095    // LB 9 Treat X CM* as if it were x.
3096    //       No explicit action required.
3097
3098    // LB 10  Treat any remaining combining mark as AL
3099    if (fCM->contains(*posChar)) {
3100        *posChar = 0x41;   // thisChar = 'A';
3101    }
3102
3103    // Push the updated nextPos and nextChar back to our caller.
3104    // This only makes a difference if posChar got bigger by consuming a
3105    // combining sequence.
3106    *nextPos  = nPos;
3107    *nextChar = fText->char32At(nPos);
3108}
3109
3110
3111
3112int32_t RBBILineMonkey::next(int32_t startPos) {
3113    UErrorCode status = U_ZERO_ERROR;
3114    int32_t    pos;       //  Index of the char following a potential break position
3115    UChar32    thisChar;  //  Character at above position "pos"
3116
3117    int32_t    prevPos;   //  Index of the char preceding a potential break position
3118    UChar32    prevChar;  //  Character at above position.  Note that prevChar
3119                          //   and thisChar may not be adjacent because combining
3120                          //   characters between them will be ignored.
3121
3122    int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
3123    UChar32    prevCharX2;
3124
3125    int32_t    nextPos;   //  Index of the next character following pos.
3126                          //     Usually skips over combining marks.
3127    int32_t    nextCPPos; //  Index of the code point following "pos."
3128                          //     May point to a combining mark.
3129    int32_t    tPos;      //  temp value.
3130    UChar32    c;
3131
3132    if (U_FAILURE(deferredStatus)) {
3133        return -1;
3134    }
3135
3136    if (startPos >= fText->length()) {
3137        return -1;
3138    }
3139
3140
3141    // Initial values for loop.  Loop will run the first time without finding breaks,
3142    //                           while the invalid values shift out and the "this" and
3143    //                           "prev" positions are filled in with good values.
3144    pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
3145    thisChar = prevChar  = prevCharX2 = 0;
3146    nextPos  = nextCPPos = startPos;
3147
3148
3149    // Loop runs once per position in the test text, until a break position
3150    //  is found.
3151    for (;;) {
3152        prevPosX2 = prevPos;
3153        prevCharX2 = prevChar;
3154
3155        prevPos   = pos;
3156        prevChar  = thisChar;
3157
3158        pos       = nextPos;
3159        thisChar  = fText->char32At(pos);
3160
3161        nextCPPos = fText->moveIndex32(pos, 1);
3162        nextPos   = nextCPPos;
3163
3164        // Rule LB2 - Break at end of text.
3165        if (pos >= fText->length()) {
3166            break;
3167        }
3168
3169        // Rule LB 9 - adjust for combining sequences.
3170        //             We do this one out-of-order because the adjustment does not change anything
3171        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3172        //             be applied.
3173        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3174        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3175        c = fText->char32At(nextPos);
3176        rule9Adjust(pos,     &thisChar, &nextPos, &c);
3177
3178        // If the loop is still warming up - if we haven't shifted the initial
3179        //   -1 positions out of prevPos yet - loop back to advance the
3180        //    position in the input without any further looking for breaks.
3181        if (prevPos == -1) {
3182            continue;
3183        }
3184
3185        // LB 4  Always break after hard line breaks,
3186        if (fBK->contains(prevChar)) {
3187            break;
3188        }
3189
3190        // LB 5  Break after CR, LF, NL, but not inside CR LF
3191        if (prevChar == 0x0d && thisChar == 0x0a) {
3192            continue;
3193        }
3194        if (prevChar == 0x0d ||
3195            prevChar == 0x0a ||
3196            prevChar == 0x85)  {
3197            break;
3198        }
3199
3200        // LB 6  Don't break before hard line breaks
3201        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3202            fBK->contains(thisChar)) {
3203                continue;
3204        }
3205
3206
3207        // LB 7  Don't break before spaces or zero-width space.
3208        if (fSP->contains(thisChar)) {
3209            continue;
3210        }
3211
3212        if (fZW->contains(thisChar)) {
3213            continue;
3214        }
3215
3216        // LB 8  Break after zero width space
3217        if (fZW->contains(prevChar)) {
3218            break;
3219        }
3220
3221        // LB 9, 10  Already done, at top of loop.
3222        //
3223
3224
3225        // LB 11  Do not break before or after WORD JOINER and related characters.
3226        //    x  WJ
3227        //    WJ  x
3228        //
3229        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3230            continue;
3231        }
3232
3233        // LB 12
3234        //    GL  x
3235        if (fGL->contains(prevChar)) {
3236            continue;
3237        }
3238
3239        // LB 12a
3240        //    [^SP BA HY] x GL
3241        if (!(fSP->contains(prevChar) ||
3242              fBA->contains(prevChar) ||
3243              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3244            continue;
3245        }
3246
3247
3248
3249        // LB 13  Don't break before closings.
3250        //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3251        //        fall into LB 17 and the more general number regular expression.
3252        //
3253        if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3254            (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3255                                         fEX->contains(thisChar)  ||
3256            (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3257            (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3258            continue;
3259        }
3260
3261        // LB 14 Don't break after OP SP*
3262        //       Scan backwards, checking for this sequence.
3263        //       The OP char could include combining marks, so we actually check for
3264        //           OP CM* SP*
3265        //       Another Twist: The Rule 67 fixes may have changed a SP CM
3266        //       sequence into a ID char, so before scanning back through spaces,
3267        //       verify that prevChar is indeed a space.  The prevChar variable
3268        //       may differ from fText[prevPos]
3269        tPos = prevPos;
3270        if (fSP->contains(prevChar)) {
3271            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3272                tPos=fText->moveIndex32(tPos, -1);
3273            }
3274        }
3275        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3276            tPos=fText->moveIndex32(tPos, -1);
3277        }
3278        if (fOP->contains(fText->char32At(tPos))) {
3279            continue;
3280        }
3281
3282
3283        // LB 15    QU SP* x OP
3284        if (fOP->contains(thisChar)) {
3285            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3286            int tPos = prevPos;
3287            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3288                tPos = fText->moveIndex32(tPos, -1);
3289            }
3290            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3291                tPos = fText->moveIndex32(tPos, -1);
3292            }
3293            if (fQU->contains(fText->char32At(tPos))) {
3294                continue;
3295            }
3296        }
3297
3298
3299
3300        // LB 16   (CL | CP) SP* x NS
3301        //    Scan backwards for SP* CM* (CL | CP)
3302        if (fNS->contains(thisChar)) {
3303            int tPos = prevPos;
3304            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3305                tPos = fText->moveIndex32(tPos, -1);
3306            }
3307            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3308                tPos = fText->moveIndex32(tPos, -1);
3309            }
3310            if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3311                continue;
3312            }
3313        }
3314
3315
3316        // LB 17        B2 SP* x B2
3317        if (fB2->contains(thisChar)) {
3318            //  Scan backwards, checking for the B2 CM* SP* sequence.
3319            tPos = prevPos;
3320            if (fSP->contains(prevChar)) {
3321                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3322                    tPos=fText->moveIndex32(tPos, -1);
3323                }
3324            }
3325            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3326                tPos=fText->moveIndex32(tPos, -1);
3327            }
3328            if (fB2->contains(fText->char32At(tPos))) {
3329                continue;
3330            }
3331        }
3332
3333
3334        // LB 18    break after space
3335        if (fSP->contains(prevChar)) {
3336            break;
3337        }
3338
3339        // LB 19
3340        //    x   QU
3341        //    QU  x
3342        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3343            continue;
3344        }
3345
3346        // LB 20  Break around a CB
3347        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3348            break;
3349        }
3350
3351        // LB 21
3352        if (fBA->contains(thisChar) ||
3353            fHY->contains(thisChar) ||
3354            fNS->contains(thisChar) ||
3355            fBB->contains(prevChar) )   {
3356            continue;
3357        }
3358
3359        // LB 21a
3360        //   HL (HY | BA) x
3361        if (fHL->contains(prevCharX2) &&
3362                (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3363            continue;
3364        }
3365
3366        // LB 21b
3367        //   SY x HL
3368        if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3369            continue;
3370        }
3371
3372        // LB 22
3373        if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3374            (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3375            (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3376            (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3377            (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3378            continue;
3379        }
3380
3381
3382        // LB 23    ID x PO
3383        //          AL x NU
3384        //          HL x NU
3385        //          NU x AL
3386        if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3387            (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3388            (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3389            (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3390            (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
3391            continue;
3392        }
3393
3394        // LB 24  Do not break between prefix and letters or ideographs.
3395        //        PR x ID
3396        //        PR x (AL | HL)
3397        //        PO x (AL | HL)
3398        if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3399            (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3400            (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
3401            continue;
3402        }
3403
3404
3405
3406        // LB 25    Numbers
3407        if (fNumberMatcher->lookingAt(prevPos, status)) {
3408            if (U_FAILURE(status)) {
3409                break;
3410            }
3411            // Matched a number.  But could have been just a single digit, which would
3412            //    not represent a "no break here" between prevChar and thisChar
3413            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3414            if (numEndIdx > pos) {
3415                // Number match includes at least our two chars being checked
3416                if (numEndIdx > nextPos) {
3417                    // Number match includes additional chars.  Update pos and nextPos
3418                    //   so that next loop iteration will continue at the end of the number,
3419                    //   checking for breaks between last char in number & whatever follows.
3420                    pos = nextPos = numEndIdx;
3421                    do {
3422                        pos = fText->moveIndex32(pos, -1);
3423                        thisChar = fText->char32At(pos);
3424                    } while (fCM->contains(thisChar));
3425                }
3426                continue;
3427            }
3428        }
3429
3430
3431        // LB 26 Do not break a Korean syllable.
3432        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3433                                        fJV->contains(thisChar) ||
3434                                        fH2->contains(thisChar) ||
3435                                        fH3->contains(thisChar))) {
3436                                            continue;
3437                                        }
3438
3439        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3440            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3441                continue;
3442        }
3443
3444        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3445            fJT->contains(thisChar)) {
3446                continue;
3447        }
3448
3449        // LB 27 Treat a Korean Syllable Block the same as ID.
3450        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3451            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3452            fIN->contains(thisChar)) {
3453                continue;
3454            }
3455        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3456            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3457            fPO->contains(thisChar)) {
3458                continue;
3459            }
3460        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3461            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3462                continue;
3463            }
3464
3465
3466
3467        // LB 28  Do not break between alphabetics ("at").
3468        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3469            continue;
3470        }
3471
3472        // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3473        if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3474            continue;
3475        }
3476
3477        // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3478        //          (AL | NU) x OP
3479        //          CP x (AL | NU)
3480        if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3481            continue;
3482        }
3483        if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3484            continue;
3485        }
3486
3487        // LB30a  Do not break between regional indicators.
3488        //        RI x RI
3489        if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3490            continue;
3491        }
3492
3493        // LB 31    Break everywhere else
3494        break;
3495
3496    }
3497
3498    return pos;
3499}
3500
3501
// Return the fSets vector held by this line monkey.
//   The returned pointer aliases the member; the destructor of this class deletes fSets,
//   so the caller must not delete it.
UVector  *RBBILineMonkey::charClasses() {
    return fSets;
}
3505
3506
// Destructor.  Deletes every heap object referenced by the members of this class:
//   the fSets vector, each of the individual two-letter set members (fBK ... fXX),
//   the fCharBI object and the fNumberMatcher used by rule LB 25 in next().
RBBILineMonkey::~RBBILineMonkey() {
    // NOTE(review): presumably fSets does not own its elements, since the individual
    //               sets are deleted one by one below - confirm against the constructor.
    delete fSets;

    delete fBK;
    delete fCR;
    delete fLF;
    delete fCM;
    delete fNL;
    delete fWJ;
    delete fZW;
    delete fGL;
    delete fCB;
    delete fSP;
    delete fB2;
    delete fBA;
    delete fBB;
    delete fHY;
    delete fH2;
    delete fH3;
    delete fCL;
    delete fCP;
    delete fEX;
    delete fIN;
    delete fJL;
    delete fJV;
    delete fJT;
    delete fNS;
    delete fOP;
    delete fQU;
    delete fIS;
    delete fNU;
    delete fPO;
    delete fPR;
    delete fSY;
    delete fAI;
    delete fAL;
    delete fCJ;
    delete fHL;
    delete fID;
    delete fRI;
    delete fSA;
    delete fSG;
    delete fXX;

    // Helper objects that are not character class sets.
    delete fCharBI;
    delete fNumberMatcher;
}
3554
3555
3556//-------------------------------------------------------------------------------------------
3557//
3558//   TestMonkey
3559//
3560//     params
3561//       seed=nnnnn        Random number starting seed.
3562//                         Setting the seed allows errors to be reproduced.
3563//       loop=nnn          Looping count.  Controls running time.
3564//                         -1:  run forever.
3565//                          0 or greater:  run length.
3566//
3567//       type = char | word | line | sent | title
3568//
3569//-------------------------------------------------------------------------------------------
3570
3571static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3572    int32_t val = defaultVal;
3573    name.append(" *= *(-?\\d+)");
3574    UErrorCode status = U_ZERO_ERROR;
3575    RegexMatcher m(name, params, 0, status);
3576    if (m.find()) {
3577        // The param exists.  Convert the string to an int.
3578        char valString[100];
3579        int32_t paramLength = m.end(1, status) - m.start(1, status);
3580        if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3581            paramLength = (int32_t)(sizeof(valString)-2);
3582        }
3583        params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3584        val = strtol(valString,  NULL, 10);
3585
3586        // Delete this parameter from the params string.
3587        m.reset();
3588        params = m.replaceFirst("", status);
3589    }
3590    U_ASSERT(U_SUCCESS(status));
3591    return val;
3592}
3593#endif
3594
3595#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3596static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3597                                    BreakIterator *bi,
3598                                    int expected[],
3599                                    int expectedcount)
3600{
3601    int count = 0;
3602    int i = 0;
3603    int forward[50];
3604    bi->setText(ustr);
3605    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3606        forward[count] = i;
3607        if (count < expectedcount && expected[count] != i) {
3608            test->errln("break forward test failed: expected %d but got %d",
3609                        expected[count], i);
3610            break;
3611        }
3612        count ++;
3613    }
3614    if (count != expectedcount) {
3615        printStringBreaks(ustr, expected, expectedcount);
3616        test->errln("break forward test failed: missed %d match",
3617                    expectedcount - count);
3618        return;
3619    }
3620    // testing boundaries
3621    for (i = 1; i < expectedcount; i ++) {
3622        int j = expected[i - 1];
3623        if (!bi->isBoundary(j)) {
3624            printStringBreaks(ustr, expected, expectedcount);
3625            test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3626            return;
3627        }
3628        for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3629            if (bi->isBoundary(j)) {
3630                printStringBreaks(ustr, expected, expectedcount);
3631                test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3632                return;
3633            }
3634        }
3635    }
3636
3637    for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3638        count --;
3639        if (forward[count] != i) {
3640            printStringBreaks(ustr, expected, expectedcount);
3641            test->errln("happy break test previous() failed: expected %d but got %d",
3642                        forward[count], i);
3643            break;
3644        }
3645    }
3646    if (count != 0) {
3647        printStringBreaks(ustr, expected, expectedcount);
3648        test->errln("break test previous() failed: missed a match");
3649        return;
3650    }
3651
3652    // testing preceding
3653    for (i = 0; i < expectedcount - 1; i ++) {
3654        // int j = expected[i] + 1;
3655        int j = ustr.moveIndex32(expected[i], 1);
3656        for (; j <= expected[i + 1]; j ++) {
3657            if (bi->preceding(j) != expected[i]) {
3658                printStringBreaks(ustr, expected, expectedcount);
3659                test->errln("preceding(): Not expecting boundary at position %d", j);
3660                return;
3661            }
3662        }
3663    }
3664}
3665#endif
3666
3667void RBBITest::TestWordBreaks(void)
3668{
3669#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3670
3671    Locale        locale("en");
3672    UErrorCode    status = U_ZERO_ERROR;
3673    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3674    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3675    // Replaced any C+J characters in a row with a random sequence of characters
3676    // of the same length to make our C+J segmentation not get in the way.
3677    static const char *strlist[] =
3678    {
3679    "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3680    "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3681    "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3682    "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3683    "\\uac00\\u3588\\u009c\\u0953\\u194b",
3684    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3685    "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3686    "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3687    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3688    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3689    "\\u2027\\U000e0067\\u0a47\\u00b7",
3690    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3691    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3692    "\\u0589\\U000e006e\\u0a42\\U000104a5",
3693    "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3694    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3695    "\\u0027\\u11af\\U000e0057\\u0602",
3696    "\\U0001d7f2\\U000e007\\u0004\\u0589",
3697    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3698    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3699    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3700    "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3701    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3702    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3703    "\\u0233\\U000e0020\\u0a69\\u0d6a",
3704    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3705    "\\u18f4\\U000e0049\\u20e7\\u2027",
3706    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3707    "\\ua183\\u102d\\u0bec\\u003a",
3708    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3709    "\\u003a\\u0e57\\u0fad\\u002e",
3710    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3711    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3712    "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3713    "\\u003a\\u0664\\u00b7\\u1fba",
3714    "\\u003b\\u0027\\u00b7\\u47a3",
3715    "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3716    "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3717    "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3718    };
3719    int loop;
3720    if (U_FAILURE(status)) {
3721        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3722        return;
3723    }
3724    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3725        // printf("looping %d\n", loop);
3726        UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3727        // RBBICharMonkey monkey;
3728        RBBIWordMonkey monkey;
3729
3730        int expected[50];
3731        int expectedcount = 0;
3732
3733        monkey.setText(ustr);
3734        int i;
3735        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3736            expected[expectedcount ++] = i;
3737        }
3738
3739        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3740    }
3741    delete bi;
3742#endif
3743}
3744
3745void RBBITest::TestWordBoundary(void)
3746{
3747    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3748    Locale        locale("en");
3749    UErrorCode    status = U_ZERO_ERROR;
3750    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3751    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3752    UChar         str[50];
3753    static const char *strlist[] =
3754    {
3755    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3756    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3757    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3758    "\\u2027\\U000e0067\\u0a47\\u00b7",
3759    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3760    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3761    "\\u0589\\U000e006e\\u0a42\\U000104a5",
3762    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3763    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3764    "\\u0027\\u11af\\U000e0057\\u0602",
3765    "\\U0001d7f2\\U000e007\\u0004\\u0589",
3766    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3767    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3768    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3769    "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3770    "\\U000e0065\\u302c\\u09ee\\U000e0068",
3771    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3772    "\\u0233\\U000e0020\\u0a69\\u0d6a",
3773    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3774    "\\u58f4\\U000e0049\\u20e7\\u2027",
3775    "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3776    "\\ua183\\u102d\\u0bec\\u003a",
3777    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3778    "\\u003a\\u0e57\\u0fad\\u002e",
3779    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3780    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3781    "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3782    "\\u003a\\u0664\\u00b7\\u1fba",
3783    "\\u003b\\u0027\\u00b7\\u47a3",
3784    };
3785    int loop;
3786    if (U_FAILURE(status)) {
3787        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3788        return;
3789    }
3790    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3791        // printf("looping %d\n", loop);
3792        u_unescape(strlist[loop], str, 20);
3793        UnicodeString ustr(str);
3794        int forward[50];
3795        int count = 0;
3796
3797        bi->setText(ustr);
3798        int prev = 0;
3799        int i;
3800        for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3801            forward[count ++] = i;
3802            if (i > prev) {
3803                int j;
3804                for (j = prev + 1; j < i; j ++) {
3805                    if (bi->isBoundary(j)) {
3806                        printStringBreaks(ustr, forward, count);
3807                        errln("happy boundary test failed: expected %d not a boundary",
3808                               j);
3809                        return;
3810                    }
3811                }
3812            }
3813            if (!bi->isBoundary(i)) {
3814                printStringBreaks(ustr, forward, count);
3815                errln("happy boundary test failed: expected %d a boundary",
3816                       i);
3817                return;
3818            }
3819            prev = i;
3820        }
3821    }
3822    delete bi;
3823}
3824
3825void RBBITest::TestLineBreaks(void)
3826{
3827#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3828    Locale        locale("en");
3829    UErrorCode    status = U_ZERO_ERROR;
3830    BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3831    const int32_t  STRSIZE = 50;
3832    UChar         str[STRSIZE];
3833    static const char *strlist[] =
3834    {
3835     "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3836     "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3837             "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3838     "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3839             "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3840     "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3841     "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3842     "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3843     "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3844     "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3845     "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3846     "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3847     "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3848     "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3849     "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3850     "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3851     "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3852     "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3853     "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3854     "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3855     "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3856     "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3857     "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3858     "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3859     "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3860     "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3861     "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3862     "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3863     "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3864     "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3865     "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3866     "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3867     "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3868     "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3869     "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3870     "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3871     "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3872     "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3873     "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3874     "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3875     "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3876     "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3877         "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3878         "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3879         "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3880     "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3881         "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3882    };
3883    int loop;
3884    TEST_ASSERT_SUCCESS(status);
3885    if (U_FAILURE(status)) {
3886        return;
3887    }
3888    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3889        // printf("looping %d\n", loop);
3890        int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3891        if (t >= STRSIZE) {
3892            TEST_ASSERT(FALSE);
3893            continue;
3894        }
3895
3896
3897        UnicodeString ustr(str);
3898        RBBILineMonkey monkey;
3899        if (U_FAILURE(monkey.deferredStatus)) {
3900            continue;
3901        }
3902
3903        const int EXPECTEDSIZE = 50;
3904        int expected[EXPECTEDSIZE];
3905        int expectedcount = 0;
3906
3907        monkey.setText(ustr);
3908        int i;
3909        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3910            if (expectedcount >= EXPECTEDSIZE) {
3911                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3912                return;
3913            }
3914            expected[expectedcount ++] = i;
3915        }
3916
3917        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3918    }
3919    delete bi;
3920#endif
3921}
3922
3923void RBBITest::TestSentBreaks(void)
3924{
3925#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3926    Locale        locale("en");
3927    UErrorCode    status = U_ZERO_ERROR;
3928    BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3929    UChar         str[200];
3930    static const char *strlist[] =
3931    {
3932     "Now\ris\nthe\r\ntime\n\rfor\r\r",
3933     "This\n",
3934     "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3935     "\"Sentence ending with a quote.\" Bye.",
3936     "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3937     "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3938     "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3939     "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3940     "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3941     "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3942     "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3943             "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3944             "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3945             "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3946     "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3947             "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3948             "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3949             "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3950             "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3951             "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3952    };
3953    int loop;
3954    if (U_FAILURE(status)) {
3955        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3956        return;
3957    }
3958    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3959        u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3960        UnicodeString ustr(str);
3961
3962        RBBISentMonkey monkey;
3963        if (U_FAILURE(monkey.deferredStatus)) {
3964            continue;
3965        }
3966
3967        const int EXPECTEDSIZE = 50;
3968        int expected[EXPECTEDSIZE];
3969        int expectedcount = 0;
3970
3971        monkey.setText(ustr);
3972        int i;
3973        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3974            if (expectedcount >= EXPECTEDSIZE) {
3975                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3976                return;
3977            }
3978            expected[expectedcount ++] = i;
3979        }
3980
3981        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3982    }
3983    delete bi;
3984#endif
3985}
3986
3987void RBBITest::TestMonkey(char *params) {
3988#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3989
3990    UErrorCode     status    = U_ZERO_ERROR;
3991    int32_t        loopCount = 500;
3992    int32_t        seed      = 1;
3993    UnicodeString  breakType = "all";
3994    Locale         locale("en");
3995    UBool          useUText  = FALSE;
3996
3997    if (quick == FALSE) {
3998        loopCount = 10000;
3999    }
4000
4001    if (params) {
4002        UnicodeString p(params);
4003        loopCount = getIntParam("loop", p, loopCount);
4004        seed      = getIntParam("seed", p, seed);
4005
4006        RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4007        if (m.find()) {
4008            breakType = m.group(1, status);
4009            m.reset();
4010            p = m.replaceFirst("", status);
4011        }
4012
4013        RegexMatcher u(" *utext", p, 0, status);
4014        if (u.find()) {
4015            useUText = TRUE;
4016            u.reset();
4017            p = u.replaceFirst("", status);
4018        }
4019
4020
4021        // m.reset(p);
4022        if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4023            // Each option is stripped out of the option string as it is processed.
4024            // All options have been checked.  The option string should have been completely emptied..
4025            char buf[100];
4026            p.extract(buf, sizeof(buf), NULL, status);
4027            buf[sizeof(buf)-1] = 0;
4028            errln("Unrecognized or extra parameter:  %s\n", buf);
4029            return;
4030        }
4031
4032    }
4033
4034    if (breakType == "char" || breakType == "all") {
4035        RBBICharMonkey  m;
4036        BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4037        if (U_SUCCESS(status)) {
4038            RunMonkey(bi, m, "char", seed, loopCount, useUText);
4039            if (breakType == "all" && useUText==FALSE) {
4040                // Also run a quick test with UText when "all" is specified
4041                RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4042            }
4043        }
4044        else {
4045            errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4046        }
4047        delete bi;
4048    }
4049
4050    if (breakType == "word" || breakType == "all") {
4051        logln("Word Break Monkey Test");
4052        RBBIWordMonkey  m;
4053        BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4054        if (U_SUCCESS(status)) {
4055            RunMonkey(bi, m, "word", seed, loopCount, useUText);
4056        }
4057        else {
4058            errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4059        }
4060        delete bi;
4061    }
4062
4063    if (breakType == "line" || breakType == "all") {
4064        logln("Line Break Monkey Test");
4065        RBBILineMonkey  m;
4066        BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4067        if (loopCount >= 10) {
4068            loopCount = loopCount / 5;   // Line break runs slower than the others.
4069        }
4070        if (U_SUCCESS(status)) {
4071            RunMonkey(bi, m, "line", seed, loopCount, useUText);
4072        }
4073        else {
4074            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4075        }
4076        delete bi;
4077    }
4078
4079    if (breakType == "sent" || breakType == "all"  ) {
4080        logln("Sentence Break Monkey Test");
4081        RBBISentMonkey  m;
4082        BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4083        if (loopCount >= 10) {
4084            loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4085        }
4086        if (U_SUCCESS(status)) {
4087            RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4088        }
4089        else {
4090            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4091        }
4092        delete bi;
4093    }
4094
4095#endif
4096}
4097
4098//
4099//  Run a RBBI monkey test.  Common routine, for all break iterator types.
4100//    Parameters:
4101//       bi      - the break iterator to use
4102//       mk      - MonkeyKind, abstraction for obtaining expected results
4103//       name    - Name of test (char, word, etc.) for use in error messages
4104//       seed    - Seed for starting random number generator (parameter from user)
4105//       numIterations
4106//
4107void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4108                         int32_t numIterations, UBool useUText) {
4109
4110#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4111
4112    const int32_t    TESTSTRINGLEN = 500;
4113    UnicodeString    testText;
4114    int32_t          numCharClasses;
4115    UVector          *chClasses;
4116    int              expected[TESTSTRINGLEN*2 + 1];
4117    int              expectedCount = 0;
4118    char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4119    char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4120    char             reverseBreaks[TESTSTRINGLEN*2+1];
4121    char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4122    char             followingBreaks[TESTSTRINGLEN*2+1];
4123    char             precedingBreaks[TESTSTRINGLEN*2+1];
4124    int              i;
4125    int              loopCount = 0;
4126
4127    m_seed = seed;
4128
4129    numCharClasses = mk.charClasses()->size();
4130    chClasses      = mk.charClasses();
4131
4132    // Check for errors that occured during the construction of the MonkeyKind object.
4133    //  Can't report them where they occured because errln() is a method coming from intlTest,
4134    //  and is not visible outside of RBBITest :-(
4135    if (U_FAILURE(mk.deferredStatus)) {
4136        errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4137        return;
4138    }
4139
4140    // Verify that the character classes all have at least one member.
4141    for (i=0; i<numCharClasses; i++) {
4142        UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4143        if (s == NULL || s->size() == 0) {
4144            errln("Character Class #%d is null or of zero size.", i);
4145            return;
4146        }
4147    }
4148
4149    while (loopCount < numIterations || numIterations == -1) {
4150        if (numIterations == -1 && loopCount % 10 == 0) {
4151            // If test is running in an infinite loop, display a periodic tic so
4152            //   we can tell that it is making progress.
4153            fprintf(stderr, ".");
4154        }
4155        // Save current random number seed, so that we can recreate the random numbers
4156        //   for this loop iteration in event of an error.
4157        seed = m_seed;
4158
4159        // Populate a test string with data.
4160        testText.truncate(0);
4161        for (i=0; i<TESTSTRINGLEN; i++) {
4162            int32_t  aClassNum = m_rand() % numCharClasses;
4163            UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4164            int32_t   charIdx = m_rand() % classSet->size();
4165            UChar32   c = classSet->charAt(charIdx);
4166            if (c < 0) {   // TODO:  deal with sets containing strings.
4167                errln("c < 0");
4168                break;
4169            }
4170            testText.append(c);
4171        }
4172
4173        // Calculate the expected results for this test string.
4174        mk.setText(testText);
4175        memset(expectedBreaks, 0, sizeof(expectedBreaks));
4176        expectedBreaks[0] = 1;
4177        int32_t breakPos = 0;
4178        expectedCount = 0;
4179        for (;;) {
4180            breakPos = mk.next(breakPos);
4181            if (breakPos == -1) {
4182                break;
4183            }
4184            if (breakPos > testText.length()) {
4185                errln("breakPos > testText.length()");
4186            }
4187            expectedBreaks[breakPos] = 1;
4188            U_ASSERT(expectedCount<testText.length());
4189            expected[expectedCount ++] = breakPos;
4190            (void)expected;   // Set but not used warning.
4191                              // TODO (andy): check it out.
4192        }
4193
4194        // Find the break positions using forward iteration
4195        memset(forwardBreaks, 0, sizeof(forwardBreaks));
4196        if (useUText) {
4197            UErrorCode status = U_ZERO_ERROR;
4198            UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4199            // testUText = utext_openUnicodeString(testUText, &testText, &status);
4200            bi->setText(testUText, status);
4201            TEST_ASSERT_SUCCESS(status);
4202            utext_close(testUText);   // The break iterator does a shallow clone of the UText
4203                                      //  This UText can be closed immediately, so long as the
4204                                      //  testText string continues to exist.
4205        } else {
4206            bi->setText(testText);
4207        }
4208
4209        for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4210            if (i < 0 || i > testText.length()) {
4211                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4212                break;
4213            }
4214            forwardBreaks[i] = 1;
4215        }
4216
4217        // Find the break positions using reverse iteration
4218        memset(reverseBreaks, 0, sizeof(reverseBreaks));
4219        for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4220            if (i < 0 || i > testText.length()) {
4221                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4222                break;
4223            }
4224            reverseBreaks[i] = 1;
4225        }
4226
4227        // Find the break positions using isBoundary() tests.
4228        memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4229        U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4230        for (i=0; i<=testText.length(); i++) {
4231            isBoundaryBreaks[i] = bi->isBoundary(i);
4232        }
4233
4234
4235        // Find the break positions using the following() function.
4236        // printf(".");
4237        memset(followingBreaks, 0, sizeof(followingBreaks));
4238        int32_t   lastBreakPos = 0;
4239        followingBreaks[0] = 1;
4240        for (i=0; i<testText.length(); i++) {
4241            breakPos = bi->following(i);
4242            if (breakPos <= i ||
4243                breakPos < lastBreakPos ||
4244                breakPos > testText.length() ||
4245                (breakPos > lastBreakPos && lastBreakPos > i)) {
4246                errln("%s break monkey test: "
4247                    "Out of range value returned by BreakIterator::following().\n"
4248                        "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4249                         name, seed, i, breakPos, lastBreakPos);
4250                break;
4251            }
4252            followingBreaks[breakPos] = 1;
4253            lastBreakPos = breakPos;
4254        }
4255
4256        // Find the break positions using the preceding() function.
4257        memset(precedingBreaks, 0, sizeof(precedingBreaks));
4258        lastBreakPos = testText.length();
4259        precedingBreaks[testText.length()] = 1;
4260        for (i=testText.length(); i>0; i--) {
4261            breakPos = bi->preceding(i);
4262            if (breakPos >= i ||
4263                breakPos > lastBreakPos ||
4264                (breakPos < 0 && testText.getChar32Start(i)>0) ||
4265                (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4266                errln("%s break monkey test: "
4267                    "Out of range value returned by BreakIterator::preceding().\n"
4268                    "index=%d;  prev returned %d; lastBreak=%d" ,
4269                    name,  i, breakPos, lastBreakPos);
4270                if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4271                    precedingBreaks[i] = 2;   // Forces an error.
4272                }
4273            } else {
4274                if (breakPos >= 0) {
4275                    precedingBreaks[breakPos] = 1;
4276                }
4277                lastBreakPos = breakPos;
4278            }
4279        }
4280
4281        // Compare the expected and actual results.
4282        for (i=0; i<=testText.length(); i++) {
4283            const char *errorType = NULL;
4284            if  (forwardBreaks[i] != expectedBreaks[i]) {
4285                errorType = "next()";
4286            } else if (reverseBreaks[i] != forwardBreaks[i]) {
4287                errorType = "previous()";
4288            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4289                errorType = "isBoundary()";
4290            } else if (followingBreaks[i] != expectedBreaks[i]) {
4291                errorType = "following()";
4292            } else if (precedingBreaks[i] != expectedBreaks[i]) {
4293                errorType = "preceding()";
4294            }
4295
4296
4297            if (errorType != NULL) {
4298                // Format a range of the test text that includes the failure as
4299                //  a data item that can be included in the rbbi test data file.
4300
4301                // Start of the range is the last point where expected and actual results
4302                //   both agreed that there was a break position.
4303                int startContext = i;
4304                int32_t count = 0;
4305                for (;;) {
4306                    if (startContext==0) { break; }
4307                    startContext --;
4308                    if (expectedBreaks[startContext] != 0) {
4309                        if (count == 2) break;
4310                        count ++;
4311                    }
4312                }
4313
4314                // End of range is two expected breaks past the start position.
4315                int endContext = i + 1;
4316                int ci;
4317                for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4318                    for (;;) {
4319                        if (endContext >= testText.length()) {break;}
4320                        if (expectedBreaks[endContext-1] != 0) {
4321                            if (count == 0) break;
4322                            count --;
4323                        }
4324                        endContext ++;
4325                    }
4326                }
4327
4328                // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4329                UnicodeString errorText = "<data>";
4330                /***if (strcmp(errorType, "next()") == 0) {
4331                    startContext = 0;
4332                    endContext = testText.length();
4333
4334                    printStringBreaks(testText, expected, expectedCount);
4335                }***/
4336
4337                for (ci=startContext; ci<endContext;) {
4338                    UnicodeString hexChars("0123456789abcdef");
4339                    UChar32  c;
4340                    int      bn;
4341                    c = testText.char32At(ci);
4342                    if (ci == i) {
4343                        // This is the location of the error.
4344                        errorText.append("<?>");
4345                    } else if (expectedBreaks[ci] != 0) {
4346                        // This a non-error expected break position.
4347                        errorText.append("\\");
4348                    }
4349                    if (c < 0x10000) {
4350                        errorText.append("\\u");
4351                        for (bn=12; bn>=0; bn-=4) {
4352                            errorText.append(hexChars.charAt((c>>bn)&0xf));
4353                        }
4354                    } else {
4355                        errorText.append("\\U");
4356                        for (bn=28; bn>=0; bn-=4) {
4357                            errorText.append(hexChars.charAt((c>>bn)&0xf));
4358                        }
4359                    }
4360                    ci = testText.moveIndex32(ci, 1);
4361                }
4362                errorText.append("\\");
4363                errorText.append("</data>\n");
4364
4365                // Output the error
4366                char  charErrorTxt[500];
4367                UErrorCode status = U_ZERO_ERROR;
4368                errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4369                charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4370                const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4371
4372                errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4373                    name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4374                    errorType, seed, i, charErrorTxt);
4375                break;
4376            }
4377        }
4378
4379        loopCount++;
4380    }
4381#endif
4382}
4383
4384
4385//  Bug 5532.  UTF-8 based UText fails in dictionary code.
4386//             This test checks the initial patch,
4387//             which is to just keep it from crashing.  Correct word boundaries
4388//             await a proper fix to the dictionary code.
4389//
4390void RBBITest::TestBug5532(void)  {
4391   // Text includes a mixture of Thai and Latin.
4392   const unsigned char utf8Data[] = {
4393           0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4394           0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4395           0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4396           0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4397           0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4398           0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4399           0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4400           0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4401           0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4402           0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4403           0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4404
4405    UErrorCode status = U_ZERO_ERROR;
4406    UText utext=UTEXT_INITIALIZER;
4407    utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4408    TEST_ASSERT_SUCCESS(status);
4409
4410    BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4411    TEST_ASSERT_SUCCESS(status);
4412    if (U_SUCCESS(status)) {
4413        bi->setText(&utext, status);
4414        TEST_ASSERT_SUCCESS(status);
4415
4416        int32_t breakCount = 0;
4417        int32_t previousBreak = -1;
4418        for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4419            // For now, just make sure that the break iterator doesn't hang.
4420            TEST_ASSERT(previousBreak < bi->current());
4421            previousBreak = bi->current();
4422        }
4423        TEST_ASSERT(breakCount > 0);
4424    }
4425    delete bi;
4426    utext_close(&utext);
4427}
4428
4429
4430void RBBITest::TestBug9983(void)  {
4431    UnicodeString text = UnicodeString("\\u002A"  // * Other
4432                                       "\\uFF65"  //   Other
4433                                       "\\u309C"  //   Katakana
4434                                       "\\uFF9F"  //   Extend
4435                                       "\\uFF65"  //   Other
4436                                       "\\u0020"  //   Other
4437                                       "\\u0000").unescape();
4438
4439    UErrorCode status = U_ZERO_ERROR;
4440    LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4441        BreakIterator::createWordInstance(Locale::getRoot(), status)));
4442    TEST_ASSERT_SUCCESS(status);
4443    LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4444        BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4445    TEST_ASSERT_SUCCESS(status);
4446    if (U_FAILURE(status)) {
4447        return;
4448    }
4449    int32_t offset, rstatus, iterationCount;
4450
4451    brkiter->setText(text);
4452    brkiter->last();
4453    iterationCount = 0;
4454    while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4455        iterationCount++;
4456        rstatus = brkiter->getRuleStatus();
4457        (void)rstatus;     // Suppress set but not used warning.
4458        if (iterationCount >= 10) {
4459           break;
4460        }
4461    }
4462    TEST_ASSERT(iterationCount == 6);
4463
4464    brkiterPOSIX->setText(text);
4465    brkiterPOSIX->last();
4466    iterationCount = 0;
4467    while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4468        iterationCount++;
4469        rstatus = brkiterPOSIX->getRuleStatus();
4470        (void)rstatus;     // Suppress set but not used warning.
4471        if (iterationCount >= 10) {
4472           break;
4473        }
4474    }
4475    TEST_ASSERT(iterationCount == 6);
4476}
4477
4478
4479//
4480//  TestDebug    -  A place-holder test for debugging purposes.
4481//                  For putting in fragments of other tests that can be invoked
4482//                  for tracing  without a lot of unwanted extra stuff happening.
4483//
4484void RBBITest::TestDebug(void) {
4485#if 0
4486    UErrorCode   status = U_ZERO_ERROR;
4487    int pos = 0;
4488    int ruleStatus = 0;
4489
4490    RuleBasedBreakIterator* bi =
4491       // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4492       // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4493       (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4494    UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4495    // UnicodeString s("Aaa.  Bcd");
4496    s = s.unescape();
4497    bi->setText(s);
4498    UBool r = bi->isBoundary(8);
4499    printf("%s", r?"true":"false");
4500    return;
4501    pos = bi->last();
4502    do {
4503        // ruleStatus = bi->getRuleStatus();
4504        printf("%d\t%d\n", pos, ruleStatus);
4505        pos = bi->previous();
4506    } while (pos != BreakIterator::DONE);
4507#endif
4508}
4509
4510void RBBITest::TestProperties() {
4511    UErrorCode errorCode = U_ZERO_ERROR;
4512    UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4513    if (!prependSet.isEmpty()) {
4514        errln(
4515            "[:GCB=Prepend:] is not empty any more. "
4516            "Uncomment relevant lines in source/data/brkitr/char.txt and "
4517            "change this test to the opposite condition.");
4518    }
4519}
4520
4521#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
4522