1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2009, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6/************************************************************************
7*   Date        Name        Description
8*   12/15/99    Madhu        Creation.
9*   01/12/2000  Madhu        Updated for changed API and added new tests
10************************************************************************/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_BREAK_ITERATION
15
16#include "unicode/utypes.h"
17#include "unicode/brkiter.h"
18#include "unicode/rbbi.h"
19#include "unicode/uchar.h"
20#include "unicode/utf16.h"
21#include "unicode/ucnv.h"
22#include "unicode/schriter.h"
23#include "unicode/uniset.h"
24#include "unicode/regex.h"        // TODO: make conditional on regexp being built.
25#include "unicode/ustring.h"
26#include "unicode/utext.h"
27#include "intltest.h"
28#include "rbbitst.h"
29#include <string.h>
30#include "uvector.h"
31#include "uvectr32.h"
32#include "triedict.h"
33#include <string.h>
34#include <stdio.h>
35#include <stdlib.h>
36
// TEST_ASSERT(x)
//   Reports a test failure, tagged with the current source file and line, when
//   the condition x evaluates to false.  Expands to an unqualified errln() call,
//   so it can only be used where errln() is in scope.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// TEST_ASSERT_SUCCESS(errcode)
//   Reports a failure, tagged with file, line and the ICU error name, when
//   errcode holds a failing UErrorCode.  Expands to an unqualified errcheckln() call.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
42
43
44//---------------------------------------------
45// runIndexedTest
46//---------------------------------------------
47
48void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
49{
50    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
51
52    switch (index) {
53        case 0: name = "TestBug4153072";
54            if(exec) TestBug4153072();                         break;
55        case 1: name = "TestJapaneseLineBreak";
56            if(exec) TestJapaneseLineBreak();                  break;
57        case 2: name = "TestStatusReturn";
58            if(exec) TestStatusReturn();                       break;
59        case 3: name = "TestUnicodeFiles";
60            if(exec) TestUnicodeFiles();                       break;
61        case 4: name = "TestEmptyString";
62            if(exec) TestEmptyString();                        break;
63
64        case 5: name = "TestGetAvailableLocales";
65            if(exec) TestGetAvailableLocales();                break;
66
67        case 6: name = "TestGetDisplayName";
68            if(exec) TestGetDisplayName();                     break;
69
70        case 7: name = "TestEndBehaviour";
71            if(exec) TestEndBehaviour();                       break;
72        case 8: name = "TestMixedThaiLineBreak";
73             // BEGIN android-removed
74             // Disable all Thai breakiterator tests.
75             /* if(exec) TestMixedThaiLineBreak();    */       break;
76             // END android-removed
77        case 9: name = "TestThaiLineBreak";
78             // BEGIN android-removed
79             // Disable all Thai breakiterator tests.
80             /* if(exec) TestThaiLineBreak();         */       break;
81             // END android-removed
82        case 10: name = "TestMaiyamok";
83             // BEGIN android-removed
84             // Disable all Thai breakiterator tests.
85             /* if(exec) TestMaiyamok();              */       break;
86             // END android-removed
87        case 11: name = "TestWordBreaks";
88             if(exec) TestWordBreaks();                        break;
89        case 12: name = "TestWordBoundary";
90             if(exec) TestWordBoundary();                      break;
91        case 13: name = "TestLineBreaks";
92             if(exec) TestLineBreaks();                        break;
93        case 14: name = "TestSentBreaks";
94             if(exec) TestSentBreaks();                        break;
95        case 15: name = "TestExtended";
96             if(exec) TestExtended();                          break;
97        case 16: name = "TestMonkey";
98             if(exec) {
99 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
100               TestMonkey(params);
101 #else
102               logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
103 #endif
104             }
105                                                               break;
106        case 17: name = "TestBug3818";
107             // BEGIN android-removed
108             // Disable all Thai breakiterator tests.
109             /* if(exec) TestBug3818();                 */     break;
110             // END android-removed
111        case 18: name = "TestJapaneseWordBreak";
112            if(exec) TestJapaneseWordBreak();                  break;
113        case 19: name = "TestDebug";
114            if(exec) TestDebug();                              break;
115        case 20: name = "TestTrieDict";
116            if(exec) TestTrieDict();                           break;
117        case 21: name = "TestBug5775";
118            if (exec) TestBug5775();                           break;
119        case 22: name = "TestThaiBreaks";
120             // BEGIN android-removed
121             // Disable all Thai breakiterator tests.
122             /* if (exec) TestThaiBreaks();             */     break;
123             // END android-removed
124        case 23: name = "TestTailoredBreaks";
125            if (exec) TestTailoredBreaks();                    break;
126
127        default: name = ""; break; //needed to end loop
128    }
129}
130
131
132//---------------------------------------------------------------------------
133//
134//   class BITestData   Holds a set of Break iterator test data and results
135//                      Includes
136//                         - the string data to be broken
137//                         - a vector of the expected break positions.
138//                         - a vector of source line numbers for the data,
139//                               (to help see where errors occured.)
140//                         - The expected break tag values.
141//                         - Vectors of actual break positions and tag values.
142//                         - Functions for comparing actual with expected and
143//                            reporting errors.
144//
145//----------------------------------------------------------------------------
146class BITestData {
147public:
148    UnicodeString    fDataToBreak;
149    UVector          fExpectedBreakPositions;
150    UVector          fExpectedTags;
151    UVector          fLineNum;
152    UVector          fActualBreakPositions;   // Test Results.
153    UVector          fActualTags;
154
155    BITestData(UErrorCode &status);
156    void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
157    void             checkResults(const char *heading, RBBITest *test);
158    void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
159    void             clearResults();
160};
161
162//
163// Constructor.
164//
165BITestData::BITestData(UErrorCode &status)
166: fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
167  fActualTags(status)
168{
169}
170
171//
172// addDataChunk.   Add a section (non-breaking) piece if data to the test data.
173//                 The macro form collects the line number, which is helpful
174//                 when tracking down failures.
175//
176//                 A null data item is inserted at the start of each test's data
177//                  to put the starting zero into the data list.  The position saved for
178//                  each non-null item is its ending position.
179//
180#define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
181void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
182    if (U_FAILURE(status)) {return;}
183    if (data != NULL) {
184        fDataToBreak.append(CharsToUnicodeString(data));
185    }
186    fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
187    fExpectedTags.addElement(tag, status);
188    fLineNum.addElement(lineNum, status);
189}
190
191
192//
193//  checkResults.   Compare the actual and expected break positions, report any differences.
194//
195void BITestData::checkResults(const char *heading, RBBITest *test) {
196    int32_t   expectedIndex = 0;
197    int32_t   actualIndex = 0;
198
199    for (;;) {
200        // If we've run through both the expected and actual results vectors, we're done.
201        //   break out of the loop.
202        if (expectedIndex >= fExpectedBreakPositions.size() &&
203            actualIndex   >= fActualBreakPositions.size()) {
204            break;
205        }
206
207
208        if (expectedIndex >= fExpectedBreakPositions.size()) {
209            err(heading, test, expectedIndex-1, actualIndex);
210            actualIndex++;
211            continue;
212        }
213
214        if (actualIndex >= fActualBreakPositions.size()) {
215            err(heading, test, expectedIndex, actualIndex-1);
216            expectedIndex++;
217            continue;
218        }
219
220        if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
221            err(heading, test, expectedIndex, actualIndex);
222            // Try to resync the positions of the indices, to avoid a rash of spurious erros.
223            if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
224                actualIndex++;
225            } else {
226                expectedIndex++;
227            }
228            continue;
229        }
230
231        if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
232            test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
233                heading, fLineNum.elementAt(expectedIndex),
234                fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
235        }
236
237        actualIndex++;
238        expectedIndex++;
239    }
240}
241
242//
243//  err   -  An error was found.  Report it, along with information about where the
244//                                incorrectly broken test data appeared in the source file.
245//
246void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
247{
248    int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
249    int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
250    int32_t   o        = 0;
251    int32_t   line     = fLineNum.elementAti(expectedIdx);
252    if (expectedIdx > 0) {
253        // The line numbers are off by one because a premature break occurs somewhere
254        //    within the previous item, rather than at the start of the current (expected) item.
255        //    We want to report the offset of the unexpected break from the start of
256        //      this previous item.
257        o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
258    }
259    if (actual < expected) {
260        test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
261    } else {
262        test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
263    }
264}
265
266
267void BITestData::clearResults() {
268    fActualBreakPositions.removeAllElements();
269    fActualTags.removeAllElements();
270}
271
272
273//-----------------------------------------------------------------------------------
274//
275//    Cannned Test Characters
276//
277//-----------------------------------------------------------------------------------
278
279static const UChar cannedTestArray[] = {
280    0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
281    0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
282    0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
283    0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
284    0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
285    0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
286    0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
287    0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
288};
289
290static UnicodeString* cannedTestChars = 0;
291
// Escaped Devanagari sequences, in the \uXXXX form used by the test data strings.
//   half*   : a consonant followed by \u094d and \u200d
//   deadTA  : the consonant \u0924 followed by \u094d only
#define  halfNA     "\\u0928\\u094d\\u200d"
#define  halfSA     "\\u0938\\u094d\\u200d"
#define  halfCHA    "\\u091a\\u094d\\u200d"
#define  halfKA     "\\u0915\\u094d\\u200d"
#define  deadTA     "\\u0924\\u094d"
297
298//--------------------------------------------------------------------------------------
299//
300//    RBBITest    constructor and destructor
301//
302//--------------------------------------------------------------------------------------
303
304RBBITest::RBBITest() {
305    UnicodeString temp(cannedTestArray);
306    cannedTestChars = new UnicodeString();
307    *cannedTestChars += (UChar)0x0000;
308    *cannedTestChars += temp;
309}
310
311
312RBBITest::~RBBITest() {
313    delete cannedTestChars;
314}
315
316
// Named tag values for test data.
// NOTE(review): presumably these mirror the word break rule status values
//               (number / letter / kana / ideograph) - confirm against ubrk.h.
static const int T_NUMBER = 100;
static const int T_LETTER = 200;
static const int T_H_OR_K = 300;
static const int T_IDEO   = 400;
321
322
323
324
325
326
327//--------------------------------------------------------------------
328//Testing the BreakIterator for devanagari script
329//--------------------------------------------------------------------
330
// Escaped Devanagari sequences, in the \uXXXX form used by the test data strings.
// Each dead form is a consonant followed by the virama \u094d.
#define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
#define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
#define deadTTHA "\\u0920\\u094d"
#define deadPA   "\\u092a\\u094d"
#define deadSA   "\\u0938\\u094d"
#define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
337
338
339
340
341
342
343//-----------------------------------------------------------------------------------
344//
345//   Test for status {tag} return value from break rules.
346//        TODO:  a more thorough test.
347//
348//-----------------------------------------------------------------------------------
349void RBBITest::TestStatusReturn() {
350     UnicodeString rulesString1("$Letters = [:L:];\n"
351                                  "$Numbers = [:N:];\n"
352                                  "$Letters+{1};\n"
353                                  "$Numbers+{2};\n"
354                                  "Help\\ {4}/me\\!;\n"
355                                  "[^$Letters $Numbers];\n"
356                                  "!.*;\n", -1, US_INV);
357     UnicodeString testString1  = "abc123..abc Help me Help me!";
358                                // 01234567890123456789012345678
359     int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
360     int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
361
362     UErrorCode status=U_ZERO_ERROR;
363     UParseError    parseError;
364
365     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
366     if(U_FAILURE(status)) {
367         dataerrln("FAIL : in construction - %s", u_errorName(status));
368     } else {
369         int32_t  pos;
370         int32_t  i = 0;
371         bi->setText(testString1);
372         for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
373             if (pos != bounds1[i]) {
374                 errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
375                 break;
376             }
377
378             int tag = bi->getRuleStatus();
379             if (tag != brkStatus[i]) {
380                 errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
381                 break;
382             }
383             i++;
384         }
385     }
386     delete bi;
387}
388
389
390static void printStringBreaks(UnicodeString ustr, int expected[],
391                              int expectedcount)
392{
393    UErrorCode status = U_ZERO_ERROR;
394    char name[100];
395    printf("code    alpha extend alphanum type word sent line name\n");
396    int j;
397    for (j = 0; j < ustr.length(); j ++) {
398        if (expectedcount > 0) {
399            int k;
400            for (k = 0; k < expectedcount; k ++) {
401                if (j == expected[k]) {
402                    printf("------------------------------------------------ %d\n",
403                           j);
404                }
405            }
406        }
407        UChar32 c = ustr.char32At(j);
408        if (c > 0xffff) {
409            j ++;
410        }
411        u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
412        printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
413                           u_isUAlphabetic(c),
414                           u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
415                           u_isalnum(c),
416                           u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
417                                                  u_charType(c),
418                                                  U_SHORT_PROPERTY_NAME),
419                           u_getPropertyValueName(UCHAR_WORD_BREAK,
420                                                  u_getIntPropertyValue(c,
421                                                          UCHAR_WORD_BREAK),
422                                                  U_SHORT_PROPERTY_NAME),
423                           u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
424                                   u_getIntPropertyValue(c,
425                                           UCHAR_SENTENCE_BREAK),
426                                   U_SHORT_PROPERTY_NAME),
427                           u_getPropertyValueName(UCHAR_LINE_BREAK,
428                                   u_getIntPropertyValue(c,
429                                           UCHAR_LINE_BREAK),
430                                   U_SHORT_PROPERTY_NAME),
431                           name);
432    }
433}
434
435void RBBITest::TestThaiLineBreak() {
436    UErrorCode status = U_ZERO_ERROR;
437    BITestData thaiLineSelection(status);
438
439    // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
440    // represents elided letters at the end of a long word.  It should be bound to
441    // the end of the word and not treated as an independent punctuation mark.
442
443
444    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
445    ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
446    ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
447    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
448    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
449//        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
450//        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
451    ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
452    // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
453    ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
454    ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
455    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
456    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
457    ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
458    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
459
460    // the one time where the paiyannoi occurs somewhere other than at the end
461    // of a word is in the Thai abbrevation for "etc.", which both begins and
462    // ends with a paiyannoi
463    ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
464    ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
465    ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
466
467    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
468        Locale("th"), status);
469    if (U_FAILURE(status))
470    {
471        errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status));
472        return;
473    }
474
475    generalIteratorTest(*e, thaiLineSelection);
476    delete e;
477}
478
479
480
481void RBBITest::TestMixedThaiLineBreak()
482{
483    UErrorCode   status = U_ZERO_ERROR;
484    BITestData   thaiLineSelection(status);
485
486    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
487
488
489    // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
490    // start
491
492    ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
493    ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
494    ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
495    ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
496    ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
497    ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
498    ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
499    ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
500    ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
501    ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
502    ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
503    ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
504    ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
505    ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
506    ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
507    ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
508
509    // @suwit - end of changes
510
511
512    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
513    if (U_FAILURE(status))
514    {
515        errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status));
516        return;
517    }
518
519
520    generalIteratorTest(*e, thaiLineSelection);
521    delete e;
522}
523
524
525void RBBITest::TestMaiyamok()
526{
527    UErrorCode status = U_ZERO_ERROR;
528    BITestData   thaiLineSelection(status);
529    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
530    // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
531    // word".  Instead of appearing as a word unto itself, however, it's kept together
532    // with the word before it
533    ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
534    ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
535    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
536    ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
537    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
538    ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
539    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
540    ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
541    ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
542
543    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
544        Locale("th"), status);
545
546    if (U_FAILURE(status))
547    {
548        errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status));
549        return;
550    }
551    generalIteratorTest(*e, thaiLineSelection);
552    delete e;
553}
554
555
556
557void RBBITest::TestBug3818() {
558    UErrorCode  status = U_ZERO_ERROR;
559
560    // Four Thai words...
561    static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
562                                           0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
563    UnicodeString  thaiStr(thaiWordData);
564
565    RuleBasedBreakIterator* bi =
566        (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
567    if (U_FAILURE(status) || bi == NULL) {
568        errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
569        return;
570    }
571    bi->setText(thaiStr);
572
573    int32_t  startOfSecondWord = bi->following(1);
574    if (startOfSecondWord != 4) {
575        errln("Fail at file %s, line %d expected start of word at 4, got %d",
576            __FILE__, __LINE__, startOfSecondWord);
577    }
578    startOfSecondWord = bi->following(0);
579    if (startOfSecondWord != 4) {
580        errln("Fail at file %s, line %d expected start of word at 4, got %d",
581            __FILE__, __LINE__, startOfSecondWord);
582    }
583    delete bi;
584}
585
586
587void RBBITest::TestJapaneseWordBreak() {
588    UErrorCode status = U_ZERO_ERROR;
589    BITestData   japaneseWordSelection(status);
590
591    ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
592    ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
593    ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
594    ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
595    ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
596    ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
597    ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
598
599    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
600        Locale("ja"), status);
601    if (U_FAILURE(status))
602    {
603        errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
604        return;
605    }
606
607    generalIteratorTest(*e, japaneseWordSelection);
608    delete e;
609}
610
611void RBBITest::TestTrieDict() {
612    UErrorCode      status  = U_ZERO_ERROR;
613
614    //
615    //  Open and read the test data file.
616    //
617    const char *testDataDirectory = IntlTest::getSourceTestData(status);
618    char testFileName[1000];
619    if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
620        errln("Can't open test data.  Path too long.");
621        return;
622    }
623    strcpy(testFileName, testDataDirectory);
624    strcat(testFileName, "riwords.txt");
625
626    // Items needing deleting at the end
627    MutableTrieDictionary *mutableDict = NULL;
628    CompactTrieDictionary *compactDict = NULL;
629    UnicodeSet            *breaks      = NULL;
630    UChar                 *testFile    = NULL;
631    StringEnumeration     *enumer1     = NULL;
632    StringEnumeration     *enumer2     = NULL;
633    MutableTrieDictionary *mutable2    = NULL;
634    StringEnumeration     *cloneEnum   = NULL;
635    CompactTrieDictionary *compact2    = NULL;
636
637
638    const UnicodeString *originalWord = NULL;
639    const UnicodeString *cloneWord    = NULL;
640    UChar *current;
641    UChar *word;
642    UChar uc;
643    int32_t wordLen;
644    int32_t wordCount;
645    int32_t testCount;
646
647    int    len;
648    testFile = ReadAndConvertFile(testFileName, len, NULL, status);
649    if (U_FAILURE(status)) {
650        goto cleanup; /* something went wrong, error already output */
651    }
652
653    mutableDict = new MutableTrieDictionary(0x0E1C, status);
654    if (U_FAILURE(status)) {
655        errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
656        goto cleanup;
657    }
658
659    breaks = new UnicodeSet;
660    breaks->add(0x000A);     // Line Feed
661    breaks->add(0x000D);     // Carriage Return
662    breaks->add(0x2028);     // Line Separator
663    breaks->add(0x2029);     // Paragraph Separator
664
665    // Now add each non-comment line of the file as a word.
666    current = testFile;
667    word = current;
668    uc = *current++;
669    wordLen = 0;
670    wordCount = 0;
671
672    while (uc) {
673        if (uc == 0x0023) {     // #comment line, skip
674            while (uc && !breaks->contains(uc)) {
675                uc = *current++;
676            }
677        }
678        else while (uc && !breaks->contains(uc)) {
679            ++wordLen;
680            uc = *current++;
681        }
682        if (wordLen > 0) {
683            mutableDict->addWord(word, wordLen, status);
684            if (U_FAILURE(status)) {
685                errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
686                goto cleanup;
687            }
688            wordCount += 1;
689        }
690
691        // Find beginning of next line
692        while (uc && breaks->contains(uc)) {
693            uc = *current++;
694        }
695        word = current-1;
696        wordLen = 0;
697    }
698
699    if (wordCount < 50) {
700        errln("Word count (%d) unreasonably small\n", wordCount);
701        goto cleanup;
702    }
703
704    enumer1 = mutableDict->openWords(status);
705    if (U_FAILURE(status)) {
706        errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
707        goto cleanup;
708    }
709
710    testCount = 0;
711    if (wordCount != (testCount = enumer1->count(status))) {
712        errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
713            testCount, wordCount, u_errorName(status));
714        goto cleanup;
715    }
716
717    // Now compact it
718    compactDict = new CompactTrieDictionary(*mutableDict, status);
719    if (U_FAILURE(status)) {
720        errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
721        goto cleanup;
722    }
723
724    enumer2 = compactDict->openWords(status);
725    if (U_FAILURE(status)) {
726        errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
727        goto cleanup;
728    }
729
730    if (wordCount != (testCount = enumer2->count(status))) {
731        errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
732            testCount, wordCount, u_errorName(status));
733        goto cleanup;
734    }
735
736    if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
737        errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
738    }
739    delete enumer1;
740    enumer1 = NULL;
741    delete enumer2;
742    enumer2 = NULL;
743
744    // Now un-compact it
745    mutable2 = compactDict->cloneMutable(status);
746    if (U_FAILURE(status)) {
747        errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
748        goto cleanup;
749    }
750
751    cloneEnum = mutable2->openWords(status);
752    if (U_FAILURE(status)) {
753        errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
754        goto cleanup;
755    }
756
757    if (wordCount != (testCount = cloneEnum->count(status))) {
758        errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
759            testCount, wordCount, u_errorName(status));
760        goto cleanup;
761    }
762
763    // Compact original dictionary to clone. Note that we can only compare the same kind of
764    // dictionary as the order of the enumerators is not guaranteed to be the same between
765    // different kinds
766    enumer1 = mutableDict->openWords(status);
767    if (U_FAILURE(status)) {
768        errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
769        goto cleanup;
770     }
771
772    originalWord = enumer1->snext(status);
773    cloneWord = cloneEnum->snext(status);
774    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
775        if (*originalWord != *cloneWord) {
776            errln("Original and cloned MutableTrieDictionary word mismatch\n");
777            goto cleanup;
778        }
779        originalWord = enumer1->snext(status);
780        cloneWord = cloneEnum->snext(status);
781    }
782
783    if (U_FAILURE(status)) {
784        errln("Enumeration failed: %s\n", u_errorName(status));
785        goto cleanup;
786    }
787
788    if (originalWord != cloneWord) {
789        errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
790        goto cleanup;
791    }
792
793    // Test the data copying constructor for CompactTrieDict, and the data access APIs.
794    compact2 = new CompactTrieDictionary(compactDict->data(), status);
795    if (U_FAILURE(status)) {
796        errln("CompactTrieDictionary(const void *,...) failed\n");
797        goto cleanup;
798    }
799
800    if (compact2->dataSize() == 0) {
801        errln("CompactTrieDictionary->dataSize() == 0\n");
802        goto cleanup;
803    }
804
805    // Now count the words via the second dictionary
806    delete enumer1;
807    enumer1 = compact2->openWords(status);
808    if (U_FAILURE(status)) {
809        errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
810        goto cleanup;
811    }
812
813    if (wordCount != (testCount = enumer1->count(status))) {
814        errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
815            testCount, wordCount, u_errorName(status));
816        goto cleanup;
817    }
818
819cleanup:
820    delete compactDict;
821    delete mutableDict;
822    delete breaks;
823    delete[] testFile;
824    delete enumer1;
825    delete mutable2;
826    delete cloneEnum;
827    delete compact2;
828}
829
830
831//----------------------------------------------------------------------------
832//
833// generalIteratorTest      Given a break iterator and a set of test data,
834//                          Run the tests and report the results.
835//
836//----------------------------------------------------------------------------
837void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
838{
839
840    bi.setText(td.fDataToBreak);
841
842    testFirstAndNext(bi, td);
843
844    testLastAndPrevious(bi, td);
845
846    testFollowing(bi, td);
847    testPreceding(bi, td);
848    testIsBoundary(bi, td);
849    doMultipleSelectionTest(bi, td);
850}
851
852
853//
854//   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
855//                       kind of loop.
856//
857void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
858{
859    UErrorCode  status = U_ZERO_ERROR;
860    int32_t     p;
861    int32_t     lastP = -1;
862    int32_t     tag;
863
864    logln("Test first and next");
865    bi.setText(td.fDataToBreak);
866    td.clearResults();
867
868    for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
869        td.fActualBreakPositions.addElement(p, status);  // Save result.
870        tag = bi.getRuleStatus();
871        td.fActualTags.addElement(tag, status);
872        if (p <= lastP) {
873            // If the iterator is not making forward progress, stop.
874            //  No need to raise an error here, it'll be detected in the normal check of results.
875            break;
876        }
877        lastP = p;
878    }
879    td.checkResults("testFirstAndNext", this);
880}
881
882
883//
884//  TestLastAndPrevious.   Run the iterator backwards, starting with last().
885//
886void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
887{
888    UErrorCode  status = U_ZERO_ERROR;
889    int32_t     p;
890    int32_t     lastP  = 0x7ffffffe;
891    int32_t     tag;
892
893    logln("Test last and previous");
894    bi.setText(td.fDataToBreak);
895    td.clearResults();
896
897    for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
898        // Save break position.  Insert it at start of vector of results, shoving
899        //    already-saved results further towards the end.
900        td.fActualBreakPositions.insertElementAt(p, 0, status);
901        // bi.previous();   // TODO:  Why does this fix things up????
902        // bi.next();
903        tag = bi.getRuleStatus();
904        td.fActualTags.insertElementAt(tag, 0, status);
905        if (p >= lastP) {
906            // If the iterator is not making progress, stop.
907            //  No need to raise an error here, it'll be detected in the normal check of results.
908            break;
909        }
910        lastP = p;
911    }
912    td.checkResults("testLastAndPrevious", this);
913}
914
915
916void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
917{
918    UErrorCode  status = U_ZERO_ERROR;
919    int32_t     p;
920    int32_t     tag;
921    int32_t     lastP  = -2;     // A value that will never be returned as a break position.
922                                 //   cannot be -1; that is returned for DONE.
923    int         i;
924
925    logln("testFollowing():");
926    bi.setText(td.fDataToBreak);
927    td.clearResults();
928
929    // Save the starting point, since we won't get that out of following.
930    p = bi.first();
931    td.fActualBreakPositions.addElement(p, status);  // Save result.
932    tag = bi.getRuleStatus();
933    td.fActualTags.addElement(tag, status);
934
935    for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
936        p = bi.following(i);
937        if (p != lastP) {
938            if (p == RuleBasedBreakIterator::DONE) {
939                break;
940            }
941            // We've reached a new break position.  Save it.
942            td.fActualBreakPositions.addElement(p, status);  // Save result.
943            tag = bi.getRuleStatus();
944            td.fActualTags.addElement(tag, status);
945            lastP = p;
946        }
947    }
948    // The loop normally exits by means of the break in the middle.
949    // Make sure that the index was at the correct position for the break iterator to have
950    //   returned DONE.
951    if (i != td.fDataToBreak.length()) {
952        errln("testFollowing():  iterator returned DONE prematurely.");
953    }
954
955    // Full check of all results.
956    td.checkResults("testFollowing", this);
957}
958
959
960
961void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
962    UErrorCode  status = U_ZERO_ERROR;
963    int32_t     p;
964    int32_t     tag;
965    int32_t     lastP  = 0x7ffffffe;
966    int         i;
967
968    logln("testPreceding():");
969    bi.setText(td.fDataToBreak);
970    td.clearResults();
971
972    p = bi.last();
973    td.fActualBreakPositions.addElement(p, status);
974    tag = bi.getRuleStatus();
975    td.fActualTags.addElement(tag, status);
976
977    for (i = td.fDataToBreak.length(); i>=-1; i--) {
978        p = bi.preceding(i);
979        if (p != lastP) {
980            if (p == RuleBasedBreakIterator::DONE) {
981                break;
982            }
983            // We've reached a new break position.  Save it.
984            td.fActualBreakPositions.insertElementAt(p, 0, status);
985            lastP = p;
986            tag = bi.getRuleStatus();
987            td.fActualTags.insertElementAt(tag, 0, status);
988        }
989    }
990    // The loop normally exits by means of the break in the middle.
991    // Make sure that the index was at the correct position for the break iterator to have
992    //   returned DONE.
993    if (i != 0) {
994        errln("testPreceding():  iterator returned DONE prematurely.");
995    }
996
997    // Full check of all results.
998    td.checkResults("testPreceding", this);
999}
1000
1001
1002
1003void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
1004    UErrorCode  status = U_ZERO_ERROR;
1005    int         i;
1006    int32_t     tag;
1007
1008    logln("testIsBoundary():");
1009    bi.setText(td.fDataToBreak);
1010    td.clearResults();
1011
1012    for (i = 0; i <= td.fDataToBreak.length(); i++) {
1013        if (bi.isBoundary(i)) {
1014            td.fActualBreakPositions.addElement(i, status);  // Save result.
1015            tag = bi.getRuleStatus();
1016            td.fActualTags.addElement(tag, status);
1017        }
1018    }
1019    td.checkResults("testIsBoundary: ", this);
1020}
1021
1022
1023
1024void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
1025{
1026    iterator.setText(td.fDataToBreak);
1027
1028    RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
1029    int32_t offset = iterator.first();
1030    int32_t testOffset;
1031    int32_t count = 0;
1032
1033    logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
1034
1035    if (*testIterator != iterator)
1036        errln("clone() or operator!= failed: two clones compared unequal");
1037
1038    do {
1039        testOffset = testIterator->first();
1040        testOffset = testIterator->next(count);
1041        if (offset != testOffset)
1042            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1043
1044        if (offset != RuleBasedBreakIterator::DONE) {
1045            count++;
1046            offset = iterator.next();
1047
1048            if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
1049                errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
1050                if (count > 10000 || offset == -1) {
1051                    errln("operator== failed too many times. Stopping test.");
1052                    if (offset == -1) {
1053                        errln("Does (RuleBasedBreakIterator::DONE == -1)?");
1054                    }
1055                    return;
1056                }
1057            }
1058        }
1059    } while (offset != RuleBasedBreakIterator::DONE);
1060
1061    // now do it backwards...
1062    offset = iterator.last();
1063    count = 0;
1064
1065    do {
1066        testOffset = testIterator->last();
1067        testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
1068        if (offset != testOffset)
1069            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1070
1071        if (offset != RuleBasedBreakIterator::DONE) {
1072            count--;
1073            offset = iterator.previous();
1074        }
1075    } while (offset != RuleBasedBreakIterator::DONE);
1076
1077    delete testIterator;
1078}
1079
1080
1081//---------------------------------------------
1082//
1083//     other tests
1084//
1085//---------------------------------------------
1086void RBBITest::TestEmptyString()
1087{
1088    UnicodeString text = "";
1089    UErrorCode status = U_ZERO_ERROR;
1090
1091    BITestData x(status);
1092    ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
1093    RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
1094    if (U_FAILURE(status))
1095    {
1096        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
1097        return;
1098    }
1099    generalIteratorTest(*bi, x);
1100    delete bi;
1101}
1102
1103void RBBITest::TestGetAvailableLocales()
1104{
1105    int32_t locCount = 0;
1106    const Locale* locList = BreakIterator::getAvailableLocales(locCount);
1107
1108    if (locCount == 0)
1109        dataerrln("getAvailableLocales() returned an empty list!");
1110    // Just make sure that it's returning good memory.
1111    int32_t i;
1112    for (i = 0; i < locCount; ++i) {
1113        logln(locList[i].getName());
1114    }
1115}
1116
1117//Testing the BreakIterator::getDisplayName() function
1118void RBBITest::TestGetDisplayName()
1119{
1120    UnicodeString   result;
1121
1122    BreakIterator::getDisplayName(Locale::getUS(), result);
1123    if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
1124        dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
1125                + result);
1126
1127    BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
1128    if (result != "French (France)")
1129        dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1130                + result);
1131}
1132/**
1133 * Test End Behaviour
1134 * @bug 4068137
1135 */
1136void RBBITest::TestEndBehaviour()
1137{
1138    UErrorCode status = U_ZERO_ERROR;
1139    UnicodeString testString("boo.");
1140    BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
1141    if (U_FAILURE(status))
1142    {
1143        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
1144        return;
1145    }
1146    wb->setText(testString);
1147
1148    if (wb->first() != 0)
1149        errln("Didn't get break at beginning of string.");
1150    if (wb->next() != 3)
1151        errln("Didn't get break before period in \"boo.\"");
1152    if (wb->current() != 4 && wb->next() != 4)
1153        errln("Didn't get break at end of string.");
1154    delete wb;
1155}
1156/*
1157 * @bug 4153072
1158 */
1159void RBBITest::TestBug4153072() {
1160    UErrorCode status = U_ZERO_ERROR;
1161    BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
1162    if (U_FAILURE(status))
1163    {
1164        errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
1165        return;
1166    }
1167    UnicodeString str("...Hello, World!...");
1168    int32_t begin = 3;
1169    int32_t end = str.length() - 3;
1170    UBool onBoundary;
1171
1172    StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
1173    iter->adoptText(textIterator);
1174    int index;
1175    // Note: with the switch to UText, there is no way to restrict the
1176    //       iteration range to begin at an index other than zero.
1177    //       String character iterators created with a non-zero bound are
1178    //         treated by RBBI as being empty.
1179    for (index = -1; index < begin + 1; ++index) {
1180        onBoundary = iter->isBoundary(index);
1181        if (index == 0?  !onBoundary : onBoundary) {
1182            errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
1183                            " and begin index = " + begin);
1184        }
1185    }
1186    delete iter;
1187}
1188
1189
1190//
1191// Test for problem reported by Ashok Matoria on 9 July 2007
1192//    One.<kSoftHyphen><kSpace>Two.
1193//
1194//    Sentence break at start (0) and then on calling next() it breaks at
1195//   'T' of "Two". Now, at this point if I do next() and
1196//    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
1197//
1198void RBBITest::TestBug5775() {
1199    UErrorCode status = U_ZERO_ERROR;
1200    BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1201    TEST_ASSERT_SUCCESS(status);
1202    if (U_FAILURE(status)) {
1203        return;
1204    }
1205// Check for status first for better handling of no data errors.
1206    TEST_ASSERT(bi != NULL);
1207    if (bi == NULL) {
1208        return;
1209    }
1210
1211    UnicodeString s("One.\\u00ad Two.", -1, US_INV);
1212    //               01234      56789
1213    s = s.unescape();
1214    bi->setText(s);
1215    int pos = bi->next();
1216    TEST_ASSERT(pos == 6);
1217    pos = bi->next();
1218    TEST_ASSERT(pos == 10);
1219    pos = bi->previous();
1220    TEST_ASSERT(pos == 6);
1221    delete bi;
1222}
1223
1224
1225
/**
 * Test Japanese Line Break
 *
 * The whole body is currently compiled out with #if 0, so this test does nothing
 * when it is run.  The disabled code puts one character at a time between two
 * ideographs and checks where a Japanese line break iterator stops.
 * @bug 4095322
 */
void RBBITest::TestJapaneseLineBreak()
{
#if 0
    // Test needs updating some more...   Dump it for now.


    // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
    //        as opening and closing punctuation for line breaking.
    //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
    //        from these tests.    6-13-2002
    //
    UErrorCode status = U_ZERO_ERROR;
    // Three characters; the middle one (index 1) is replaced by each character under test.
    UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
    // Characters expected to allow a break before them (at 1) but not after them (at 2).
    UnicodeString precedingChars = CharsToUnicodeString(
        //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
        "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
    // Characters expected to allow a break after them (at 2) but not before them (at 1).
    UnicodeString followingChars = CharsToUnicodeString(
        // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
        ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
        // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
        ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
        "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
    BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);

    int32_t i;
    if (U_FAILURE(status))
    {
        errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
        return;
    }

    // Expected boundaries with a "preceding" character in the middle: 0, 1, 3.
    for (i = 0; i < precedingChars.length(); i++) {
        testString.setCharAt(1, precedingChars[i]);
        iter->setText(testString);
        int32_t j = iter->first();
        if (j != 0)
            errln("ja line break failure: failed to start at 0");
        j = iter->next();
        if (j != 1)
            errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
                        + "' (" + ((int)(precedingChars[i])) + ")");
        j = iter->next();
        if (j != 3)
            errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
                        + "' (" + ((int)(precedingChars[i])) + ")");
    }

    // Expected boundaries with a "following" character in the middle: 0, 2, 3.
    for (i = 0; i < followingChars.length(); i++) {
        testString.setCharAt(1, followingChars[i]);
        iter->setText(testString);
        int j = iter->first();
        if (j != 0)
            errln("ja line break failure: failed to start at 0");
        j = iter->next();
        if (j != 2)
            errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
                        + "' (" + ((int)(followingChars[i])) + ")");
        j = iter->next();
        if (j != 3)
            errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
                        + "' (" + ((int)(followingChars[i])) + ")");
    }
    delete iter;
#endif
}
1295
1296
1297//------------------------------------------------------------------------------
1298//
1299//   RBBITest::Extended    Run  RBBI Tests from an external test data file
1300//
1301//------------------------------------------------------------------------------
1302
// Everything executeTest() needs for one run: the iterator to exercise, the text,
//   and per-position tables describing the expected breaks and where each position
//   came from in the test data file.  The tables are indexed by offset into dataToBreak.
struct TestParams {
    BreakIterator   *bi;              // Iterator under test.  executeTest() does nothing when this is NULL.
    UnicodeString    dataToBreak;     // The text that is handed to bi->setText().
    UVector32       *expectedBreaks;  // 0: no break expected at this offset.  -1: break expected, with
                                      //    rule status 0.  Any other value: break expected, with that
                                      //    value as the rule status.
    UVector32       *srcLine;         // Test file line number for each offset; used in error messages.
    UVector32       *srcCol;          // Test file column number for each offset; used in error messages.
};
1310
1311void RBBITest::executeTest(TestParams *t) {
1312    int32_t    bp;
1313    int32_t    prevBP;
1314    int32_t    i;
1315
1316    if (t->bi == NULL) {
1317        return;
1318    }
1319
1320    t->bi->setText(t->dataToBreak);
1321    //
1322    //  Run the iterator forward
1323    //
1324    prevBP = -1;
1325    for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1326        if (prevBP ==  bp) {
1327            // Fail for lack of forward progress.
1328            errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
1329                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1330            break;
1331        }
1332
1333        // Check that there were we didn't miss an expected break between the last one
1334        //  and this one.
1335        for (i=prevBP+1; i<bp; i++) {
1336            if (t->expectedBreaks->elementAti(i) != 0) {
1337                int expected[] = {0, i};
1338                printStringBreaks(t->dataToBreak, expected, 2);
1339                errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1340                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1341            }
1342        }
1343
1344        // Check that the break we did find was expected
1345        if (t->expectedBreaks->elementAti(bp) == 0) {
1346            int expected[] = {0, bp};
1347            printStringBreaks(t->dataToBreak, expected, 2);
1348            errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1349                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1350        } else {
1351            // The break was expected.
1352            //   Check that the {nnn} tag value is correct.
1353            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1354            if (expectedTagVal == -1) {
1355                expectedTagVal = 0;
1356            }
1357            int32_t line = t->srcLine->elementAti(bp);
1358            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1359            if (rs != expectedTagVal) {
1360                errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1361                      "          Actual, Expected status = %4d, %4d",
1362                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1363            }
1364        }
1365
1366
1367        prevBP = bp;
1368    }
1369
1370    // Verify that there were no missed expected breaks after the last one found
1371    for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
1372        if (t->expectedBreaks->elementAti(i) != 0) {
1373            errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1374                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1375        }
1376    }
1377
1378    //
1379    //  Run the iterator backwards, verify that the same breaks are found.
1380    //
1381    prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
1382    for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1383        if (prevBP ==  bp) {
1384            // Fail for lack of progress.
1385            errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1386                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1387            break;
1388        }
1389
1390        // Check that there were we didn't miss an expected break between the last one
1391        //  and this one.  (UVector returns zeros for index out of bounds.)
1392        for (i=prevBP-1; i>bp; i--) {
1393            if (t->expectedBreaks->elementAti(i) != 0) {
1394                errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1395                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1396            }
1397        }
1398
1399        // Check that the break we did find was expected
1400        if (t->expectedBreaks->elementAti(bp) == 0) {
1401            errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1402                   bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1403        } else {
1404            // The break was expected.
1405            //   Check that the {nnn} tag value is correct.
1406            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1407            if (expectedTagVal == -1) {
1408                expectedTagVal = 0;
1409            }
1410            int line = t->srcLine->elementAti(bp);
1411            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1412            if (rs != expectedTagVal) {
1413                errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1414                      "          Actual, Expected status = %4d, %4d",
1415                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1416            }
1417        }
1418
1419        prevBP = bp;
1420    }
1421
1422    // Verify that there were no missed breaks prior to the last one found
1423    for (i=prevBP-1; i>=0; i--) {
1424        if (t->expectedBreaks->elementAti(i) != 0) {
1425            errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1426                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1427        }
1428    }
1429}
1430
1431
1432void RBBITest::TestExtended() {
1433#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1434    UErrorCode      status  = U_ZERO_ERROR;
1435    Locale          locale("");
1436
1437    UnicodeString       rules;
1438    TestParams          tp;
1439    tp.bi             = NULL;
1440    tp.expectedBreaks = new UVector32(status);
1441    tp.srcLine        = new UVector32(status);
1442    tp.srcCol         = new UVector32(status);
1443
1444    RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
1445    if (U_FAILURE(status)) {
1446        dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1447    }
1448
1449
1450    //
1451    //  Open and read the test data file.
1452    //
1453    const char *testDataDirectory = IntlTest::getSourceTestData(status);
1454    char testFileName[1000];
1455    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1456        errln("Can't open test data.  Path too long.");
1457        return;
1458    }
1459    strcpy(testFileName, testDataDirectory);
1460    strcat(testFileName, "rbbitst.txt");
1461
1462    int    len;
1463    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1464    if (U_FAILURE(status)) {
1465        return; /* something went wrong, error already output */
1466    }
1467
1468
1469
1470
1471    //
1472    //  Put the test data into a UnicodeString
1473    //
1474    UnicodeString testString(FALSE, testFile, len);
1475
1476    enum EParseState{
1477        PARSE_COMMENT,
1478        PARSE_TAG,
1479        PARSE_DATA,
1480        PARSE_NUM
1481    }
1482    parseState = PARSE_TAG;
1483
1484    EParseState savedState = PARSE_TAG;
1485
1486    static const UChar CH_LF        = 0x0a;
1487    static const UChar CH_CR        = 0x0d;
1488    static const UChar CH_HASH      = 0x23;
1489    /*static const UChar CH_PERIOD    = 0x2e;*/
1490    static const UChar CH_LT        = 0x3c;
1491    static const UChar CH_GT        = 0x3e;
1492    static const UChar CH_BACKSLASH = 0x5c;
1493    static const UChar CH_BULLET    = 0x2022;
1494
1495    int32_t    lineNum  = 1;
1496    int32_t    colStart = 0;
1497    int32_t    column   = 0;
1498    int32_t    charIdx  = 0;
1499
1500    int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1501
1502    for (charIdx = 0; charIdx < len; ) {
1503        status = U_ZERO_ERROR;
1504        UChar  c = testString.charAt(charIdx);
1505        charIdx++;
1506        if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1507            // treat CRLF as a unit
1508            c = CH_LF;
1509            charIdx++;
1510        }
1511        if (c == CH_LF || c == CH_CR) {
1512            lineNum++;
1513            colStart = charIdx;
1514        }
1515        column = charIdx - colStart + 1;
1516
1517        switch (parseState) {
1518        case PARSE_COMMENT:
1519            if (c == 0x0a || c == 0x0d) {
1520                parseState = savedState;
1521            }
1522            break;
1523
1524        case PARSE_TAG:
1525            {
1526            if (c == CH_HASH) {
1527                parseState = PARSE_COMMENT;
1528                savedState = PARSE_TAG;
1529                break;
1530            }
1531            if (u_isUWhiteSpace(c)) {
1532                break;
1533            }
1534            if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1535                delete tp.bi;
1536                tp.bi = BreakIterator::createWordInstance(locale,  status);
1537                charIdx += 5;
1538                break;
1539            }
1540            if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1541                delete tp.bi;
1542                tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1543                charIdx += 5;
1544                break;
1545            }
1546            if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1547                delete tp.bi;
1548                tp.bi = BreakIterator::createLineInstance(locale,  status);
1549                charIdx += 5;
1550                break;
1551            }
1552            if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1553                delete tp.bi;
1554                tp.bi = NULL;
1555                tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1556                charIdx += 5;
1557                break;
1558            }
1559            if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1560                delete tp.bi;
1561                tp.bi = BreakIterator::createTitleInstance(locale,  status);
1562                charIdx += 6;
1563                break;
1564            }
1565
1566            // <locale  loc_name>
1567            localeMatcher.reset(testString);
1568            if (localeMatcher.lookingAt(charIdx-1, status)) {
1569                UnicodeString localeName = localeMatcher.group(1, status);
1570                char localeName8[100];
1571                localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1572                locale = Locale::createFromName(localeName8);
1573                charIdx += localeMatcher.group(0, status).length();
1574                TEST_ASSERT_SUCCESS(status);
1575                break;
1576            }
1577            if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1578                parseState = PARSE_DATA;
1579                charIdx += 5;
1580                tp.dataToBreak = "";
1581                tp.expectedBreaks->removeAllElements();
1582                tp.srcCol ->removeAllElements();
1583                tp.srcLine->removeAllElements();
1584                break;
1585            }
1586
1587            errln("line %d: Tag expected in test file.", lineNum);
1588            parseState = PARSE_COMMENT;
1589            savedState = PARSE_DATA;
1590            goto end_test; // Stop the test.
1591            }
1592            break;
1593
1594        case PARSE_DATA:
1595            if (c == CH_BULLET) {
1596                int32_t  breakIdx = tp.dataToBreak.length();
1597                tp.expectedBreaks->setSize(breakIdx+1);
1598                tp.expectedBreaks->setElementAt(-1, breakIdx);
1599                tp.srcLine->setSize(breakIdx+1);
1600                tp.srcLine->setElementAt(lineNum, breakIdx);
1601                tp.srcCol ->setSize(breakIdx+1);
1602                tp.srcCol ->setElementAt(column, breakIdx);
1603                break;
1604            }
1605
1606            if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1607                // Add final entry to mappings from break location to source file position.
1608                //  Need one extra because last break position returned is after the
1609                //    last char in the data, not at the last char.
1610                tp.srcLine->addElement(lineNum, status);
1611                tp.srcCol ->addElement(column, status);
1612
1613                parseState = PARSE_TAG;
1614                charIdx += 6;
1615
1616                // RUN THE TEST!
1617                executeTest(&tp);
1618                break;
1619            }
1620
1621            if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1622                // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1623                // Get the code point from the name and insert it into the test data.
1624                //   (Damn, no API takes names in Unicode  !!!
1625                //    we've got to take it back to char *)
1626                int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1627                int32_t nameLength = nameEndIdx - (charIdx+2);
1628                char charNameBuf[200];
1629                UChar32 theChar = -1;
1630                if (nameEndIdx != -1) {
1631                    UErrorCode status = U_ZERO_ERROR;
1632                    testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1633                    charNameBuf[sizeof(charNameBuf)-1] = 0;
1634                    theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1635                    if (U_FAILURE(status)) {
1636                        theChar = -1;
1637                    }
1638                }
1639                if (theChar == -1) {
1640                    errln("Error in named character in test file at line %d, col %d",
1641                        lineNum, column);
1642                } else {
1643                    // Named code point was recognized.  Insert it
1644                    //   into the test data.
1645                    tp.dataToBreak.append(theChar);
1646                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
1647                        tp.srcLine->addElement(lineNum, status);
1648                        tp.srcCol ->addElement(column, status);
1649                    }
1650                }
1651                if (nameEndIdx > charIdx) {
1652                    charIdx = nameEndIdx+1;
1653
1654                }
1655                break;
1656            }
1657
1658
1659
1660
1661            if (testString.compare(charIdx-1, 2, "<>") == 0) {
1662                charIdx++;
1663                int32_t  breakIdx = tp.dataToBreak.length();
1664                tp.expectedBreaks->setSize(breakIdx+1);
1665                tp.expectedBreaks->setElementAt(-1, breakIdx);
1666                tp.srcLine->setSize(breakIdx+1);
1667                tp.srcLine->setElementAt(lineNum, breakIdx);
1668                tp.srcCol ->setSize(breakIdx+1);
1669                tp.srcCol ->setElementAt(column, breakIdx);
1670                break;
1671            }
1672
1673            if (c == CH_LT) {
1674                tagValue   = 0;
1675                parseState = PARSE_NUM;
1676                break;
1677            }
1678
1679            if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1680                parseState = PARSE_COMMENT;
1681                savedState = PARSE_DATA;
1682                break;
1683            }
1684
1685            if (c == CH_BACKSLASH) {
1686                // Check for \ at end of line, a line continuation.
1687                //     Advance over (discard) the newline
1688                UChar32 cp = testString.char32At(charIdx);
1689                if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1690                    // We have a CR LF
1691                    //  Need an extra increment of the input ptr to move over both of them
1692                    charIdx++;
1693                }
1694                if (cp == CH_LF || cp == CH_CR) {
1695                    lineNum++;
1696                    colStart = charIdx;
1697                    charIdx++;
1698                    break;
1699                }
1700
1701                // Let unescape handle the back slash.
1702                cp = testString.unescapeAt(charIdx);
1703                if (cp != -1) {
1704                    // Escape sequence was recognized.  Insert the char
1705                    //   into the test data.
1706                    tp.dataToBreak.append(cp);
1707                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
1708                        tp.srcLine->addElement(lineNum, status);
1709                        tp.srcCol ->addElement(column, status);
1710                    }
1711                    break;
1712                }
1713
1714
1715                // Not a recognized backslash escape sequence.
1716                // Take the next char as a literal.
1717                //  TODO:  Should this be an error?
1718                c = testString.charAt(charIdx);
1719                charIdx = testString.moveIndex32(charIdx, 1);
1720            }
1721
1722            // Normal, non-escaped data char.
1723            tp.dataToBreak.append(c);
1724
1725            // Save the mapping from offset in the data to line/column numbers in
1726            //   the original input file.  Will be used for better error messages only.
1727            //   If there's an expected break before this char, the slot in the mapping
1728            //     vector will already be set for this char; don't overwrite it.
1729            if (tp.dataToBreak.length() > tp.srcLine->size()) {
1730                tp.srcLine->addElement(lineNum, status);
1731                tp.srcCol ->addElement(column, status);
1732            }
1733            break;
1734
1735
1736        case PARSE_NUM:
1737            // We are parsing an expected numeric tag value, like <1234>,
1738            //   within a chunk of data.
1739            if (u_isUWhiteSpace(c)) {
1740                break;
1741            }
1742
1743            if (c == CH_GT) {
1744                // Finished the number.  Add the info to the expected break data,
1745                //   and switch parse state back to doing plain data.
1746                parseState = PARSE_DATA;
1747                if (tagValue == 0) {
1748                    tagValue = -1;
1749                }
1750                int32_t  breakIdx = tp.dataToBreak.length();
1751                tp.expectedBreaks->setSize(breakIdx+1);
1752                tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1753                tp.srcLine->setSize(breakIdx+1);
1754                tp.srcLine->setElementAt(lineNum, breakIdx);
1755                tp.srcCol ->setSize(breakIdx+1);
1756                tp.srcCol ->setElementAt(column, breakIdx);
1757                break;
1758            }
1759
1760            if (u_isdigit(c)) {
1761                tagValue = tagValue*10 + u_charDigitValue(c);
1762                break;
1763            }
1764
1765            errln("Syntax Error in test file at line %d, col %d",
1766                lineNum, column);
1767            parseState = PARSE_COMMENT;
1768            goto end_test; // Stop the test
1769            break;
1770        }
1771
1772
1773        if (U_FAILURE(status)) {
1774            errln("ICU Error %s while parsing test file at line %d.",
1775                u_errorName(status), lineNum);
1776            status = U_ZERO_ERROR;
1777            goto end_test; // Stop the test
1778        }
1779
1780    }
1781
1782end_test:
1783    delete tp.bi;
1784    delete tp.expectedBreaks;
1785    delete tp.srcLine;
1786    delete tp.srcCol;
1787    delete [] testFile;
1788#endif
1789}
1790
1791void RBBITest::TestThaiBreaks() {
1792    UErrorCode status=U_ZERO_ERROR;
1793    BreakIterator* b;
1794    Locale locale = Locale("th");
1795    int32_t p, index;
1796    UChar c[]= {
1797            0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
1798            0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
1799            0x0E16, 0x0E49, 0x0E33
1800    };
1801    int32_t expectedWordResult[] = {
1802            2, 3, 6, 10, 11, 15, 17, 20, 22
1803    };
1804    int32_t expectedLineResult[] = {
1805            3, 6, 11, 15, 17, 20, 22
1806    };
1807    int32_t size = sizeof(c)/sizeof(UChar);
1808    UnicodeString text=UnicodeString(c);
1809
1810    b = BreakIterator::createWordInstance(locale, status);
1811    if (U_FAILURE(status)) {
1812        errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
1813        return;
1814    }
1815    b->setText(text);
1816    p = index = 0;
1817    while ((p=b->next())!=BreakIterator::DONE && p < size) {
1818        if (p != expectedWordResult[index++]) {
1819            errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
1820        }
1821    }
1822    delete b;
1823
1824    b = BreakIterator::createLineInstance(locale, status);
1825    if (U_FAILURE(status)) {
1826        printf("Unable to create thai line break iterator.\n");
1827        return;
1828    }
1829    b->setText(text);
1830    p = index = 0;
1831    while ((p=b->next())!=BreakIterator::DONE && p < size) {
1832        if (p != expectedLineResult[index++]) {
1833            errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
1834        }
1835    }
1836
1837    delete b;
1838}
1839
1840// UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
1841// Words don't include colon or period (cldrbug #1969).
1842static const char    posxWordText[]     = "Can't have breaks in xx:yy or struct.field for CS-types.";
1843static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };
1844static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21,         26, 27, 29, 30,         42, 43, 46, 47, 49, 50, 55, 56 };
1845
1846// UBreakIteratorType UBRK_WORD, Locale "ja"
1847// Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
1848static const char    jaWordText[]     = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
1849                                        "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
1850static const int32_t jaWordTOffsets[] = {    2, 3,          7, 8, 14,         17, 18,     20, 21, 24,         27, 28 };
1851static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
1852
1853// UBreakIteratorType UBRK_SENTENCE, Locale "el"
1854// Add break after Greek question mark (cldrbug #2069).
1855static const char    elSentText[]     = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
1856                                        "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
1857static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 };
1858static const int32_t elSentROffsets[] = {        20, 27, 35, 36 };
1859
1860// UBreakIteratorType UBRK_CHARACTER, Locale "th"
1861// Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
1862static const char    thCharText[]     = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 "
1863                                        "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) "
1864                                        "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 ";
1865static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11,
1866                                          12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,
1867                                          29, 30, 32, 33, 35, 37, 38, 39, 40, 41 };
1868static const int32_t thCharROffsets[] = { 1,    3, 5, 6, 7, 8, 9,     11,
1869                                          12, 13, 15,     17, 19, 20, 22,     24,     26, 27, 28,
1870                                          29,     32, 33, 35, 37, 38,     40, 41 };
1871
1872typedef struct {
1873    UBreakIteratorType  type;
1874    const char *        locale;
1875    const char *        escapedText;
1876    const int32_t *     tailoredOffsets;
1877    int32_t             tailoredOffsetsCount;
1878    const int32_t *     rootOffsets;
1879    int32_t             rootOffsetsCount;
1880} TailoredBreakItem;
1881
1882#define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0]))
1883
1884static const TailoredBreakItem tbItems[] = {
1885    { UBRK_WORD,      "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) },
1886    { UBRK_WORD,      "ja",          jaWordText,   ARRAY_PTR_LEN(jaWordTOffsets),   ARRAY_PTR_LEN(jaWordROffsets)   },
1887    { UBRK_SENTENCE,  "el",          elSentText,   ARRAY_PTR_LEN(elSentTOffsets),   ARRAY_PTR_LEN(elSentROffsets)   },
1888    { UBRK_CHARACTER, "th",          thCharText,   ARRAY_PTR_LEN(thCharTOffsets),   ARRAY_PTR_LEN(thCharROffsets)   },
1889    { UBRK_CHARACTER, NULL,          NULL,         NULL,0,                          NULL,0                          } // terminator
1890};
1891
/**
 * Formats a list of offsets as text, each one written as a space followed by
 * its decimal value (for example " 1 22 333").
 *
 * The result is always NUL-terminated when buflen > 0, including when count
 * is 0 (the result is then the empty string).  If the buffer is too small,
 * output stops before the first offset that does not fit completely.
 *
 * @param buffer   destination buffer
 * @param buflen   capacity of buffer in bytes, including room for the NUL
 * @param count    number of entries in offsets
 * @param offsets  the values to format
 */
static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) {
    if (buffer == NULL || buflen <= 0) {
        return;
    }
    buffer[0] = 0;
    while (count-- > 0) {
        // A space plus a 32-bit decimal value needs at most 12 characters.
        char item[16];
        int itemLen = snprintf(item, sizeof(item), " %d", (int)*offsets++);
        if (itemLen < 0 || itemLen >= buflen) {
            break;      // formatting error, or no room left for item + NUL
        }
        memcpy(buffer, item, itemLen + 1);
        buffer += itemLen;
        buflen -= itemLen;
    }
}
1900
1901enum { kMaxOffsetCount = 128 };
1902
1903void RBBITest::TBTest(BreakIterator* brkitr, int type, const char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) {
1904    brkitr->setText( CharsToUnicodeString(escapedText) );
1905    int32_t foundOffsets[kMaxOffsetCount];
1906    int32_t offset, foundOffsetsCount = 0;
1907    // do forwards iteration test
1908    while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) {
1909        foundOffsets[foundOffsetsCount++] = offset;
1910    }
1911    if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) {
1912        // log error for forwards test
1913        char formatExpect[512], formatFound[512];
1914        formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
1915        formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets);
1916        errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n",
1917                type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound);
1918    } else {
1919        // do backwards iteration test
1920        --foundOffsetsCount; // back off one from the end offset
1921        while ( foundOffsetsCount > 0 ) {
1922            offset = brkitr->previous();
1923            if ( offset != foundOffsets[--foundOffsetsCount] ) {
1924                // log error for backwards test
1925                char formatExpect[512];
1926                formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
1927                errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n",
1928                        type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]);
1929                break;
1930            }
1931        }
1932    }
1933}
1934
1935void RBBITest::TestTailoredBreaks() {
1936    const TailoredBreakItem * tbItemPtr;
1937    Locale rootLocale = Locale("root");
1938    for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) {
1939        Locale testLocale = Locale(tbItemPtr->locale);
1940        BreakIterator * tailoredBrkiter;
1941        BreakIterator * rootBrkiter;
1942        UErrorCode status = U_ZERO_ERROR;
1943        switch (tbItemPtr->type) {
1944            case UBRK_CHARACTER:
1945                tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status);
1946                rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status);
1947                break;
1948            case UBRK_WORD:
1949                tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status);
1950                rootBrkiter = BreakIterator::createWordInstance(rootLocale, status);
1951                break;
1952            case UBRK_LINE:
1953                tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status);
1954                rootBrkiter = BreakIterator::createLineInstance(rootLocale, status);
1955                break;
1956            case UBRK_SENTENCE:
1957                tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status);
1958                rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status);
1959                break;
1960            default:
1961                status = U_UNSUPPORTED_ERROR;
1962                break;
1963        }
1964        if (U_FAILURE(status)) {
1965            errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status));
1966            continue;
1967        }
1968        TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount);
1969        TBTest(rootBrkiter,     (int)(tbItemPtr->type), "root",            tbItemPtr->escapedText, tbItemPtr->rootOffsets,     tbItemPtr->rootOffsetsCount);
1970
1971        delete rootBrkiter;
1972        delete tailoredBrkiter;
1973    }
1974}
1975
1976
1977//-------------------------------------------------------------------------------
1978//
1979//    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1980//    return the datain one big UChar * buffer, which the caller must delete.
1981//
1982//    parameters:
1983//          fileName:   the name of the file, with no directory part.  The test data directory
1984//                      is assumed.
1985//          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1986//          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1987//                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1988//                      Pass NULL for the system default encoding.
1989//          status
1990//    returns:
1991//                      The file data, converted to UChar.
1992//                      The caller must delete this when done with
1993//                           delete [] theBuffer;
1994//
1995//    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1996//           Move this function to some common place.
1997//
1998//--------------------------------------------------------------------------------
1999UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
2000    UChar       *retPtr  = NULL;
2001    char        *fileBuf = NULL;
2002    UConverter* conv     = NULL;
2003    FILE        *f       = NULL;
2004
2005    ulen = 0;
2006    if (U_FAILURE(status)) {
2007        return retPtr;
2008    }
2009
2010    //
2011    //  Open the file.
2012    //
2013    f = fopen(fileName, "rb");
2014    if (f == 0) {
2015        dataerrln("Error opening test data file %s\n", fileName);
2016        status = U_FILE_ACCESS_ERROR;
2017        return NULL;
2018    }
2019    //
2020    //  Read it in
2021    //
2022    int   fileSize;
2023    int   amt_read;
2024
2025    fseek( f, 0, SEEK_END);
2026    fileSize = ftell(f);
2027    fileBuf = new char[fileSize];
2028    fseek(f, 0, SEEK_SET);
2029    amt_read = fread(fileBuf, 1, fileSize, f);
2030    if (amt_read != fileSize || fileSize <= 0) {
2031        errln("Error reading test data file.");
2032        goto cleanUpAndReturn;
2033    }
2034
2035    //
2036    // Look for a Unicode Signature (BOM) on the data just read
2037    //
2038    int32_t        signatureLength;
2039    const char *   fileBufC;
2040    const char*    bomEncoding;
2041
2042    fileBufC = fileBuf;
2043    bomEncoding = ucnv_detectUnicodeSignature(
2044        fileBuf, fileSize, &signatureLength, &status);
2045    if(bomEncoding!=NULL ){
2046        fileBufC  += signatureLength;
2047        fileSize  -= signatureLength;
2048        encoding = bomEncoding;
2049    }
2050
2051    //
2052    // Open a converter to take the rule file to UTF-16
2053    //
2054    conv = ucnv_open(encoding, &status);
2055    if (U_FAILURE(status)) {
2056        goto cleanUpAndReturn;
2057    }
2058
2059    //
2060    // Convert the rules to UChar.
2061    //  Preflight first to determine required buffer size.
2062    //
2063    ulen = ucnv_toUChars(conv,
2064        NULL,           //  dest,
2065        0,              //  destCapacity,
2066        fileBufC,
2067        fileSize,
2068        &status);
2069    if (status == U_BUFFER_OVERFLOW_ERROR) {
2070        // Buffer Overflow is expected from the preflight operation.
2071        status = U_ZERO_ERROR;
2072
2073        retPtr = new UChar[ulen+1];
2074        ucnv_toUChars(conv,
2075            retPtr,       //  dest,
2076            ulen+1,
2077            fileBufC,
2078            fileSize,
2079            &status);
2080    }
2081
2082cleanUpAndReturn:
2083    fclose(f);
2084    delete []fileBuf;
2085    ucnv_close(conv);
2086    if (U_FAILURE(status)) {
2087        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
2088        delete retPtr;
2089        retPtr = 0;
2090        ulen   = 0;
2091    };
2092    return retPtr;
2093}
2094
2095
2096
2097//--------------------------------------------------------------------------------------------
2098//
2099//   Run tests from each of the boundary test data files distributed by the Unicode Consortium
2100//
2101//-------------------------------------------------------------------------------------------
2102void RBBITest::TestUnicodeFiles() {
2103    RuleBasedBreakIterator  *bi;
2104    UErrorCode               status = U_ZERO_ERROR;
2105
2106    bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getDefault(), status);
2107    TEST_ASSERT_SUCCESS(status);
2108    if (U_SUCCESS(status)) {
2109        runUnicodeTestData("GraphemeBreakTest.txt", bi);
2110    }
2111    delete bi;
2112
2113    bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status);
2114    TEST_ASSERT_SUCCESS(status);
2115    if (U_SUCCESS(status)) {
2116        runUnicodeTestData("WordBreakTest.txt", bi);
2117    }
2118    delete bi;
2119
2120    bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
2121    TEST_ASSERT_SUCCESS(status);
2122    if (U_SUCCESS(status)) {
2123        runUnicodeTestData("SentenceBreakTest.txt", bi);
2124    }
2125    delete bi;
2126
2127    bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
2128    TEST_ASSERT_SUCCESS(status);
2129    if (U_SUCCESS(status)) {
2130        runUnicodeTestData("LineBreakTest.txt", bi);
2131    }
2132    delete bi;
2133}
2134
2135
2136//--------------------------------------------------------------------------------------------
2137//
2138//   Run tests from one of the boundary test data files distributed by the Unicode Consortium
2139//
2140//-------------------------------------------------------------------------------------------
2141void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
2142#if !UCONFIG_NO_REGULAR_EXPRESSIONS
2143    UErrorCode  status = U_ZERO_ERROR;
2144
2145    //
2146    //  Open and read the test data file, put it into a UnicodeString.
2147    //
2148    const char *testDataDirectory = IntlTest::getSourceTestData(status);
2149    char testFileName[1000];
2150    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
2151        dataerrln("Can't open test data.  Path too long.");
2152        return;
2153    }
2154    strcpy(testFileName, testDataDirectory);
2155    strcat(testFileName, fileName);
2156
2157    logln("Opening data file %s\n", fileName);
2158
2159    int    len;
2160    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
2161    if (status != U_FILE_ACCESS_ERROR) {
2162        TEST_ASSERT_SUCCESS(status);
2163        TEST_ASSERT(testFile != NULL);
2164    }
2165    if (U_FAILURE(status) || testFile == NULL) {
2166        return; /* something went wrong, error already output */
2167    }
2168    UnicodeString testFileAsString(TRUE, testFile, len);
2169
2170    //
2171    //  Parse the test data file using a regular expression.
2172    //  Each kind of token is recognized in its own capture group; what type of item was scanned
2173    //     is identified by which group had a match.
2174    //
2175    //    Caputure Group #                  1          2            3            4           5
2176    //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
2177    //
2178    UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
2179    RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
2180    UnicodeString   testString;
2181    UVector32       breakPositions(status);
2182    int             lineNumber = 1;
2183    TEST_ASSERT_SUCCESS(status);
2184    if (U_FAILURE(status)) {
2185        return;
2186    }
2187
2188    //
2189    //  Scan through each test case, building up the string to be broken in testString,
2190    //   and the positions that should be boundaries in the breakPositions vector.
2191    //
2192    while (tokenMatcher.find()) {
2193        if (tokenMatcher.start(1, status) >= 0) {
2194            // Scanned a divide sign, indicating a break position in the test data.
2195            if (testString.length()>0) {
2196                breakPositions.addElement(testString.length(), status);
2197            }
2198        }
2199        else if (tokenMatcher.start(2, status) >= 0) {
2200            // Scanned an 'x', meaning no break at this position in the test data
2201            //   Nothing to be done here.
2202            }
2203        else if (tokenMatcher.start(3, status) >= 0) {
2204            // Scanned Hex digits.  Convert them to binary, append to the character data string.
2205            const UnicodeString &hexNumber = tokenMatcher.group(3, status);
2206            int length = hexNumber.length();
2207            if (length<=8) {
2208                char buf[10];
2209                hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
2210                UChar32 c = (UChar32)strtol(buf, NULL, 16);
2211                if (c<=0x10ffff) {
2212                    testString.append(c);
2213                } else {
2214                    errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
2215                       fileName, lineNumber);
2216                }
2217            } else {
2218                errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
2219                       fileName, lineNumber);
2220             }
2221        }
2222        else if (tokenMatcher.start(4, status) >= 0) {
2223            // Scanned to end of a line, possibly skipping over a comment in the process.
2224            //   If the line from the file contained test data, run the test now.
2225            //
2226            if (testString.length() > 0) {
2227                checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
2228            }
2229
2230            // Clear out this test case.
2231            //    The string and breakPositions vector will be refilled as the next
2232            //       test case is parsed.
2233            testString.remove();
2234            breakPositions.removeAllElements();
2235            lineNumber++;
2236        } else {
2237            // Scanner catchall.  Something unrecognized appeared on the line.
2238            char token[16];
2239            UnicodeString uToken = tokenMatcher.group(0, status);
2240            uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
2241            token[sizeof(token)-1] = 0;
2242            errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
2243
2244            // Clean up, in preparation for continuing with the next line.
2245            testString.remove();
2246            breakPositions.removeAllElements();
2247            lineNumber++;
2248        }
2249        TEST_ASSERT_SUCCESS(status);
2250        if (U_FAILURE(status)) {
2251            break;
2252        }
2253    }
2254
2255    delete [] testFile;
2256 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
2257}
2258
2259//--------------------------------------------------------------------------------------------
2260//
2261//   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
2262//                            test data files.  Do only a simple, forward-only check -
2263//                            this test is mostly to check that ICU and the Unicode
2264//                            data agree with each other.
2265//
2266//--------------------------------------------------------------------------------------------
2267void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
2268                         const UnicodeString &testString,   // Text data to be broken
2269                         UVector32 *breakPositions,         // Positions where breaks should be found.
2270                         RuleBasedBreakIterator *bi) {
2271    int32_t pos;                 // Break Position in the test string
2272    int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
2273    int32_t expectedPos;         // Expected break position (index into test string)
2274
2275    bi->setText(testString);
2276    pos = bi->first();
2277    pos = bi->next();
2278
2279    while (pos != BreakIterator::DONE) {
2280        if (expectedI >= breakPositions->size()) {
2281            errln("Test file \"%s\", line %d, unexpected break found at position %d",
2282                testFileName, lineNumber, pos);
2283            break;
2284        }
2285        expectedPos = breakPositions->elementAti(expectedI);
2286        if (pos < expectedPos) {
2287            errln("Test file \"%s\", line %d, unexpected break found at position %d",
2288                testFileName, lineNumber, pos);
2289            break;
2290        }
2291        if (pos > expectedPos) {
2292            errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2293                testFileName, lineNumber, expectedPos);
2294            break;
2295        }
2296        pos = bi->next();
2297        expectedI++;
2298    }
2299
2300    if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
2301        errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2302            testFileName, lineNumber, breakPositions->elementAti(expectedI));
2303    }
2304}
2305
2306
2307
2308#if !UCONFIG_NO_REGULAR_EXPRESSIONS
2309//---------------------------------------------------------------------------------------
2310//
2311//   classs RBBIMonkeyKind
2312//
2313//      Monkey Test for Break Iteration
2314//      Abstract interface class.   Concrete derived classes independently
2315//      implement the break rules for different iterator types.
2316//
2317//      The Monkey Test itself uses doesn't know which type of break iterator it is
2318//      testing, but works purely in terms of the interface defined here.
2319//
2320//---------------------------------------------------------------------------------------
2321class RBBIMonkeyKind {
2322public:
2323    // Return a UVector of UnicodeSets, representing the character classes used
2324    //   for this type of iterator.
2325    virtual  UVector  *charClasses() = 0;
2326
2327    // Set the test text on which subsequent calls to next() will operate
2328    virtual  void      setText(const UnicodeString &s) = 0;
2329
2330    // Find the next break postion, starting from the prev break position, or from zero.
2331    // Return -1 after reaching end of string.
2332    virtual  int32_t   next(int32_t i) = 0;
2333
2334    virtual ~RBBIMonkeyKind();
2335    UErrorCode       deferredStatus;
2336
2337
2338protected:
2339    RBBIMonkeyKind();
2340
2341private:
2342};
2343
2344RBBIMonkeyKind::RBBIMonkeyKind() {
2345    deferredStatus = U_ZERO_ERROR;
2346}
2347
2348RBBIMonkeyKind::~RBBIMonkeyKind() {
2349}
2350
2351
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to the standard library rand() and srand().
//                    The library functions are not used, in order to
//                      1.  Get the same results on all platforms.
//                      2.  Have access to the current seed, making failures
//                          easier to reproduce.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Advance the generator by one step and return a value in the range 0..32767,
//   taken from bits 16..30 of the updated seed.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    // Unsigned arithmetic; wraps modulo 2^32.
    m_seed = m_seed * kMultiplier + kIncrement;
    return (m_seed >> 16) & 0x7FFF;
}
2367
2368
2369//------------------------------------------------------------------------------------------
2370//
2371//   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
2372//                             of RBBIMonkeyKind.
2373//
2374//------------------------------------------------------------------------------------------
2375class RBBICharMonkey: public RBBIMonkeyKind {
2376public:
2377    RBBICharMonkey();
2378    virtual          ~RBBICharMonkey();
2379    virtual  UVector *charClasses();
2380    virtual  void     setText(const UnicodeString &s);
2381    virtual  int32_t  next(int32_t i);
2382private:
2383    UVector   *fSets;
2384
2385    UnicodeSet  *fCRLFSet;
2386    UnicodeSet  *fControlSet;
2387    UnicodeSet  *fExtendSet;
2388    UnicodeSet  *fPrependSet;
2389    UnicodeSet  *fSpacingSet;
2390    UnicodeSet  *fLSet;
2391    UnicodeSet  *fVSet;
2392    UnicodeSet  *fTSet;
2393    UnicodeSet  *fLVSet;
2394    UnicodeSet  *fLVTSet;
2395    UnicodeSet  *fHangulSet;
2396    UnicodeSet  *fAnySet;
2397
2398    const UnicodeString *fText;
2399};
2400
2401
2402RBBICharMonkey::RBBICharMonkey() {
2403    UErrorCode  status = U_ZERO_ERROR;
2404
2405    fText = NULL;
2406
2407    fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2408    fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
2409    fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
2410    fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2411    fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2412    fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2413    fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2414    fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2415    fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2416    fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2417    fHangulSet  = new UnicodeSet();
2418    fHangulSet->addAll(*fLSet);
2419    fHangulSet->addAll(*fVSet);
2420    fHangulSet->addAll(*fTSet);
2421    fHangulSet->addAll(*fLVSet);
2422    fHangulSet->addAll(*fLVTSet);
2423    fAnySet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);
2424
2425    fSets       = new UVector(status);
2426    fSets->addElement(fCRLFSet,    status);
2427    fSets->addElement(fControlSet, status);
2428    fSets->addElement(fExtendSet,  status);
2429    fSets->addElement(fPrependSet, status);
2430    fSets->addElement(fSpacingSet, status);
2431    fSets->addElement(fHangulSet,  status);
2432    fSets->addElement(fAnySet,     status);
2433    if (U_FAILURE(status)) {
2434        deferredStatus = status;
2435    }
2436}
2437
2438
2439void RBBICharMonkey::setText(const UnicodeString &s) {
2440    fText = &s;
2441}
2442
2443
2444
2445int32_t RBBICharMonkey::next(int32_t prevPos) {
2446    int    p0, p1, p2, p3;    // Indices of the significant code points around the
2447                              //   break position being tested.  The candidate break
2448                              //   location is before p2.
2449
2450    int     breakPos = -1;
2451
2452    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2453
2454    if (U_FAILURE(deferredStatus)) {
2455        return -1;
2456    }
2457
2458    // Previous break at end of string.  return DONE.
2459    if (prevPos >= fText->length()) {
2460        return -1;
2461    }
2462    p0 = p1 = p2 = p3 = prevPos;
2463    c3 =  fText->char32At(prevPos);
2464    c0 = c1 = c2 = 0;
2465
2466    // Loop runs once per "significant" character position in the input text.
2467    for (;;) {
2468        // Move all of the positions forward in the input string.
2469        p0 = p1;  c0 = c1;
2470        p1 = p2;  c1 = c2;
2471        p2 = p3;  c2 = c3;
2472
2473        // Advancd p3 by one codepoint
2474        p3 = fText->moveIndex32(p3, 1);
2475        c3 = fText->char32At(p3);
2476
2477        if (p1 == p2) {
2478            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2479            continue;
2480        }
2481        if (p2 == fText->length()) {
2482            // Reached end of string.  Always a break position.
2483            break;
2484        }
2485
2486        // Rule  GB3   CR x LF
2487        //     No Extend or Format characters may appear between the CR and LF,
2488        //     which requires the additional check for p2 immediately following p1.
2489        //
2490        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2491            continue;
2492        }
2493
2494        // Rule (GB4).   ( Control | CR | LF ) <break>
2495        if (fControlSet->contains(c1) ||
2496            c1 == 0x0D ||
2497            c1 == 0x0A)  {
2498            break;
2499        }
2500
2501        // Rule (GB5)    <break>  ( Control | CR | LF )
2502        //
2503        if (fControlSet->contains(c2) ||
2504            c2 == 0x0D ||
2505            c2 == 0x0A)  {
2506            break;
2507        }
2508
2509
2510        // Rule (GB6)  L x ( L | V | LV | LVT )
2511        if (fLSet->contains(c1) &&
2512               (fLSet->contains(c2)  ||
2513                fVSet->contains(c2)  ||
2514                fLVSet->contains(c2) ||
2515                fLVTSet->contains(c2))) {
2516            continue;
2517        }
2518
2519        // Rule (GB7)    ( LV | V )  x  ( V | T )
2520        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2521            (fVSet->contains(c2) || fTSet->contains(c2)))  {
2522            continue;
2523        }
2524
2525        // Rule (GB8)    ( LVT | T)  x T
2526        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2527            fTSet->contains(c2))  {
2528            continue;
2529        }
2530
2531        // Rule (GB9)    Numeric x ALetter
2532        if (fExtendSet->contains(c2))  {
2533            continue;
2534        }
2535
2536        // Rule (GB9a)   x  SpacingMark
2537        if (fSpacingSet->contains(c2)) {
2538            continue;
2539        }
2540
2541        // Rule (GB9b)   Prepend x
2542        if (fPrependSet->contains(c1)) {
2543            continue;
2544        }
2545
2546        // Rule (GB10)  Any  <break>  Any
2547        break;
2548    }
2549
2550    breakPos = p2;
2551    return breakPos;
2552}
2553
2554
2555
2556UVector  *RBBICharMonkey::charClasses() {
2557    return fSets;
2558}
2559
2560
2561RBBICharMonkey::~RBBICharMonkey() {
2562    delete fSets;
2563    delete fCRLFSet;
2564    delete fControlSet;
2565    delete fExtendSet;
2566    delete fPrependSet;
2567    delete fSpacingSet;
2568    delete fLSet;
2569    delete fVSet;
2570    delete fTSet;
2571    delete fLVSet;
2572    delete fLVTSet;
2573    delete fHangulSet;
2574    delete fAnySet;
2575}
2576
2577//------------------------------------------------------------------------------------------
2578//
2579//   class RBBIWordMonkey      Word Break specific implementation
2580//                             of RBBIMonkeyKind.
2581//
2582//------------------------------------------------------------------------------------------
2583class RBBIWordMonkey: public RBBIMonkeyKind {
2584public:
2585    RBBIWordMonkey();
2586    virtual          ~RBBIWordMonkey();
2587    virtual  UVector *charClasses();
2588    virtual  void     setText(const UnicodeString &s);
2589    virtual int32_t   next(int32_t i);
2590private:
2591    UVector      *fSets;
2592
2593    UnicodeSet  *fCRSet;
2594    UnicodeSet  *fLFSet;
2595    UnicodeSet  *fNewlineSet;
2596    UnicodeSet  *fKatakanaSet;
2597    UnicodeSet  *fALetterSet;
2598    UnicodeSet  *fMidNumLetSet;
2599    UnicodeSet  *fMidLetterSet;
2600    UnicodeSet  *fMidNumSet;
2601    UnicodeSet  *fNumericSet;
2602    UnicodeSet  *fFormatSet;
2603    UnicodeSet  *fOtherSet;
2604    UnicodeSet  *fExtendSet;
2605    UnicodeSet  *fExtendNumLetSet;
2606
2607    RegexMatcher  *fMatcher;
2608
2609    const UnicodeString  *fText;
2610};
2611
2612
2613RBBIWordMonkey::RBBIWordMonkey()
2614{
2615    UErrorCode  status = U_ZERO_ERROR;
2616
2617    fSets            = new UVector(status);
2618
2619    fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2620    fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2621    fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2622    fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"),      status);
2623    fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2624    fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2625    fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2626    fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2627    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2628    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2629    fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2630    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2631
2632    fOtherSet        = new UnicodeSet();
2633    if(U_FAILURE(status)) {
2634      deferredStatus = status;
2635      return;
2636    }
2637
2638    fOtherSet->complement();
2639    fOtherSet->removeAll(*fCRSet);
2640    fOtherSet->removeAll(*fLFSet);
2641    fOtherSet->removeAll(*fNewlineSet);
2642    fOtherSet->removeAll(*fKatakanaSet);
2643    fOtherSet->removeAll(*fALetterSet);
2644    fOtherSet->removeAll(*fMidLetterSet);
2645    fOtherSet->removeAll(*fMidNumSet);
2646    fOtherSet->removeAll(*fNumericSet);
2647    fOtherSet->removeAll(*fExtendNumLetSet);
2648    fOtherSet->removeAll(*fFormatSet);
2649    fOtherSet->removeAll(*fExtendSet);
2650    // Inhibit dictionary characters from being tested at all.
2651    fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2652
2653    fSets->addElement(fCRSet,        status);
2654    fSets->addElement(fLFSet,        status);
2655    fSets->addElement(fNewlineSet,   status);
2656    fSets->addElement(fALetterSet,   status);
2657    fSets->addElement(fKatakanaSet,  status);
2658    fSets->addElement(fMidLetterSet, status);
2659    fSets->addElement(fMidNumLetSet, status);
2660    fSets->addElement(fMidNumSet,    status);
2661    fSets->addElement(fNumericSet,   status);
2662    fSets->addElement(fFormatSet,    status);
2663    fSets->addElement(fExtendSet,    status);
2664    fSets->addElement(fOtherSet,     status);
2665    fSets->addElement(fExtendNumLetSet, status);
2666
2667    if (U_FAILURE(status)) {
2668        deferredStatus = status;
2669    }
2670}
2671
2672void RBBIWordMonkey::setText(const UnicodeString &s) {
2673    fText       = &s;
2674}
2675
2676
2677int32_t RBBIWordMonkey::next(int32_t prevPos) {
2678    int    p0, p1, p2, p3;    // Indices of the significant code points around the
2679                              //   break position being tested.  The candidate break
2680                              //   location is before p2.
2681
2682    int     breakPos = -1;
2683
2684    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2685
2686    if (U_FAILURE(deferredStatus)) {
2687        return -1;
2688    }
2689
2690    // Prev break at end of string.  return DONE.
2691    if (prevPos >= fText->length()) {
2692        return -1;
2693    }
2694    p0 = p1 = p2 = p3 = prevPos;
2695    c3 =  fText->char32At(prevPos);
2696    c0 = c1 = c2 = 0;
2697
2698    // Loop runs once per "significant" character position in the input text.
2699    for (;;) {
2700        // Move all of the positions forward in the input string.
2701        p0 = p1;  c0 = c1;
2702        p1 = p2;  c1 = c2;
2703        p2 = p3;  c2 = c3;
2704
2705        // Advancd p3 by    X(Extend | Format)*   Rule 4
2706        //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2707        do {
2708            p3 = fText->moveIndex32(p3, 1);
2709            c3 = fText->char32At(p3);
2710            if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2711               break;
2712            };
2713        }
2714        while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2715
2716
2717        if (p1 == p2) {
2718            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2719            continue;
2720        }
2721        if (p2 == fText->length()) {
2722            // Reached end of string.  Always a break position.
2723            break;
2724        }
2725
2726        // Rule  (3)   CR x LF
2727        //     No Extend or Format characters may appear between the CR and LF,
2728        //     which requires the additional check for p2 immediately following p1.
2729        //
2730        if (c1==0x0D && c2==0x0A) {
2731            continue;
2732        }
2733
2734        // Rule (3a)  Break before and after newlines (including CR and LF)
2735        //
2736        if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2737            break;
2738        };
2739        if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2740            break;
2741        };
2742
2743        // Rule (5).   ALetter x ALetter
2744        if (fALetterSet->contains(c1) &&
2745            fALetterSet->contains(c2))  {
2746            continue;
2747        }
2748
2749        // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
2750        //
2751        if ( fALetterSet->contains(c1)   &&
2752             (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
2753             fALetterSet->contains(c3)) {
2754            continue;
2755        }
2756
2757
2758        // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
2759        if (fALetterSet->contains(c0) &&
2760            (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
2761            fALetterSet->contains(c2)) {
2762            continue;
2763        }
2764
2765        // Rule (8)    Numeric x Numeric
2766        if (fNumericSet->contains(c1) &&
2767            fNumericSet->contains(c2))  {
2768            continue;
2769        }
2770
2771        // Rule (9)    ALetter x Numeric
2772        if (fALetterSet->contains(c1) &&
2773            fNumericSet->contains(c2))  {
2774            continue;
2775        }
2776
2777        // Rule (10)    Numeric x ALetter
2778        if (fNumericSet->contains(c1) &&
2779            fALetterSet->contains(c2))  {
2780            continue;
2781        }
2782
2783        // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
2784        if (fNumericSet->contains(c0) &&
2785            (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
2786            fNumericSet->contains(c2)) {
2787            continue;
2788        }
2789
2790        // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
2791        if (fNumericSet->contains(c1) &&
2792            (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
2793            fNumericSet->contains(c3)) {
2794            continue;
2795        }
2796
2797        // Rule (13)  Katakana x Katakana
2798        if (fKatakanaSet->contains(c1) &&
2799            fKatakanaSet->contains(c2))  {
2800            continue;
2801        }
2802
2803        // Rule 13a
2804        if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2805             fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2806             fExtendNumLetSet->contains(c2)) {
2807                continue;
2808             }
2809
2810        // Rule 13b
2811        if (fExtendNumLetSet->contains(c1) &&
2812                (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2813                fKatakanaSet->contains(c2)))  {
2814                continue;
2815             }
2816
2817        // Rule 14.  Break found here.
2818        break;
2819    }
2820
2821    breakPos = p2;
2822    return breakPos;
2823}
2824
2825
2826UVector  *RBBIWordMonkey::charClasses() {
2827    return fSets;
2828}
2829
2830
2831RBBIWordMonkey::~RBBIWordMonkey() {
2832    delete fSets;
2833    delete fCRSet;
2834    delete fLFSet;
2835    delete fNewlineSet;
2836    delete fKatakanaSet;
2837    delete fALetterSet;
2838    delete fMidNumLetSet;
2839    delete fMidLetterSet;
2840    delete fMidNumSet;
2841    delete fNumericSet;
2842    delete fFormatSet;
2843    delete fExtendSet;
2844    delete fExtendNumLetSet;
2845    delete fOtherSet;
2846}
2847
2848
2849
2850
2851//------------------------------------------------------------------------------------------
2852//
2853//   class RBBISentMonkey      Sentence Break specific implementation
2854//                             of RBBIMonkeyKind.
2855//
2856//------------------------------------------------------------------------------------------
2857class RBBISentMonkey: public RBBIMonkeyKind {
2858public:
2859    RBBISentMonkey();
2860    virtual          ~RBBISentMonkey();
2861    virtual  UVector *charClasses();
2862    virtual  void     setText(const UnicodeString &s);
2863    virtual int32_t   next(int32_t i);
2864private:
2865    int               moveBack(int posFrom);
2866    int               moveForward(int posFrom);
2867    UChar32           cAt(int pos);
2868
2869    UVector      *fSets;
2870
2871    UnicodeSet  *fSepSet;
2872    UnicodeSet  *fFormatSet;
2873    UnicodeSet  *fSpSet;
2874    UnicodeSet  *fLowerSet;
2875    UnicodeSet  *fUpperSet;
2876    UnicodeSet  *fOLetterSet;
2877    UnicodeSet  *fNumericSet;
2878    UnicodeSet  *fATermSet;
2879    UnicodeSet  *fSContinueSet;
2880    UnicodeSet  *fSTermSet;
2881    UnicodeSet  *fCloseSet;
2882    UnicodeSet  *fOtherSet;
2883    UnicodeSet  *fExtendSet;
2884
2885    const UnicodeString  *fText;
2886
2887};
2888
2889RBBISentMonkey::RBBISentMonkey()
2890{
2891    UErrorCode  status = U_ZERO_ERROR;
2892
2893    fSets            = new UVector(status);
2894
2895    //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2896    //                       set and made into character classes of their own.  For the monkey impl,
2897    //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2898    fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2899    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2900    fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2901    fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2902    fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2903    fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2904    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2905    fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2906    fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2907    fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2908    fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2909    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2910    fOtherSet        = new UnicodeSet();
2911
2912    if(U_FAILURE(status)) {
2913      deferredStatus = status;
2914      return;
2915    }
2916
2917    fOtherSet->complement();
2918    fOtherSet->removeAll(*fSepSet);
2919    fOtherSet->removeAll(*fFormatSet);
2920    fOtherSet->removeAll(*fSpSet);
2921    fOtherSet->removeAll(*fLowerSet);
2922    fOtherSet->removeAll(*fUpperSet);
2923    fOtherSet->removeAll(*fOLetterSet);
2924    fOtherSet->removeAll(*fNumericSet);
2925    fOtherSet->removeAll(*fATermSet);
2926    fOtherSet->removeAll(*fSContinueSet);
2927    fOtherSet->removeAll(*fSTermSet);
2928    fOtherSet->removeAll(*fCloseSet);
2929    fOtherSet->removeAll(*fExtendSet);
2930
2931    fSets->addElement(fSepSet,       status);
2932    fSets->addElement(fFormatSet,    status);
2933    fSets->addElement(fSpSet,        status);
2934    fSets->addElement(fLowerSet,     status);
2935    fSets->addElement(fUpperSet,     status);
2936    fSets->addElement(fOLetterSet,   status);
2937    fSets->addElement(fNumericSet,   status);
2938    fSets->addElement(fATermSet,     status);
2939    fSets->addElement(fSContinueSet, status);
2940    fSets->addElement(fSTermSet,     status);
2941    fSets->addElement(fCloseSet,     status);
2942    fSets->addElement(fOtherSet,     status);
2943    fSets->addElement(fExtendSet,    status);
2944
2945    if (U_FAILURE(status)) {
2946        deferredStatus = status;
2947    }
2948}
2949
2950
2951
2952void RBBISentMonkey::setText(const UnicodeString &s) {
2953    fText       = &s;
2954}
2955
2956UVector  *RBBISentMonkey::charClasses() {
2957    return fSets;
2958}
2959
2960
2961//  moveBack()   Find the "significant" code point preceding the index i.
2962//               Skips over ($Extend | $Format)* .
2963//
2964int RBBISentMonkey::moveBack(int i) {
2965    if (i <= 0) {
2966        return -1;
2967    }
2968    UChar32   c;
2969    int32_t   j = i;
2970    do {
2971        j = fText->moveIndex32(j, -1);
2972        c = fText->char32At(j);
2973    }
2974    while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2975    return j;
2976
2977 }
2978
2979
2980int RBBISentMonkey::moveForward(int i) {
2981    if (i>=fText->length()) {
2982        return fText->length();
2983    }
2984    UChar32   c;
2985    int32_t   j = i;
2986    do {
2987        j = fText->moveIndex32(j, 1);
2988        c = cAt(j);
2989    }
2990    while (fFormatSet->contains(c) || fExtendSet->contains(c));
2991    return j;
2992}
2993
2994UChar32 RBBISentMonkey::cAt(int pos) {
2995    if (pos<0 || pos>=fText->length()) {
2996        return -1;
2997    } else {
2998        return fText->char32At(pos);
2999    }
3000}
3001
3002int32_t RBBISentMonkey::next(int32_t prevPos) {
3003    int    p0, p1, p2, p3;    // Indices of the significant code points around the
3004                              //   break position being tested.  The candidate break
3005                              //   location is before p2.
3006
3007    int     breakPos = -1;
3008
3009    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
3010    UChar32 c;
3011
3012    if (U_FAILURE(deferredStatus)) {
3013        return -1;
3014    }
3015
3016    // Prev break at end of string.  return DONE.
3017    if (prevPos >= fText->length()) {
3018        return -1;
3019    }
3020    p0 = p1 = p2 = p3 = prevPos;
3021    c3 =  fText->char32At(prevPos);
3022    c0 = c1 = c2 = 0;
3023
3024    // Loop runs once per "significant" character position in the input text.
3025    for (;;) {
3026        // Move all of the positions forward in the input string.
3027        p0 = p1;  c0 = c1;
3028        p1 = p2;  c1 = c2;
3029        p2 = p3;  c2 = c3;
3030
3031        // Advancd p3 by    X(Extend | Format)*   Rule 4
3032        p3 = moveForward(p3);
3033        c3 = cAt(p3);
3034
3035        // Rule (3)  CR x LF
3036        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
3037            continue;
3038        }
3039
3040        // Rule (4).   Sep  <break>
3041        if (fSepSet->contains(c1)) {
3042            p2 = p1+1;   // Separators don't combine with Extend or Format.
3043            break;
3044        }
3045
3046        if (p2 >= fText->length()) {
3047            // Reached end of string.  Always a break position.
3048            break;
3049        }
3050
3051        if (p2 == prevPos) {
3052            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
3053            continue;
3054        }
3055
3056        // Rule (6).   ATerm x Numeric
3057        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
3058            continue;
3059        }
3060
3061        // Rule (7).  Upper ATerm  x  Uppper
3062        if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
3063            continue;
3064        }
3065
3066        // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
3067        //           Note:  STerm | ATerm are added to the negated part of the expression by a
3068        //                  note to the Unicode 5.0 documents.
3069        int p8 = p1;
3070        while (fSpSet->contains(cAt(p8))) {
3071            p8 = moveBack(p8);
3072        }
3073        while (fCloseSet->contains(cAt(p8))) {
3074            p8 = moveBack(p8);
3075        }
3076        if (fATermSet->contains(cAt(p8))) {
3077            p8=p2;
3078            for (;;) {
3079                c = cAt(p8);
3080                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
3081                    fLowerSet->contains(c) || fSepSet->contains(c) ||
3082                    fATermSet->contains(c) || fSTermSet->contains(c))  {
3083                    break;
3084                }
3085                p8 = moveForward(p8);
3086            }
3087            if (fLowerSet->contains(cAt(p8))) {
3088                continue;
3089            }
3090        }
3091
3092        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
3093        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
3094            p8 = p1;
3095            while (fSpSet->contains(cAt(p8))) {
3096                p8 = moveBack(p8);
3097            }
3098            while (fCloseSet->contains(cAt(p8))) {
3099                p8 = moveBack(p8);
3100            }
3101            c = cAt(p8);
3102            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
3103                continue;
3104            }
3105        }
3106
3107        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
3108        int p9 = p1;
3109        while (fCloseSet->contains(cAt(p9))) {
3110            p9 = moveBack(p9);
3111        }
3112        c = cAt(p9);
3113        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
3114            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
3115                continue;
3116            }
3117        }
3118
3119        // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
3120        int p10 = p1;
3121        while (fSpSet->contains(cAt(p10))) {
3122            p10 = moveBack(p10);
3123        }
3124        while (fCloseSet->contains(cAt(p10))) {
3125            p10 = moveBack(p10);
3126        }
3127        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
3128            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
3129                continue;
3130            }
3131        }
3132
3133        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
3134        int p11 = p1;
3135        if (fSepSet->contains(cAt(p11))) {
3136            p11 = moveBack(p11);
3137        }
3138        while (fSpSet->contains(cAt(p11))) {
3139            p11 = moveBack(p11);
3140        }
3141        while (fCloseSet->contains(cAt(p11))) {
3142            p11 = moveBack(p11);
3143        }
3144        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
3145            break;
3146        }
3147
3148        //  Rule (12)  Any x Any
3149        continue;
3150    }
3151    breakPos = p2;
3152    return breakPos;
3153}
3154
3155RBBISentMonkey::~RBBISentMonkey() {
3156    delete fSets;
3157    delete fSepSet;
3158    delete fFormatSet;
3159    delete fSpSet;
3160    delete fLowerSet;
3161    delete fUpperSet;
3162    delete fOLetterSet;
3163    delete fNumericSet;
3164    delete fATermSet;
3165    delete fSContinueSet;
3166    delete fSTermSet;
3167    delete fCloseSet;
3168    delete fOtherSet;
3169    delete fExtendSet;
3170}
3171
3172
3173
3174//-------------------------------------------------------------------------------------------
3175//
3176//  RBBILineMonkey
3177//
3178//-------------------------------------------------------------------------------------------
3179
3180class RBBILineMonkey: public RBBIMonkeyKind {
3181public:
3182    RBBILineMonkey();
3183    virtual          ~RBBILineMonkey();
3184    virtual  UVector *charClasses();
3185    virtual  void     setText(const UnicodeString &s);
3186    virtual  int32_t  next(int32_t i);
3187    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
3188private:
3189    UVector      *fSets;
3190
3191    UnicodeSet  *fBK;
3192    UnicodeSet  *fCR;
3193    UnicodeSet  *fLF;
3194    UnicodeSet  *fCM;
3195    UnicodeSet  *fNL;
3196    UnicodeSet  *fSG;
3197    UnicodeSet  *fWJ;
3198    UnicodeSet  *fZW;
3199    UnicodeSet  *fGL;
3200    UnicodeSet  *fCB;
3201    UnicodeSet  *fSP;
3202    UnicodeSet  *fB2;
3203    UnicodeSet  *fBA;
3204    UnicodeSet  *fBB;
3205    UnicodeSet  *fHY;
3206    UnicodeSet  *fH2;
3207    UnicodeSet  *fH3;
3208    UnicodeSet  *fCL;
3209    UnicodeSet  *fEX;
3210    UnicodeSet  *fIN;
3211    UnicodeSet  *fJL;
3212    UnicodeSet  *fJV;
3213    UnicodeSet  *fJT;
3214    UnicodeSet  *fNS;
3215    UnicodeSet  *fOP;
3216    UnicodeSet  *fQU;
3217    UnicodeSet  *fIS;
3218    UnicodeSet  *fNU;
3219    UnicodeSet  *fPO;
3220    UnicodeSet  *fPR;
3221    UnicodeSet  *fSY;
3222    UnicodeSet  *fAI;
3223    UnicodeSet  *fAL;
3224    UnicodeSet  *fID;
3225    UnicodeSet  *fSA;
3226    UnicodeSet  *fXX;
3227
3228    BreakIterator  *fCharBI;
3229
3230    const UnicodeString  *fText;
3231    int32_t              *fOrigPositions;
3232
3233    RegexMatcher         *fNumberMatcher;
3234    RegexMatcher         *fLB11Matcher;
3235};
3236
3237
3238RBBILineMonkey::RBBILineMonkey()
3239{
3240    UErrorCode  status = U_ZERO_ERROR;
3241
3242    fSets  = new UVector(status);
3243
3244    fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3245    fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3246    fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3247    fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3248    fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3249    fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3250    fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3251    fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3252    fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3253    fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3254    fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3255    fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3256    fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3257    fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3258    fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3259    fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3260    fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3261    fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3262    fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3263    fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3264    fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3265    fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3266    fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3267    fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3268    fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3269    fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3270    fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3271    fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3272    fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3273    fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3274    fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3275    fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3276    fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3277    fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
3278    fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3279    fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3280
3281    if (U_FAILURE(status)) {
3282        deferredStatus = status;
3283        fCharBI = NULL;
3284        fNumberMatcher = NULL;
3285        return;
3286    }
3287
3288    fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
3289    fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
3290    fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
3291    fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
3292
3293    fSets->addElement(fBK, status);
3294    fSets->addElement(fCR, status);
3295    fSets->addElement(fLF, status);
3296    fSets->addElement(fCM, status);
3297    fSets->addElement(fNL, status);
3298    fSets->addElement(fWJ, status);
3299    fSets->addElement(fZW, status);
3300    fSets->addElement(fGL, status);
3301    fSets->addElement(fCB, status);
3302    fSets->addElement(fSP, status);
3303    fSets->addElement(fB2, status);
3304    fSets->addElement(fBA, status);
3305    fSets->addElement(fBB, status);
3306    fSets->addElement(fHY, status);
3307    fSets->addElement(fH2, status);
3308    fSets->addElement(fH3, status);
3309    fSets->addElement(fCL, status);
3310    fSets->addElement(fEX, status);
3311    fSets->addElement(fIN, status);
3312    fSets->addElement(fJL, status);
3313    fSets->addElement(fJT, status);
3314    fSets->addElement(fJV, status);
3315    fSets->addElement(fNS, status);
3316    fSets->addElement(fOP, status);
3317    fSets->addElement(fQU, status);
3318    fSets->addElement(fIS, status);
3319    fSets->addElement(fNU, status);
3320    fSets->addElement(fPO, status);
3321    fSets->addElement(fPR, status);
3322    fSets->addElement(fSY, status);
3323    fSets->addElement(fAI, status);
3324    fSets->addElement(fAL, status);
3325    fSets->addElement(fID, status);
3326    fSets->addElement(fWJ, status);
3327    fSets->addElement(fSA, status);
3328    fSets->addElement(fSG, status);
3329
3330    const char *rules =
3331            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3332            "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3333            "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3334            "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3335            "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
3336            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3337
3338    fNumberMatcher = new RegexMatcher(
3339        UnicodeString(rules, -1, US_INV), 0, status);
3340
3341    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3342
3343    if (U_FAILURE(status)) {
3344        deferredStatus = status;
3345    }
3346}
3347
3348
3349void RBBILineMonkey::setText(const UnicodeString &s) {
3350    fText       = &s;
3351    fCharBI->setText(s);
3352    fNumberMatcher->reset(s);
3353}
3354
3355//
3356//  rule9Adjust
3357//     Line Break TR rules 9 and 10 implementation.
//     This deals with combining marks and other sequences
//     that must be treated as if they were something other than what they actually are.
3360//
3361//     This is factored out into a separate function because it must be applied twice for
3362//     each potential break, once to the chars before the position being checked, then
3363//     again to the text following the possible break.
3364//
3365void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3366    if (pos == -1) {
3367        // Invalid initial position.  Happens during the warmup iteration of the
3368        //   main loop in next().
3369        return;
3370    }
3371
3372    int32_t  nPos = *nextPos;
3373
3374    // LB 9  Keep combining sequences together.
3375    //  advance over any CM class chars.  Note that Line Break CM is different
3376    //  from the normal Grapheme Extend property.
3377    if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3378          *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3379        for (;;) {
3380            *nextChar = fText->char32At(nPos);
3381            if (!fCM->contains(*nextChar)) {
3382                break;
3383            }
3384            nPos = fText->moveIndex32(nPos, 1);
3385        }
3386    }
3387
3388
3389    // LB 9 Treat X CM* as if it were x.
3390    //       No explicit action required.
3391
3392    // LB 10  Treat any remaining combining mark as AL
3393    if (fCM->contains(*posChar)) {
3394        *posChar = 0x41;   // thisChar = 'A';
3395    }
3396
3397    // Push the updated nextPos and nextChar back to our caller.
3398    // This only makes a difference if posChar got bigger by consuming a
3399    // combining sequence.
3400    *nextPos  = nPos;
3401    *nextChar = fText->char32At(nPos);
3402}
3403
3404
3405
3406int32_t RBBILineMonkey::next(int32_t startPos) {
3407    UErrorCode status = U_ZERO_ERROR;
3408    int32_t    pos;       //  Index of the char following a potential break position
3409    UChar32    thisChar;  //  Character at above position "pos"
3410
3411    int32_t    prevPos;   //  Index of the char preceding a potential break position
3412    UChar32    prevChar;  //  Character at above position.  Note that prevChar
3413                          //   and thisChar may not be adjacent because combining
3414                          //   characters between them will be ignored.
3415
3416    int32_t    nextPos;   //  Index of the next character following pos.
3417                          //     Usually skips over combining marks.
3418    int32_t    nextCPPos; //  Index of the code point following "pos."
3419                          //     May point to a combining mark.
3420    int32_t    tPos;      //  temp value.
3421    UChar32    c;
3422
3423    if (U_FAILURE(deferredStatus)) {
3424        return -1;
3425    }
3426
3427    if (startPos >= fText->length()) {
3428        return -1;
3429    }
3430
3431
3432    // Initial values for loop.  Loop will run the first time without finding breaks,
3433    //                           while the invalid values shift out and the "this" and
3434    //                           "prev" positions are filled in with good values.
3435    pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
3436    thisChar = prevChar  = 0;
3437    nextPos  = nextCPPos = startPos;
3438
3439
3440    // Loop runs once per position in the test text, until a break position
3441    //  is found.
3442    for (;;) {
3443        prevPos   = pos;
3444        prevChar  = thisChar;
3445
3446        pos       = nextPos;
3447        thisChar  = fText->char32At(pos);
3448
3449        nextCPPos = fText->moveIndex32(pos, 1);
3450        nextPos   = nextCPPos;
3451
3452        // Rule LB2 - Break at end of text.
3453        if (pos >= fText->length()) {
3454            break;
3455        }
3456
3457        // Rule LB 9 - adjust for combining sequences.
3458        //             We do this one out-of-order because the adjustment does not change anything
3459        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3460        //             be applied.
3461        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3462        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3463        c = fText->char32At(nextPos);
3464        rule9Adjust(pos,     &thisChar, &nextPos, &c);
3465
3466        // If the loop is still warming up - if we haven't shifted the initial
3467        //   -1 positions out of prevPos yet - loop back to advance the
3468        //    position in the input without any further looking for breaks.
3469        if (prevPos == -1) {
3470            continue;
3471        }
3472
3473        // LB 4  Always break after hard line breaks,
3474        if (fBK->contains(prevChar)) {
3475            break;
3476        }
3477
3478        // LB 5  Break after CR, LF, NL, but not inside CR LF
3479        if (prevChar == 0x0d && thisChar == 0x0a) {
3480            continue;
3481        }
3482        if (prevChar == 0x0d ||
3483            prevChar == 0x0a ||
3484            prevChar == 0x85)  {
3485            break;
3486        }
3487
3488        // LB 6  Don't break before hard line breaks
3489        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3490            fBK->contains(thisChar)) {
3491                continue;
3492        }
3493
3494
3495        // LB 7  Don't break before spaces or zero-width space.
3496        if (fSP->contains(thisChar)) {
3497            continue;
3498        }
3499
3500        if (fZW->contains(thisChar)) {
3501            continue;
3502        }
3503
3504        // LB 8  Break after zero width space
3505        if (fZW->contains(prevChar)) {
3506            break;
3507        }
3508
3509        // LB 9, 10  Already done, at top of loop.
3510        //
3511
3512
3513        // LB 11  Do not break before or after WORD JOINER and related characters.
3514        //    x  WJ
3515        //    WJ  x
3516        //
3517        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3518            continue;
3519        }
3520
3521        // LB 12
3522        //    GL  x
3523        if (fGL->contains(prevChar)) {
3524            continue;
3525        }
3526
3527        // LB 12a
3528        //    [^SP BA HY] x GL
3529        if (!(fSP->contains(prevChar) ||
3530              fBA->contains(prevChar) ||
3531              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3532            continue;
3533        }
3534
3535
3536
3537        // LB 13  Don't break before closings.
3538        //        NU x CL  and NU x IS are not matched here so that they will
3539        //        fall into LB 17 and the more general number regular expression.
3540        //
3541        if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
3542                                        fEX->contains(thisChar) ||
3543            !fNU->contains(prevChar) && fIS->contains(thisChar) ||
3544            !fNU->contains(prevChar) && fSY->contains(thisChar))    {
3545            continue;
3546        }
3547
3548        // LB 14 Don't break after OP SP*
3549        //       Scan backwards, checking for this sequence.
3550        //       The OP char could include combining marks, so we actually check for
3551        //           OP CM* SP*
3552        //       Another Twist: The Rule 67 fixes may have changed a SP CM
3553        //       sequence into a ID char, so before scanning back through spaces,
3554        //       verify that prevChar is indeed a space.  The prevChar variable
3555        //       may differ from fText[prevPos]
3556        tPos = prevPos;
3557        if (fSP->contains(prevChar)) {
3558            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3559                tPos=fText->moveIndex32(tPos, -1);
3560            }
3561        }
3562        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3563            tPos=fText->moveIndex32(tPos, -1);
3564        }
3565        if (fOP->contains(fText->char32At(tPos))) {
3566            continue;
3567        }
3568
3569
3570        // LB 15    QU SP* x OP
3571        if (fOP->contains(thisChar)) {
3572            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3573            int tPos = prevPos;
3574            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3575                tPos = fText->moveIndex32(tPos, -1);
3576            }
3577            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3578                tPos = fText->moveIndex32(tPos, -1);
3579            }
3580            if (fQU->contains(fText->char32At(tPos))) {
3581                continue;
3582            }
3583        }
3584
3585
3586
3587        // LB 16   CL SP* x NS
3588        //    Scan backwards for SP* CM* CL
3589        if (fNS->contains(thisChar)) {
3590            int tPos = prevPos;
3591            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3592                tPos = fText->moveIndex32(tPos, -1);
3593            }
3594            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3595                tPos = fText->moveIndex32(tPos, -1);
3596            }
3597            if (fCL->contains(fText->char32At(tPos))) {
3598                continue;
3599            }
3600        }
3601
3602
3603        // LB 17        B2 SP* x B2
3604        if (fB2->contains(thisChar)) {
3605            //  Scan backwards, checking for the B2 CM* SP* sequence.
3606            tPos = prevPos;
3607            if (fSP->contains(prevChar)) {
3608                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3609                    tPos=fText->moveIndex32(tPos, -1);
3610                }
3611            }
3612            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3613                tPos=fText->moveIndex32(tPos, -1);
3614            }
3615            if (fB2->contains(fText->char32At(tPos))) {
3616                continue;
3617            }
3618        }
3619
3620
3621        // LB 18    break after space
3622        if (fSP->contains(prevChar)) {
3623            break;
3624        }
3625
3626        // LB 19
3627        //    x   QU
3628        //    QU  x
3629        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3630            continue;
3631        }
3632
3633        // LB 20  Break around a CB
3634        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3635            break;
3636        }
3637
3638        // LB 21
3639        if (fBA->contains(thisChar) ||
3640            fHY->contains(thisChar) ||
3641            fNS->contains(thisChar) ||
3642            fBB->contains(prevChar) )   {
3643            continue;
3644        }
3645
3646        // LB 22
3647        if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
3648            fID->contains(prevChar) && fIN->contains(thisChar) ||
3649            fIN->contains(prevChar) && fIN->contains(thisChar) ||
3650            fNU->contains(prevChar) && fIN->contains(thisChar) )   {
3651            continue;
3652        }
3653
3654
3655        // LB 23    ID x PO
3656        //          AL x NU
3657        //          NU x AL
3658        if (fID->contains(prevChar) && fPO->contains(thisChar) ||
3659            fAL->contains(prevChar) && fNU->contains(thisChar) ||
3660            fNU->contains(prevChar) && fAL->contains(thisChar) )   {
3661            continue;
3662        }
3663
3664        // LB 24  Do not break between prefix and letters or ideographs.
3665        //        PR x ID
3666        //        PR x AL
3667        //        PO x AL
3668        if (fPR->contains(prevChar) && fID->contains(thisChar) ||
3669            fPR->contains(prevChar) && fAL->contains(thisChar) ||
3670            fPO->contains(prevChar) && fAL->contains(thisChar) )   {
3671            continue;
3672        }
3673
3674
3675
3676        // LB 25    Numbers
3677        if (fNumberMatcher->lookingAt(prevPos, status)) {
3678            if (U_FAILURE(status)) {
3679                break;
3680            }
3681            // Matched a number.  But could have been just a single digit, which would
3682            //    not represent a "no break here" between prevChar and thisChar
3683            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3684            if (numEndIdx > pos) {
3685                // Number match includes at least our two chars being checked
3686                if (numEndIdx > nextPos) {
3687                    // Number match includes additional chars.  Update pos and nextPos
3688                    //   so that next loop iteration will continue at the end of the number,
3689                    //   checking for breaks between last char in number & whatever follows.
3690                    pos = nextPos = numEndIdx;
3691                    do {
3692                        pos = fText->moveIndex32(pos, -1);
3693                        thisChar = fText->char32At(pos);
3694                    } while (fCM->contains(thisChar));
3695                }
3696                continue;
3697            }
3698        }
3699
3700
3701        // LB 26 Do not break a Korean syllable.
3702        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3703                                        fJV->contains(thisChar) ||
3704                                        fH2->contains(thisChar) ||
3705                                        fH3->contains(thisChar))) {
3706                                            continue;
3707                                        }
3708
3709        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3710            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3711                continue;
3712        }
3713
3714        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3715            fJT->contains(thisChar)) {
3716                continue;
3717        }
3718
3719        // LB 27 Treat a Korean Syllable Block the same as ID.
3720        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3721            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3722            fIN->contains(thisChar)) {
3723                continue;
3724            }
3725        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3726            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3727            fPO->contains(thisChar)) {
3728                continue;
3729            }
3730        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3731            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3732                continue;
3733            }
3734
3735
3736
3737        // LB 28  Do not break between alphabetics ("at").
3738        if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
3739            continue;
3740        }
3741
3742        // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3743        if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
3744            continue;
3745        }
3746
3747        // LB 31    Break everywhere else
3748        break;
3749
3750    }
3751
3752    return pos;
3753}
3754
3755
3756UVector  *RBBILineMonkey::charClasses() {
3757    return fSets;
3758}
3759
3760
3761RBBILineMonkey::~RBBILineMonkey() {
3762    delete fSets;
3763
3764    delete fBK;
3765    delete fCR;
3766    delete fLF;
3767    delete fCM;
3768    delete fNL;
3769    delete fWJ;
3770    delete fZW;
3771    delete fGL;
3772    delete fCB;
3773    delete fSP;
3774    delete fB2;
3775    delete fBA;
3776    delete fBB;
3777    delete fHY;
3778    delete fH2;
3779    delete fH3;
3780    delete fCL;
3781    delete fEX;
3782    delete fIN;
3783    delete fJL;
3784    delete fJV;
3785    delete fJT;
3786    delete fNS;
3787    delete fOP;
3788    delete fQU;
3789    delete fIS;
3790    delete fNU;
3791    delete fPO;
3792    delete fPR;
3793    delete fSY;
3794    delete fAI;
3795    delete fAL;
3796    delete fID;
3797    delete fSA;
3798    delete fSG;
3799    delete fXX;
3800
3801    delete fCharBI;
3802    delete fNumberMatcher;
3803}
3804
3805
3806//-------------------------------------------------------------------------------------------
3807//
3808//   TestMonkey
3809//
3810//     params
3811//       seed=nnnnn        Random number starting seed.
3812//                         Setting the seed allows errors to be reproduced.
3813//       loop=nnn          Looping count.  Controls running time.
3814//                         -1:  run forever.
3815//                          0 or greater:  run length.
3816//
3817//       type = char | word | line | sent | title
3818//
3819//-------------------------------------------------------------------------------------------
3820
3821static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3822    int32_t val = defaultVal;
3823    name.append(" *= *(-?\\d+)");
3824    UErrorCode status = U_ZERO_ERROR;
3825    RegexMatcher m(name, params, 0, status);
3826    if (m.find()) {
3827        // The param exists.  Convert the string to an int.
3828        char valString[100];
3829        int32_t paramLength = m.end(1, status) - m.start(1, status);
3830        if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3831            paramLength = (int32_t)(sizeof(valString)-2);
3832        }
3833        params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3834        val = strtol(valString,  NULL, 10);
3835
3836        // Delete this parameter from the params string.
3837        m.reset();
3838        params = m.replaceFirst("", status);
3839    }
3840    U_ASSERT(U_SUCCESS(status));
3841    return val;
3842}
3843#endif
3844
3845static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3846                                    BreakIterator *bi,
3847                                    int expected[],
3848                                    int expectedcount)
3849{
3850    int count = 0;
3851    int i = 0;
3852    int forward[50];
3853    bi->setText(ustr);
3854    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3855        forward[count] = i;
3856        if (count < expectedcount && expected[count] != i) {
3857            test->errln("break forward test failed: expected %d but got %d",
3858                        expected[count], i);
3859            break;
3860        }
3861        count ++;
3862    }
3863    if (count != expectedcount) {
3864        printStringBreaks(ustr, expected, expectedcount);
3865        test->errln("break forward test failed: missed %d match",
3866                    expectedcount - count);
3867        return;
3868    }
3869    // testing boundaries
3870    for (i = 1; i < expectedcount; i ++) {
3871        int j = expected[i - 1];
3872        if (!bi->isBoundary(j)) {
3873            printStringBreaks(ustr, expected, expectedcount);
3874            test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3875            return;
3876        }
3877        for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3878            if (bi->isBoundary(j)) {
3879                printStringBreaks(ustr, expected, expectedcount);
3880                test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3881                return;
3882            }
3883        }
3884    }
3885
3886    for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3887        count --;
3888        if (forward[count] != i) {
3889            test->errln("happy break test previous() failed: expected %d but got %d",
3890                        forward[count], i);
3891            break;
3892        }
3893    }
3894    if (count != 0) {
3895        printStringBreaks(ustr, expected, expectedcount);
3896        test->errln("break test previous() failed: missed a match");
3897        return;
3898    }
3899
3900    // testing preceding
3901    for (i = 0; i < expectedcount - 1; i ++) {
3902        // int j = expected[i] + 1;
3903        int j = ustr.moveIndex32(expected[i], 1);
3904        for (; j <= expected[i + 1]; j ++) {
3905            if (bi->preceding(j) != expected[i]) {
3906                printStringBreaks(ustr, expected, expectedcount);
3907                test->errln("preceding(): Not expecting boundary at position %d", j);
3908                return;
3909            }
3910        }
3911    }
3912}
3913
3914void RBBITest::TestWordBreaks(void)
3915{
3916#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3917
3918    Locale        locale("en");
3919    UErrorCode    status = U_ZERO_ERROR;
3920    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3921    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3922    static const char *strlist[] =
3923    {
3924    "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3925    "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
3926    "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3927    "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3928    "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3929    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3930    "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3931    "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3932    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3933    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3934    "\\u2027\\U000e0067\\u0a47\\u00b7",
3935    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3936    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3937    "\\u0589\\U000e006e\\u0a42\\U000104a5",
3938    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3939    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3940    "\\u0027\\u11af\\U000e0057\\u0602",
3941    "\\U0001d7f2\\U000e007\\u0004\\u0589",
3942    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3943    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3944    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3945    "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3946    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3947    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3948    "\\u0233\\U000e0020\\u0a69\\u0d6a",
3949    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3950    "\\u58f4\\U000e0049\\u20e7\\u2027",
3951    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3952    "\\ua183\\u102d\\u0bec\\u003a",
3953    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3954    "\\u003a\\u0e57\\u0fad\\u002e",
3955    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3956    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3957    "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3958    "\\u003a\\u0664\\u00b7\\u1fba",
3959    "\\u003b\\u0027\\u00b7\\u47a3",
3960    "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3961    "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3962    "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3963    };
3964    int loop;
3965    if (U_FAILURE(status)) {
3966        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3967        return;
3968    }
3969    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3970        // printf("looping %d\n", loop);
3971        UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3972        // RBBICharMonkey monkey;
3973        RBBIWordMonkey monkey;
3974
3975        int expected[50];
3976        int expectedcount = 0;
3977
3978        monkey.setText(ustr);
3979        int i;
3980        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3981            expected[expectedcount ++] = i;
3982        }
3983
3984        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3985    }
3986    delete bi;
3987#endif
3988}
3989
3990void RBBITest::TestWordBoundary(void)
3991{
3992    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3993    Locale        locale("en");
3994    UErrorCode    status = U_ZERO_ERROR;
3995    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3996    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3997    UChar         str[50];
3998    static const char *strlist[] =
3999    {
4000    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
4001    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
4002    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
4003    "\\u2027\\U000e0067\\u0a47\\u00b7",
4004    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
4005    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
4006    "\\u0589\\U000e006e\\u0a42\\U000104a5",
4007    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
4008    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
4009    "\\u0027\\u11af\\U000e0057\\u0602",
4010    "\\U0001d7f2\\U000e007\\u0004\\u0589",
4011    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
4012    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
4013    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
4014    "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
4015    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
4016    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
4017    "\\u0233\\U000e0020\\u0a69\\u0d6a",
4018    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
4019    "\\u58f4\\U000e0049\\u20e7\\u2027",
4020    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
4021    "\\ua183\\u102d\\u0bec\\u003a",
4022    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
4023    "\\u003a\\u0e57\\u0fad\\u002e",
4024    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
4025    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
4026    "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
4027    "\\u003a\\u0664\\u00b7\\u1fba",
4028    "\\u003b\\u0027\\u00b7\\u47a3",
4029    };
4030    int loop;
4031    if (U_FAILURE(status)) {
4032        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4033        return;
4034    }
4035    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4036        // printf("looping %d\n", loop);
4037        u_unescape(strlist[loop], str, 20);
4038        UnicodeString ustr(str);
4039        int forward[50];
4040        int count = 0;
4041
4042        bi->setText(ustr);
4043        int prev = 0;
4044        int i;
4045        for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
4046            forward[count ++] = i;
4047            if (i > prev) {
4048                int j;
4049                for (j = prev + 1; j < i; j ++) {
4050                    if (bi->isBoundary(j)) {
4051                        printStringBreaks(ustr, forward, count);
4052                        errln("happy boundary test failed: expected %d not a boundary",
4053                               j);
4054                        return;
4055                    }
4056                }
4057            }
4058            if (!bi->isBoundary(i)) {
4059                printStringBreaks(ustr, forward, count);
4060                errln("happy boundary test failed: expected %d a boundary",
4061                       i);
4062                return;
4063            }
4064            prev = i;
4065        }
4066    }
4067    delete bi;
4068}
4069
4070void RBBITest::TestLineBreaks(void)
4071{
4072#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4073    Locale        locale("en");
4074    UErrorCode    status = U_ZERO_ERROR;
4075    BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4076    const int32_t  STRSIZE = 50;
4077    UChar         str[STRSIZE];
4078    static const char *strlist[] =
4079    {
4080     "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
4081     "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
4082             "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
4083     "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
4084             "u2014\\U000e0105\\u118c\\u000a\\u07f8",
4085     "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
4086     "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4087     "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4088     "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4089     "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
4090     "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
4091     "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4092     "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4093     "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4094     "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4095     "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4096     "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4097     "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4098     "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4099     "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4100     "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4101     "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4102     "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4103     "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4104     "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4105     "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4106     "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
4107     "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4108     "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4109     "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4110     "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4111     "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4112     "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
4113     "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4114     "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4115     "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
4116     "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4117     "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4118     "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4119     "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4120     "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4121     "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4122         "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
4123         "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
4124         "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
4125     "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4126         "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4127    };
4128    int loop;
4129    TEST_ASSERT_SUCCESS(status);
4130    if (U_FAILURE(status)) {
4131        return;
4132    }
4133    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4134        // printf("looping %d\n", loop);
4135        int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4136        if (t >= STRSIZE) {
4137            TEST_ASSERT(FALSE);
4138            continue;
4139        }
4140
4141
4142        UnicodeString ustr(str);
4143        RBBILineMonkey monkey;
4144        if (U_FAILURE(monkey.deferredStatus)) {
4145            continue;
4146        }
4147
4148        const int EXPECTEDSIZE = 50;
4149        int expected[EXPECTEDSIZE];
4150        int expectedcount = 0;
4151
4152        monkey.setText(ustr);
4153        int i;
4154        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4155            if (expectedcount >= EXPECTEDSIZE) {
4156                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4157                return;
4158            }
4159            expected[expectedcount ++] = i;
4160        }
4161
4162        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4163    }
4164    delete bi;
4165#endif
4166}
4167
4168void RBBITest::TestSentBreaks(void)
4169{
4170#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4171    Locale        locale("en");
4172    UErrorCode    status = U_ZERO_ERROR;
4173    BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4174    UChar         str[200];
4175    static const char *strlist[] =
4176    {
4177     "Now\ris\nthe\r\ntime\n\rfor\r\r",
4178     "This\n",
4179     "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4180     "\"Sentence ending with a quote.\" Bye.",
4181     "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
4182     "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4183     "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4184     "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4185     "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4186     "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4187     "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4188             "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4189             "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4190             "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4191     "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4192             "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4193             "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4194             "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4195             "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4196             "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4197    };
4198    int loop;
4199    if (U_FAILURE(status)) {
4200        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4201        return;
4202    }
4203    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4204        u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
4205        UnicodeString ustr(str);
4206
4207        RBBISentMonkey monkey;
4208        if (U_FAILURE(monkey.deferredStatus)) {
4209            continue;
4210        }
4211
4212        const int EXPECTEDSIZE = 50;
4213        int expected[EXPECTEDSIZE];
4214        int expectedcount = 0;
4215
4216        monkey.setText(ustr);
4217        int i;
4218        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4219            if (expectedcount >= EXPECTEDSIZE) {
4220                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4221                return;
4222            }
4223            expected[expectedcount ++] = i;
4224        }
4225
4226        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4227    }
4228    delete bi;
4229#endif
4230}
4231
4232void RBBITest::TestMonkey(char *params) {
4233#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4234
4235    UErrorCode     status    = U_ZERO_ERROR;
4236    int32_t        loopCount = 500;
4237    int32_t        seed      = 1;
4238    UnicodeString  breakType = "all";
4239    Locale         locale("en");
4240    UBool          useUText  = FALSE;
4241
4242    if (quick == FALSE) {
4243        loopCount = 10000;
4244    }
4245
4246    if (params) {
4247        UnicodeString p(params);
4248        loopCount = getIntParam("loop", p, loopCount);
4249        seed      = getIntParam("seed", p, seed);
4250
4251        RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4252        if (m.find()) {
4253            breakType = m.group(1, status);
4254            m.reset();
4255            p = m.replaceFirst("", status);
4256        }
4257
4258        RegexMatcher u(" *utext", p, 0, status);
4259        if (u.find()) {
4260            useUText = TRUE;
4261            u.reset();
4262            p = u.replaceFirst("", status);
4263        }
4264
4265
4266        // m.reset(p);
4267        if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4268            // Each option is stripped out of the option string as it is processed.
4269            // All options have been checked.  The option string should have been completely emptied..
4270            char buf[100];
4271            p.extract(buf, sizeof(buf), NULL, status);
4272            buf[sizeof(buf)-1] = 0;
4273            errln("Unrecognized or extra parameter:  %s\n", buf);
4274            return;
4275        }
4276
4277    }
4278
4279    if (breakType == "char" || breakType == "all") {
4280        RBBICharMonkey  m;
4281        BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4282        if (U_SUCCESS(status)) {
4283            RunMonkey(bi, m, "char", seed, loopCount, useUText);
4284            if (breakType == "all" && useUText==FALSE) {
4285                // Also run a quick test with UText when "all" is specified
4286                RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4287            }
4288        }
4289        else {
4290            errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4291        }
4292        delete bi;
4293    }
4294
4295    if (breakType == "word" || breakType == "all") {
4296        logln("Word Break Monkey Test");
4297        RBBIWordMonkey  m;
4298        BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4299        if (U_SUCCESS(status)) {
4300            RunMonkey(bi, m, "word", seed, loopCount, useUText);
4301        }
4302        else {
4303            errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4304        }
4305        delete bi;
4306    }
4307
4308    if (breakType == "line" || breakType == "all") {
4309        logln("Line Break Monkey Test");
4310        RBBILineMonkey  m;
4311        BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4312        if (loopCount >= 10) {
4313            loopCount = loopCount / 5;   // Line break runs slower than the others.
4314        }
4315        if (U_SUCCESS(status)) {
4316            RunMonkey(bi, m, "line", seed, loopCount, useUText);
4317        }
4318        else {
4319            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4320        }
4321        delete bi;
4322    }
4323
4324    if (breakType == "sent" || breakType == "all"  ) {
4325        logln("Sentence Break Monkey Test");
4326        RBBISentMonkey  m;
4327        BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4328        if (loopCount >= 10) {
4329            loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4330        }
4331        if (U_SUCCESS(status)) {
4332            RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4333        }
4334        else {
4335            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4336        }
4337        delete bi;
4338    }
4339
4340#endif
4341}
4342
4343//
4344//  Run a RBBI monkey test.  Common routine, for all break iterator types.
4345//    Parameters:
4346//       bi      - the break iterator to use
4347//       mk      - MonkeyKind, abstraction for obtaining expected results
4348//       name    - Name of test (char, word, etc.) for use in error messages
4349//       seed    - Seed for starting random number generator (parameter from user)
4350//       numIterations
4351//
4352void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4353                         int32_t numIterations, UBool useUText) {
4354
4355#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4356
4357    const int32_t    TESTSTRINGLEN = 500;
4358    UnicodeString    testText;
4359    int32_t          numCharClasses;
4360    UVector          *chClasses;
4361    int              expected[TESTSTRINGLEN*2 + 1];
4362    int              expectedCount = 0;
4363    char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4364    char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4365    char             reverseBreaks[TESTSTRINGLEN*2+1];
4366    char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4367    char             followingBreaks[TESTSTRINGLEN*2+1];
4368    char             precedingBreaks[TESTSTRINGLEN*2+1];
4369    int              i;
4370    int              loopCount = 0;
4371
4372    m_seed = seed;
4373
4374    numCharClasses = mk.charClasses()->size();
4375    chClasses      = mk.charClasses();
4376
4377    // Check for errors that occured during the construction of the MonkeyKind object.
4378    //  Can't report them where they occured because errln() is a method coming from intlTest,
4379    //  and is not visible outside of RBBITest :-(
4380    if (U_FAILURE(mk.deferredStatus)) {
4381        errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4382        return;
4383    }
4384
4385    // Verify that the character classes all have at least one member.
4386    for (i=0; i<numCharClasses; i++) {
4387        UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4388        if (s == NULL || s->size() == 0) {
4389            errln("Character Class #%d is null or of zero size.", i);
4390            return;
4391        }
4392    }
4393
4394    while (loopCount < numIterations || numIterations == -1) {
4395        if (numIterations == -1 && loopCount % 10 == 0) {
4396            // If test is running in an infinite loop, display a periodic tic so
4397            //   we can tell that it is making progress.
4398            fprintf(stderr, ".");
4399        }
4400        // Save current random number seed, so that we can recreate the random numbers
4401        //   for this loop iteration in event of an error.
4402        seed = m_seed;
4403
4404        // Populate a test string with data.
4405        testText.truncate(0);
4406        for (i=0; i<TESTSTRINGLEN; i++) {
4407            int32_t  aClassNum = m_rand() % numCharClasses;
4408            UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4409            int32_t   charIdx = m_rand() % classSet->size();
4410            UChar32   c = classSet->charAt(charIdx);
4411            if (c < 0) {   // TODO:  deal with sets containing strings.
4412                errln("c < 0");
4413                break;
4414            }
4415            testText.append(c);
4416        }
4417
4418        // Calculate the expected results for this test string.
4419        mk.setText(testText);
4420        memset(expectedBreaks, 0, sizeof(expectedBreaks));
4421        expectedBreaks[0] = 1;
4422        int32_t breakPos = 0;
4423        expectedCount = 0;
4424        for (;;) {
4425            breakPos = mk.next(breakPos);
4426            if (breakPos == -1) {
4427                break;
4428            }
4429            if (breakPos > testText.length()) {
4430                errln("breakPos > testText.length()");
4431            }
4432            expectedBreaks[breakPos] = 1;
4433            U_ASSERT(expectedCount<testText.length());
4434            expected[expectedCount ++] = breakPos;
4435        }
4436
4437        // Find the break positions using forward iteration
4438        memset(forwardBreaks, 0, sizeof(forwardBreaks));
4439        if (useUText) {
4440            UErrorCode status = U_ZERO_ERROR;
4441            UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4442            // testUText = utext_openUnicodeString(testUText, &testText, &status);
4443            bi->setText(testUText, status);
4444            TEST_ASSERT_SUCCESS(status);
4445            utext_close(testUText);   // The break iterator does a shallow clone of the UText
4446                                      //  This UText can be closed immediately, so long as the
4447                                      //  testText string continues to exist.
4448        } else {
4449            bi->setText(testText);
4450        }
4451
4452        for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4453            if (i < 0 || i > testText.length()) {
4454                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4455                break;
4456            }
4457            forwardBreaks[i] = 1;
4458        }
4459
4460        // Find the break positions using reverse iteration
4461        memset(reverseBreaks, 0, sizeof(reverseBreaks));
4462        for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4463            if (i < 0 || i > testText.length()) {
4464                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4465                break;
4466            }
4467            reverseBreaks[i] = 1;
4468        }
4469
4470        // Find the break positions using isBoundary() tests.
4471        memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4472        U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4473        for (i=0; i<=testText.length(); i++) {
4474            isBoundaryBreaks[i] = bi->isBoundary(i);
4475        }
4476
4477
4478        // Find the break positions using the following() function.
4479        // printf(".");
4480        memset(followingBreaks, 0, sizeof(followingBreaks));
4481        int32_t   lastBreakPos = 0;
4482        followingBreaks[0] = 1;
4483        for (i=0; i<testText.length(); i++) {
4484            breakPos = bi->following(i);
4485            if (breakPos <= i ||
4486                breakPos < lastBreakPos ||
4487                breakPos > testText.length() ||
4488                breakPos > lastBreakPos && lastBreakPos > i ) {
4489                errln("%s break monkey test: "
4490                    "Out of range value returned by BreakIterator::following().\n"
4491                        "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4492                         name, seed, i, breakPos, lastBreakPos);
4493                break;
4494            }
4495            followingBreaks[breakPos] = 1;
4496            lastBreakPos = breakPos;
4497        }
4498
4499        // Find the break positions using the preceding() function.
4500        memset(precedingBreaks, 0, sizeof(precedingBreaks));
4501        lastBreakPos = testText.length();
4502        precedingBreaks[testText.length()] = 1;
4503        for (i=testText.length(); i>0; i--) {
4504            breakPos = bi->preceding(i);
4505            if (breakPos >= i ||
4506                breakPos > lastBreakPos ||
4507                breakPos < 0 && testText.getChar32Start(i)>0 ||
4508                breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i) ) {
4509                errln("%s break monkey test: "
4510                    "Out of range value returned by BreakIterator::preceding().\n"
4511                    "index=%d;  prev returned %d; lastBreak=%d" ,
4512                    name,  i, breakPos, lastBreakPos);
4513                if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4514                    precedingBreaks[i] = 2;   // Forces an error.
4515                }
4516            } else {
4517                if (breakPos >= 0) {
4518                    precedingBreaks[breakPos] = 1;
4519                }
4520                lastBreakPos = breakPos;
4521            }
4522        }
4523
4524        // Compare the expected and actual results.
4525        for (i=0; i<=testText.length(); i++) {
4526            const char *errorType = NULL;
4527            if  (forwardBreaks[i] != expectedBreaks[i]) {
4528                errorType = "next()";
4529            } else if (reverseBreaks[i] != forwardBreaks[i]) {
4530                errorType = "previous()";
4531            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4532                errorType = "isBoundary()";
4533            } else if (followingBreaks[i] != expectedBreaks[i]) {
4534                errorType = "following()";
4535            } else if (precedingBreaks[i] != expectedBreaks[i]) {
4536                errorType = "preceding()";
4537            }
4538
4539
4540            if (errorType != NULL) {
4541                // Format a range of the test text that includes the failure as
4542                //  a data item that can be included in the rbbi test data file.
4543
4544                // Start of the range is the last point where expected and actual results
4545                //   both agreed that there was a break position.
4546                int startContext = i;
4547                int32_t count = 0;
4548                for (;;) {
4549                    if (startContext==0) { break; }
4550                    startContext --;
4551                    if (expectedBreaks[startContext] != 0) {
4552                        if (count == 2) break;
4553                        count ++;
4554                    }
4555                }
4556
4557                // End of range is two expected breaks past the start position.
4558                int endContext = i + 1;
4559                int ci;
4560                for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4561                    for (;;) {
4562                        if (endContext >= testText.length()) {break;}
4563                        if (expectedBreaks[endContext-1] != 0) {
4564                            if (count == 0) break;
4565                            count --;
4566                        }
4567                        endContext ++;
4568                    }
4569                }
4570
4571                // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4572                UnicodeString errorText = "<data>";
4573                /***if (strcmp(errorType, "next()") == 0) {
4574                    startContext = 0;
4575                    endContext = testText.length();
4576
4577                    printStringBreaks(testText, expected, expectedCount);
4578                }***/
4579
4580                for (ci=startContext; ci<endContext;) {
4581                    UnicodeString hexChars("0123456789abcdef");
4582                    UChar32  c;
4583                    int      bn;
4584                    c = testText.char32At(ci);
4585                    if (ci == i) {
4586                        // This is the location of the error.
4587                        errorText.append("<?>");
4588                    } else if (expectedBreaks[ci] != 0) {
4589                        // This a non-error expected break position.
4590                        errorText.append("\\");
4591                    }
4592                    if (c < 0x10000) {
4593                        errorText.append("\\u");
4594                        for (bn=12; bn>=0; bn-=4) {
4595                            errorText.append(hexChars.charAt((c>>bn)&0xf));
4596                        }
4597                    } else {
4598                        errorText.append("\\U");
4599                        for (bn=28; bn>=0; bn-=4) {
4600                            errorText.append(hexChars.charAt((c>>bn)&0xf));
4601                        }
4602                    }
4603                    ci = testText.moveIndex32(ci, 1);
4604                }
4605                errorText.append("\\");
4606                errorText.append("</data>\n");
4607
4608                // Output the error
4609                char  charErrorTxt[500];
4610                UErrorCode status = U_ZERO_ERROR;
4611                errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4612                charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4613                errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4614                    name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4615                    errorType, seed, i, charErrorTxt);
4616                break;
4617            }
4618        }
4619
4620        loopCount++;
4621    }
4622#endif
4623}
4624
4625//
4626//  TestDebug    -  A place-holder test for debugging purposes.
4627//                  For putting in fragments of other tests that can be invoked
4628//                  for tracing  without a lot of unwanted extra stuff happening.
4629//
4630void RBBITest::TestDebug(void) {
4631#if 0
4632    UErrorCode   status = U_ZERO_ERROR;
4633    int pos = 0;
4634    int ruleStatus = 0;
4635
4636    RuleBasedBreakIterator* bi =
4637       // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4638       // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4639       (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4640    UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4641    // UnicodeString s("Aaa.  Bcd");
4642    s = s.unescape();
4643    bi->setText(s);
4644    UBool r = bi->isBoundary(8);
4645    printf("%s", r?"true":"false");
4646    return;
4647    pos = bi->last();
4648    do {
4649        // ruleStatus = bi->getRuleStatus();
4650        printf("%d\t%d\n", pos, ruleStatus);
4651        pos = bi->previous();
4652    } while (pos != BreakIterator::DONE);
4653#endif
4654}
4655
4656#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
4657