rbbitst.cpp revision 6d5deb12725f146643d443090dfa11b206df528a
1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2009, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6/************************************************************************
7*   Date        Name        Description
8*   12/15/99    Madhu        Creation.
9*   01/12/2000  Madhu        Updated for changed API and added new tests
10************************************************************************/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_BREAK_ITERATION
15
16#include "unicode/utypes.h"
17#include "unicode/brkiter.h"
18#include "unicode/rbbi.h"
19#include "unicode/uchar.h"
20#include "unicode/utf16.h"
21#include "unicode/ucnv.h"
22#include "unicode/schriter.h"
23#include "unicode/uniset.h"
24#include "unicode/regex.h"        // TODO: make conditional on regexp being built.
25#include "unicode/ustring.h"
26#include "unicode/utext.h"
27#include "intltest.h"
28#include "rbbitst.h"
29#include <string.h>
30#include "uvector.h"
31#include "uvectr32.h"
32#include "triedict.h"
33#include <string.h>
34#include <stdio.h>
35#include <stdlib.h>
36
// TEST_ASSERT(x): when the condition x is false, report a test failure
//                 that identifies the source file and line of the check.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// TEST_ASSERT_SUCCESS(errcode): when errcode is an ICU failure code, report
//                 it (via errcheckln) together with the source location and
//                 the symbolic name of the error.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
42
43
44//---------------------------------------------
45// runIndexedTest
46//---------------------------------------------
47
48void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
49{
50    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
51
52    switch (index) {
53        case 0: name = "TestBug4153072";
54            if(exec) TestBug4153072();                         break;
55        case 1: name = "TestJapaneseLineBreak";
56            if(exec) TestJapaneseLineBreak();                  break;
57        case 2: name = "TestStatusReturn";
58            if(exec) TestStatusReturn();                       break;
59        case 3: name = "TestUnicodeFiles";
60            if(exec) TestUnicodeFiles();                       break;
61        case 4: name = "TestEmptyString";
62            if(exec) TestEmptyString();                        break;
63
64        case 5: name = "TestGetAvailableLocales";
65            if(exec) TestGetAvailableLocales();                break;
66
67        case 6: name = "TestGetDisplayName";
68            if(exec) TestGetDisplayName();                     break;
69
70        case 7: name = "TestEndBehaviour";
71            if(exec) TestEndBehaviour();                       break;
72        case 8: name = "TestMixedThaiLineBreak";
73             if(exec) TestMixedThaiLineBreak();                break;
74        case 9: name = "TestThaiLineBreak";
75             if(exec) TestThaiLineBreak();                     break;
76        case 10: name = "TestMaiyamok";
77             if(exec) TestMaiyamok();                          break;
78        case 11: name = "TestWordBreaks";
79             if(exec) TestWordBreaks();                        break;
80        case 12: name = "TestWordBoundary";
81             if(exec) TestWordBoundary();                      break;
82        case 13: name = "TestLineBreaks";
83             if(exec) TestLineBreaks();                        break;
84        case 14: name = "TestSentBreaks";
85             if(exec) TestSentBreaks();                        break;
86        case 15: name = "TestExtended";
87             if(exec) TestExtended();                          break;
88        case 16: name = "TestMonkey";
89             if(exec) {
90 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
91               TestMonkey(params);
92 #else
93               logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
94 #endif
95             }
96                                                               break;
97        case 17: name = "TestBug3818";
98            if(exec) TestBug3818();                            break;
99        case 18: name = "TestJapaneseWordBreak";
100            if(exec) TestJapaneseWordBreak();                  break;
101        case 19: name = "TestDebug";
102            if(exec) TestDebug();                              break;
103        case 20: name = "TestTrieDict";
104            if(exec) TestTrieDict();                           break;
105        case 21: name = "TestBug5775";
106            if (exec) TestBug5775();                           break;
107        case 22: name = "TestThaiBreaks";
108            if (exec) TestThaiBreaks();                        break;
109        case 23: name = "TestTailoredBreaks";
110            if (exec) TestTailoredBreaks();                    break;
111
112        default: name = ""; break; //needed to end loop
113    }
114}
115
116
117//---------------------------------------------------------------------------
118//
119//   class BITestData   Holds a set of Break iterator test data and results
120//                      Includes
121//                         - the string data to be broken
122//                         - a vector of the expected break positions.
123//                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
125//                         - The expected break tag values.
126//                         - Vectors of actual break positions and tag values.
127//                         - Functions for comparing actual with expected and
128//                            reporting errors.
129//
130//----------------------------------------------------------------------------
131class BITestData {
132public:
133    UnicodeString    fDataToBreak;
134    UVector          fExpectedBreakPositions;
135    UVector          fExpectedTags;
136    UVector          fLineNum;
137    UVector          fActualBreakPositions;   // Test Results.
138    UVector          fActualTags;
139
140    BITestData(UErrorCode &status);
141    void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
142    void             checkResults(const char *heading, RBBITest *test);
143    void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
144    void             clearResults();
145};
146
147//
148// Constructor.
149//
150BITestData::BITestData(UErrorCode &status)
151: fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
152  fActualTags(status)
153{
154}
155
156//
157// addDataChunk.   Add a section (non-breaking) piece if data to the test data.
158//                 The macro form collects the line number, which is helpful
159//                 when tracking down failures.
160//
161//                 A null data item is inserted at the start of each test's data
162//                  to put the starting zero into the data list.  The position saved for
163//                  each non-null item is its ending position.
164//
165#define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
166void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
167    if (U_FAILURE(status)) {return;}
168    if (data != NULL) {
169        fDataToBreak.append(CharsToUnicodeString(data));
170    }
171    fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
172    fExpectedTags.addElement(tag, status);
173    fLineNum.addElement(lineNum, status);
174}
175
176
177//
178//  checkResults.   Compare the actual and expected break positions, report any differences.
179//
180void BITestData::checkResults(const char *heading, RBBITest *test) {
181    int32_t   expectedIndex = 0;
182    int32_t   actualIndex = 0;
183
184    for (;;) {
185        // If we've run through both the expected and actual results vectors, we're done.
186        //   break out of the loop.
187        if (expectedIndex >= fExpectedBreakPositions.size() &&
188            actualIndex   >= fActualBreakPositions.size()) {
189            break;
190        }
191
192
193        if (expectedIndex >= fExpectedBreakPositions.size()) {
194            err(heading, test, expectedIndex-1, actualIndex);
195            actualIndex++;
196            continue;
197        }
198
199        if (actualIndex >= fActualBreakPositions.size()) {
200            err(heading, test, expectedIndex, actualIndex-1);
201            expectedIndex++;
202            continue;
203        }
204
205        if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
206            err(heading, test, expectedIndex, actualIndex);
207            // Try to resync the positions of the indices, to avoid a rash of spurious erros.
208            if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
209                actualIndex++;
210            } else {
211                expectedIndex++;
212            }
213            continue;
214        }
215
216        if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
217            test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
218                heading, fLineNum.elementAt(expectedIndex),
219                fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
220        }
221
222        actualIndex++;
223        expectedIndex++;
224    }
225}
226
227//
228//  err   -  An error was found.  Report it, along with information about where the
229//                                incorrectly broken test data appeared in the source file.
230//
231void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
232{
233    int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
234    int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
235    int32_t   o        = 0;
236    int32_t   line     = fLineNum.elementAti(expectedIdx);
237    if (expectedIdx > 0) {
238        // The line numbers are off by one because a premature break occurs somewhere
239        //    within the previous item, rather than at the start of the current (expected) item.
240        //    We want to report the offset of the unexpected break from the start of
241        //      this previous item.
242        o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
243    }
244    if (actual < expected) {
245        test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
246    } else {
247        test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
248    }
249}
250
251
252void BITestData::clearResults() {
253    fActualBreakPositions.removeAllElements();
254    fActualTags.removeAllElements();
255}
256
257
258//-----------------------------------------------------------------------------------
259//
//    Canned Test Characters
261//
262//-----------------------------------------------------------------------------------
263
// Table of UChar code units, in ascending ranges, ending with a 0x0000
// terminator.  The RBBITest constructor builds a UnicodeString from it.
static const UChar cannedTestArray[] = {
    0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
    0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
    0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
    0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
    0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
    0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
    0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
    0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
};

// File-level string holding a U+0000 followed by the canned characters above.
// It is allocated by the RBBITest constructor and deleted by its destructor.
static UnicodeString* cannedTestChars = 0;
276
// Escaped (\uXXXX) Devanagari sequences for building test strings.
// Each "half" form is a consonant followed by U+094D and U+200D; the "dead"
// form is a consonant followed by U+094D only.
// NOTE(review): the doubled backslashes suggest these are meant to be
// unescaped at run time (e.g. by CharsToUnicodeString) - confirm at the use sites.
#define  halfNA     "\\u0928\\u094d\\u200d"
#define  halfSA     "\\u0938\\u094d\\u200d"
#define  halfCHA    "\\u091a\\u094d\\u200d"
#define  halfKA     "\\u0915\\u094d\\u200d"
#define  deadTA     "\\u0924\\u094d"
282
283//--------------------------------------------------------------------------------------
284//
285//    RBBITest    constructor and destructor
286//
287//--------------------------------------------------------------------------------------
288
289RBBITest::RBBITest() {
290    UnicodeString temp(cannedTestArray);
291    cannedTestChars = new UnicodeString();
292    *cannedTestChars += (UChar)0x0000;
293    *cannedTestChars += temp;
294}
295
296
297RBBITest::~RBBITest() {
298    delete cannedTestChars;
299}
300
301
// Named rule-status tag values for use as expected tags in test data.
// NOTE(review): these presumably mirror the word-break status ranges declared
// in ICU's ubrk.h (number / letter / kana / ideograph) - confirm.
static const int T_NUMBER = 100;
static const int T_LETTER = 200;
static const int T_H_OR_K = 300;   // presumably "Hiragana or Katakana" - confirm
static const int T_IDEO   = 400;
306
307
308
309
310
311
312//--------------------------------------------------------------------
313//Testing the BreakIterator for devanagari script
314//--------------------------------------------------------------------
315
// Escaped (\uXXXX) Devanagari sequences.  Each "dead" form below is a
// consonant followed by U+094D (the virama).
#define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
#define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
#define deadTTHA "\\u0920\\u094d"         /*U+0920 + virama*/
#define deadPA   "\\u092a\\u094d"         /*U+092A + virama*/
#define deadSA   "\\u0938\\u094d"         /*U+0938 + virama*/
#define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
322
323
324
325
326
327
328//-----------------------------------------------------------------------------------
329//
330//   Test for status {tag} return value from break rules.
331//        TODO:  a more thorough test.
332//
333//-----------------------------------------------------------------------------------
334void RBBITest::TestStatusReturn() {
335     UnicodeString rulesString1("$Letters = [:L:];\n"
336                                  "$Numbers = [:N:];\n"
337                                  "$Letters+{1};\n"
338                                  "$Numbers+{2};\n"
339                                  "Help\\ {4}/me\\!;\n"
340                                  "[^$Letters $Numbers];\n"
341                                  "!.*;\n", -1, US_INV);
342     UnicodeString testString1  = "abc123..abc Help me Help me!";
343                                // 01234567890123456789012345678
344     int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
345     int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
346
347     UErrorCode status=U_ZERO_ERROR;
348     UParseError    parseError;
349
350     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
351     if(U_FAILURE(status)) {
352         dataerrln("FAIL : in construction - %s", u_errorName(status));
353     } else {
354         int32_t  pos;
355         int32_t  i = 0;
356         bi->setText(testString1);
357         for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
358             if (pos != bounds1[i]) {
359                 errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
360                 break;
361             }
362
363             int tag = bi->getRuleStatus();
364             if (tag != brkStatus[i]) {
365                 errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
366                 break;
367             }
368             i++;
369         }
370     }
371     delete bi;
372}
373
374
375static void printStringBreaks(UnicodeString ustr, int expected[],
376                              int expectedcount)
377{
378    UErrorCode status = U_ZERO_ERROR;
379    char name[100];
380    printf("code    alpha extend alphanum type word sent line name\n");
381    int j;
382    for (j = 0; j < ustr.length(); j ++) {
383        if (expectedcount > 0) {
384            int k;
385            for (k = 0; k < expectedcount; k ++) {
386                if (j == expected[k]) {
387                    printf("------------------------------------------------ %d\n",
388                           j);
389                }
390            }
391        }
392        UChar32 c = ustr.char32At(j);
393        if (c > 0xffff) {
394            j ++;
395        }
396        u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
397        printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
398                           u_isUAlphabetic(c),
399                           u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
400                           u_isalnum(c),
401                           u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
402                                                  u_charType(c),
403                                                  U_SHORT_PROPERTY_NAME),
404                           u_getPropertyValueName(UCHAR_WORD_BREAK,
405                                                  u_getIntPropertyValue(c,
406                                                          UCHAR_WORD_BREAK),
407                                                  U_SHORT_PROPERTY_NAME),
408                           u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
409                                   u_getIntPropertyValue(c,
410                                           UCHAR_SENTENCE_BREAK),
411                                   U_SHORT_PROPERTY_NAME),
412                           u_getPropertyValueName(UCHAR_LINE_BREAK,
413                                   u_getIntPropertyValue(c,
414                                           UCHAR_LINE_BREAK),
415                                   U_SHORT_PROPERTY_NAME),
416                           name);
417    }
418}
419
420void RBBITest::TestThaiLineBreak() {
421    UErrorCode status = U_ZERO_ERROR;
422    BITestData thaiLineSelection(status);
423
424    // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
425    // represents elided letters at the end of a long word.  It should be bound to
426    // the end of the word and not treated as an independent punctuation mark.
427
428
429    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
430    ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
431    ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
432    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
433    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
434//        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
435//        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
436    ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
437    // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
438    ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
439    ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
440    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
441    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
442    ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
443    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
444
445    // the one time where the paiyannoi occurs somewhere other than at the end
446    // of a word is in the Thai abbrevation for "etc.", which both begins and
447    // ends with a paiyannoi
448    ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
449    ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
450    ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
451
452    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
453        Locale("th"), status);
454    if (U_FAILURE(status))
455    {
456        errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status));
457        return;
458    }
459
460    generalIteratorTest(*e, thaiLineSelection);
461    delete e;
462}
463
464
465
466void RBBITest::TestMixedThaiLineBreak()
467{
468    UErrorCode   status = U_ZERO_ERROR;
469    BITestData   thaiLineSelection(status);
470
471    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
472
473
474    // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
475    // start
476
477    ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
478    ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
479    ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
480    ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
481    ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
482    ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
483    ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
484    ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
485    ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
486    ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
487    ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
488    ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
489    ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
490    ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
491    ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
492    ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
493
494    // @suwit - end of changes
495
496
497    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
498    if (U_FAILURE(status))
499    {
500        errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status));
501        return;
502    }
503
504
505    generalIteratorTest(*e, thaiLineSelection);
506    delete e;
507}
508
509
510void RBBITest::TestMaiyamok()
511{
512    UErrorCode status = U_ZERO_ERROR;
513    BITestData   thaiLineSelection(status);
514    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
515    // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
516    // word".  Instead of appearing as a word unto itself, however, it's kept together
517    // with the word before it
518    ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
519    ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
520    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
521    ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
522    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
523    ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
524    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
525    ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
526    ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
527
528    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
529        Locale("th"), status);
530
531    if (U_FAILURE(status))
532    {
533        errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status));
534        return;
535    }
536    generalIteratorTest(*e, thaiLineSelection);
537    delete e;
538}
539
540
541
542void RBBITest::TestBug3818() {
543    UErrorCode  status = U_ZERO_ERROR;
544
545    // Four Thai words...
546    static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
547                                           0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
548    UnicodeString  thaiStr(thaiWordData);
549
550    RuleBasedBreakIterator* bi =
551        (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
552    if (U_FAILURE(status) || bi == NULL) {
553        errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
554        return;
555    }
556    bi->setText(thaiStr);
557
558    int32_t  startOfSecondWord = bi->following(1);
559    if (startOfSecondWord != 4) {
560        errln("Fail at file %s, line %d expected start of word at 4, got %d",
561            __FILE__, __LINE__, startOfSecondWord);
562    }
563    startOfSecondWord = bi->following(0);
564    if (startOfSecondWord != 4) {
565        errln("Fail at file %s, line %d expected start of word at 4, got %d",
566            __FILE__, __LINE__, startOfSecondWord);
567    }
568    delete bi;
569}
570
571
572void RBBITest::TestJapaneseWordBreak() {
573    UErrorCode status = U_ZERO_ERROR;
574    BITestData   japaneseWordSelection(status);
575
576    ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
577    ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
578    ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
579    ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
580    ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
581    ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
582    ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
583
584    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
585        Locale("ja"), status);
586    if (U_FAILURE(status))
587    {
588        errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
589        return;
590    }
591
592    generalIteratorTest(*e, japaneseWordSelection);
593    delete e;
594}
595
596void RBBITest::TestTrieDict() {
597    UErrorCode      status  = U_ZERO_ERROR;
598
599    //
600    //  Open and read the test data file.
601    //
602    const char *testDataDirectory = IntlTest::getSourceTestData(status);
603    char testFileName[1000];
604    if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
605        errln("Can't open test data.  Path too long.");
606        return;
607    }
608    strcpy(testFileName, testDataDirectory);
609    strcat(testFileName, "riwords.txt");
610
611    // Items needing deleting at the end
612    MutableTrieDictionary *mutableDict = NULL;
613    CompactTrieDictionary *compactDict = NULL;
614    UnicodeSet            *breaks      = NULL;
615    UChar                 *testFile    = NULL;
616    StringEnumeration     *enumer1     = NULL;
617    StringEnumeration     *enumer2     = NULL;
618    MutableTrieDictionary *mutable2    = NULL;
619    StringEnumeration     *cloneEnum   = NULL;
620    CompactTrieDictionary *compact2    = NULL;
621
622
623    const UnicodeString *originalWord = NULL;
624    const UnicodeString *cloneWord    = NULL;
625    UChar *current;
626    UChar *word;
627    UChar uc;
628    int32_t wordLen;
629    int32_t wordCount;
630    int32_t testCount;
631
632    int    len;
633    testFile = ReadAndConvertFile(testFileName, len, NULL, status);
634    if (U_FAILURE(status)) {
635        goto cleanup; /* something went wrong, error already output */
636    }
637
638    mutableDict = new MutableTrieDictionary(0x0E1C, status);
639    if (U_FAILURE(status)) {
640        errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
641        goto cleanup;
642    }
643
644    breaks = new UnicodeSet;
645    breaks->add(0x000A);     // Line Feed
646    breaks->add(0x000D);     // Carriage Return
647    breaks->add(0x2028);     // Line Separator
648    breaks->add(0x2029);     // Paragraph Separator
649
650    // Now add each non-comment line of the file as a word.
651    current = testFile;
652    word = current;
653    uc = *current++;
654    wordLen = 0;
655    wordCount = 0;
656
657    while (uc) {
658        if (uc == 0x0023) {     // #comment line, skip
659            while (uc && !breaks->contains(uc)) {
660                uc = *current++;
661            }
662        }
663        else while (uc && !breaks->contains(uc)) {
664            ++wordLen;
665            uc = *current++;
666        }
667        if (wordLen > 0) {
668            mutableDict->addWord(word, wordLen, status);
669            if (U_FAILURE(status)) {
670                errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
671                goto cleanup;
672            }
673            wordCount += 1;
674        }
675
676        // Find beginning of next line
677        while (uc && breaks->contains(uc)) {
678            uc = *current++;
679        }
680        word = current-1;
681        wordLen = 0;
682    }
683
684    if (wordCount < 50) {
685        errln("Word count (%d) unreasonably small\n", wordCount);
686        goto cleanup;
687    }
688
689    enumer1 = mutableDict->openWords(status);
690    if (U_FAILURE(status)) {
691        errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
692        goto cleanup;
693    }
694
695    testCount = 0;
696    if (wordCount != (testCount = enumer1->count(status))) {
697        errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
698            testCount, wordCount, u_errorName(status));
699        goto cleanup;
700    }
701
702    // Now compact it
703    compactDict = new CompactTrieDictionary(*mutableDict, status);
704    if (U_FAILURE(status)) {
705        errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
706        goto cleanup;
707    }
708
709    enumer2 = compactDict->openWords(status);
710    if (U_FAILURE(status)) {
711        errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
712        goto cleanup;
713    }
714
715    if (wordCount != (testCount = enumer2->count(status))) {
716        errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
717            testCount, wordCount, u_errorName(status));
718        goto cleanup;
719    }
720
721    if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
722        errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
723    }
724    delete enumer1;
725    enumer1 = NULL;
726    delete enumer2;
727    enumer2 = NULL;
728
729    // Now un-compact it
730    mutable2 = compactDict->cloneMutable(status);
731    if (U_FAILURE(status)) {
732        errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
733        goto cleanup;
734    }
735
736    cloneEnum = mutable2->openWords(status);
737    if (U_FAILURE(status)) {
738        errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
739        goto cleanup;
740    }
741
742    if (wordCount != (testCount = cloneEnum->count(status))) {
743        errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
744            testCount, wordCount, u_errorName(status));
745        goto cleanup;
746    }
747
748    // Compact original dictionary to clone. Note that we can only compare the same kind of
749    // dictionary as the order of the enumerators is not guaranteed to be the same between
750    // different kinds
751    enumer1 = mutableDict->openWords(status);
752    if (U_FAILURE(status)) {
753        errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
754        goto cleanup;
755     }
756
757    originalWord = enumer1->snext(status);
758    cloneWord = cloneEnum->snext(status);
759    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
760        if (*originalWord != *cloneWord) {
761            errln("Original and cloned MutableTrieDictionary word mismatch\n");
762            goto cleanup;
763        }
764        originalWord = enumer1->snext(status);
765        cloneWord = cloneEnum->snext(status);
766    }
767
768    if (U_FAILURE(status)) {
769        errln("Enumeration failed: %s\n", u_errorName(status));
770        goto cleanup;
771    }
772
773    if (originalWord != cloneWord) {
774        errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
775        goto cleanup;
776    }
777
778    // Test the data copying constructor for CompactTrieDict, and the data access APIs.
779    compact2 = new CompactTrieDictionary(compactDict->data(), status);
780    if (U_FAILURE(status)) {
781        errln("CompactTrieDictionary(const void *,...) failed\n");
782        goto cleanup;
783    }
784
785    if (compact2->dataSize() == 0) {
786        errln("CompactTrieDictionary->dataSize() == 0\n");
787        goto cleanup;
788    }
789
790    // Now count the words via the second dictionary
791    delete enumer1;
792    enumer1 = compact2->openWords(status);
793    if (U_FAILURE(status)) {
794        errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
795        goto cleanup;
796    }
797
798    if (wordCount != (testCount = enumer1->count(status))) {
799        errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
800            testCount, wordCount, u_errorName(status));
801        goto cleanup;
802    }
803
804cleanup:
805    delete compactDict;
806    delete mutableDict;
807    delete breaks;
808    delete[] testFile;
809    delete enumer1;
810    delete mutable2;
811    delete cloneEnum;
812    delete compact2;
813}
814
815
816//----------------------------------------------------------------------------
817//
818// generalIteratorTest      Given a break iterator and a set of test data,
819//                          Run the tests and report the results.
820//
821//----------------------------------------------------------------------------
822void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
823{
824
825    bi.setText(td.fDataToBreak);
826
827    testFirstAndNext(bi, td);
828
829    testLastAndPrevious(bi, td);
830
831    testFollowing(bi, td);
832    testPreceding(bi, td);
833    testIsBoundary(bi, td);
834    doMultipleSelectionTest(bi, td);
835}
836
837
838//
839//   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
840//                       kind of loop.
841//
842void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
843{
844    UErrorCode  status = U_ZERO_ERROR;
845    int32_t     p;
846    int32_t     lastP = -1;
847    int32_t     tag;
848
849    logln("Test first and next");
850    bi.setText(td.fDataToBreak);
851    td.clearResults();
852
853    for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
854        td.fActualBreakPositions.addElement(p, status);  // Save result.
855        tag = bi.getRuleStatus();
856        td.fActualTags.addElement(tag, status);
857        if (p <= lastP) {
858            // If the iterator is not making forward progress, stop.
859            //  No need to raise an error here, it'll be detected in the normal check of results.
860            break;
861        }
862        lastP = p;
863    }
864    td.checkResults("testFirstAndNext", this);
865}
866
867
868//
869//  TestLastAndPrevious.   Run the iterator backwards, starting with last().
870//
871void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
872{
873    UErrorCode  status = U_ZERO_ERROR;
874    int32_t     p;
875    int32_t     lastP  = 0x7ffffffe;
876    int32_t     tag;
877
878    logln("Test last and previous");
879    bi.setText(td.fDataToBreak);
880    td.clearResults();
881
882    for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
883        // Save break position.  Insert it at start of vector of results, shoving
884        //    already-saved results further towards the end.
885        td.fActualBreakPositions.insertElementAt(p, 0, status);
886        // bi.previous();   // TODO:  Why does this fix things up????
887        // bi.next();
888        tag = bi.getRuleStatus();
889        td.fActualTags.insertElementAt(tag, 0, status);
890        if (p >= lastP) {
891            // If the iterator is not making progress, stop.
892            //  No need to raise an error here, it'll be detected in the normal check of results.
893            break;
894        }
895        lastP = p;
896    }
897    td.checkResults("testLastAndPrevious", this);
898}
899
900
901void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
902{
903    UErrorCode  status = U_ZERO_ERROR;
904    int32_t     p;
905    int32_t     tag;
906    int32_t     lastP  = -2;     // A value that will never be returned as a break position.
907                                 //   cannot be -1; that is returned for DONE.
908    int         i;
909
910    logln("testFollowing():");
911    bi.setText(td.fDataToBreak);
912    td.clearResults();
913
914    // Save the starting point, since we won't get that out of following.
915    p = bi.first();
916    td.fActualBreakPositions.addElement(p, status);  // Save result.
917    tag = bi.getRuleStatus();
918    td.fActualTags.addElement(tag, status);
919
920    for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
921        p = bi.following(i);
922        if (p != lastP) {
923            if (p == RuleBasedBreakIterator::DONE) {
924                break;
925            }
926            // We've reached a new break position.  Save it.
927            td.fActualBreakPositions.addElement(p, status);  // Save result.
928            tag = bi.getRuleStatus();
929            td.fActualTags.addElement(tag, status);
930            lastP = p;
931        }
932    }
933    // The loop normally exits by means of the break in the middle.
934    // Make sure that the index was at the correct position for the break iterator to have
935    //   returned DONE.
936    if (i != td.fDataToBreak.length()) {
937        errln("testFollowing():  iterator returned DONE prematurely.");
938    }
939
940    // Full check of all results.
941    td.checkResults("testFollowing", this);
942}
943
944
945
946void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
947    UErrorCode  status = U_ZERO_ERROR;
948    int32_t     p;
949    int32_t     tag;
950    int32_t     lastP  = 0x7ffffffe;
951    int         i;
952
953    logln("testPreceding():");
954    bi.setText(td.fDataToBreak);
955    td.clearResults();
956
957    p = bi.last();
958    td.fActualBreakPositions.addElement(p, status);
959    tag = bi.getRuleStatus();
960    td.fActualTags.addElement(tag, status);
961
962    for (i = td.fDataToBreak.length(); i>=-1; i--) {
963        p = bi.preceding(i);
964        if (p != lastP) {
965            if (p == RuleBasedBreakIterator::DONE) {
966                break;
967            }
968            // We've reached a new break position.  Save it.
969            td.fActualBreakPositions.insertElementAt(p, 0, status);
970            lastP = p;
971            tag = bi.getRuleStatus();
972            td.fActualTags.insertElementAt(tag, 0, status);
973        }
974    }
975    // The loop normally exits by means of the break in the middle.
976    // Make sure that the index was at the correct position for the break iterator to have
977    //   returned DONE.
978    if (i != 0) {
979        errln("testPreceding():  iterator returned DONE prematurely.");
980    }
981
982    // Full check of all results.
983    td.checkResults("testPreceding", this);
984}
985
986
987
988void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
989    UErrorCode  status = U_ZERO_ERROR;
990    int         i;
991    int32_t     tag;
992
993    logln("testIsBoundary():");
994    bi.setText(td.fDataToBreak);
995    td.clearResults();
996
997    for (i = 0; i <= td.fDataToBreak.length(); i++) {
998        if (bi.isBoundary(i)) {
999            td.fActualBreakPositions.addElement(i, status);  // Save result.
1000            tag = bi.getRuleStatus();
1001            td.fActualTags.addElement(tag, status);
1002        }
1003    }
1004    td.checkResults("testIsBoundary: ", this);
1005}
1006
1007
1008
1009void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
1010{
1011    iterator.setText(td.fDataToBreak);
1012
1013    RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
1014    int32_t offset = iterator.first();
1015    int32_t testOffset;
1016    int32_t count = 0;
1017
1018    logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
1019
1020    if (*testIterator != iterator)
1021        errln("clone() or operator!= failed: two clones compared unequal");
1022
1023    do {
1024        testOffset = testIterator->first();
1025        testOffset = testIterator->next(count);
1026        if (offset != testOffset)
1027            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1028
1029        if (offset != RuleBasedBreakIterator::DONE) {
1030            count++;
1031            offset = iterator.next();
1032
1033            if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
1034                errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
1035                if (count > 10000 || offset == -1) {
1036                    errln("operator== failed too many times. Stopping test.");
1037                    if (offset == -1) {
1038                        errln("Does (RuleBasedBreakIterator::DONE == -1)?");
1039                    }
1040                    return;
1041                }
1042            }
1043        }
1044    } while (offset != RuleBasedBreakIterator::DONE);
1045
1046    // now do it backwards...
1047    offset = iterator.last();
1048    count = 0;
1049
1050    do {
1051        testOffset = testIterator->last();
1052        testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
1053        if (offset != testOffset)
1054            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1055
1056        if (offset != RuleBasedBreakIterator::DONE) {
1057            count--;
1058            offset = iterator.previous();
1059        }
1060    } while (offset != RuleBasedBreakIterator::DONE);
1061
1062    delete testIterator;
1063}
1064
1065
1066//---------------------------------------------
1067//
1068//     other tests
1069//
1070//---------------------------------------------
1071void RBBITest::TestEmptyString()
1072{
1073    UnicodeString text = "";
1074    UErrorCode status = U_ZERO_ERROR;
1075
1076    BITestData x(status);
1077    ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
1078    RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
1079    if (U_FAILURE(status))
1080    {
1081        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
1082        return;
1083    }
1084    generalIteratorTest(*bi, x);
1085    delete bi;
1086}
1087
1088void RBBITest::TestGetAvailableLocales()
1089{
1090    int32_t locCount = 0;
1091    const Locale* locList = BreakIterator::getAvailableLocales(locCount);
1092
1093    if (locCount == 0)
1094        dataerrln("getAvailableLocales() returned an empty list!");
1095    // Just make sure that it's returning good memory.
1096    int32_t i;
1097    for (i = 0; i < locCount; ++i) {
1098        logln(locList[i].getName());
1099    }
1100}
1101
1102//Testing the BreakIterator::getDisplayName() function
1103void RBBITest::TestGetDisplayName()
1104{
1105    UnicodeString   result;
1106
1107    BreakIterator::getDisplayName(Locale::getUS(), result);
1108    if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
1109        dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
1110                + result);
1111
1112    BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
1113    if (result != "French (France)")
1114        dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1115                + result);
1116}
1117/**
1118 * Test End Behaviour
1119 * @bug 4068137
1120 */
1121void RBBITest::TestEndBehaviour()
1122{
1123    UErrorCode status = U_ZERO_ERROR;
1124    UnicodeString testString("boo.");
1125    BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
1126    if (U_FAILURE(status))
1127    {
1128        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
1129        return;
1130    }
1131    wb->setText(testString);
1132
1133    if (wb->first() != 0)
1134        errln("Didn't get break at beginning of string.");
1135    if (wb->next() != 3)
1136        errln("Didn't get break before period in \"boo.\"");
1137    if (wb->current() != 4 && wb->next() != 4)
1138        errln("Didn't get break at end of string.");
1139    delete wb;
1140}
1141/*
1142 * @bug 4153072
1143 */
1144void RBBITest::TestBug4153072() {
1145    UErrorCode status = U_ZERO_ERROR;
1146    BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
1147    if (U_FAILURE(status))
1148    {
1149        errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
1150        return;
1151    }
1152    UnicodeString str("...Hello, World!...");
1153    int32_t begin = 3;
1154    int32_t end = str.length() - 3;
1155    UBool onBoundary;
1156
1157    StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
1158    iter->adoptText(textIterator);
1159    int index;
1160    // Note: with the switch to UText, there is no way to restrict the
1161    //       iteration range to begin at an index other than zero.
1162    //       String character iterators created with a non-zero bound are
1163    //         treated by RBBI as being empty.
1164    for (index = -1; index < begin + 1; ++index) {
1165        onBoundary = iter->isBoundary(index);
1166        if (index == 0?  !onBoundary : onBoundary) {
1167            errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
1168                            " and begin index = " + begin);
1169        }
1170    }
1171    delete iter;
1172}
1173
1174
1175//
1176// Test for problem reported by Ashok Matoria on 9 July 2007
1177//    One.<kSoftHyphen><kSpace>Two.
1178//
1179//    Sentence break at start (0) and then on calling next() it breaks at
1180//   'T' of "Two". Now, at this point if I do next() and
1181//    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
1182//
1183void RBBITest::TestBug5775() {
1184    UErrorCode status = U_ZERO_ERROR;
1185    BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1186    TEST_ASSERT_SUCCESS(status);
1187    if (U_FAILURE(status)) {
1188        return;
1189    }
1190// Check for status first for better handling of no data errors.
1191    TEST_ASSERT(bi != NULL);
1192    if (bi == NULL) {
1193        return;
1194    }
1195
1196    UnicodeString s("One.\\u00ad Two.", -1, US_INV);
1197    //               01234      56789
1198    s = s.unescape();
1199    bi->setText(s);
1200    int pos = bi->next();
1201    TEST_ASSERT(pos == 6);
1202    pos = bi->next();
1203    TEST_ASSERT(pos == 10);
1204    pos = bi->previous();
1205    TEST_ASSERT(pos == 6);
1206    delete bi;
1207}
1208
1209
1210
1211/**
1212 * Test Japanese Line Break
1213 * @bug 4095322
1214 */
1215void RBBITest::TestJapaneseLineBreak()
1216{
1217#if 0
1218    // Test needs updating some more...   Dump it for now.
1219
1220
1221    // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
1222    //        as opening and closing punctuation for line breaking.
1223    //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
1224    //        from these tests.    6-13-2002
1225    //
1226    UErrorCode status = U_ZERO_ERROR;
1227    UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
1228    UnicodeString precedingChars = CharsToUnicodeString(
1229        //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
1230        "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
1231    UnicodeString followingChars = CharsToUnicodeString(
1232        // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
1233        ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
1234        // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
1235        ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
1236        "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
1237    BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
1238
1239    int32_t i;
1240    if (U_FAILURE(status))
1241    {
1242        errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
1243        return;
1244    }
1245
1246    for (i = 0; i < precedingChars.length(); i++) {
1247        testString.setCharAt(1, precedingChars[i]);
1248        iter->setText(testString);
1249        int32_t j = iter->first();
1250        if (j != 0)
1251            errln("ja line break failure: failed to start at 0");
1252        j = iter->next();
1253        if (j != 1)
1254            errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
1255                        + "' (" + ((int)(precedingChars[i])) + ")");
1256        j = iter->next();
1257        if (j != 3)
1258            errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
1259                        + "' (" + ((int)(precedingChars[i])) + ")");
1260    }
1261
1262    for (i = 0; i < followingChars.length(); i++) {
1263        testString.setCharAt(1, followingChars[i]);
1264        iter->setText(testString);
1265        int j = iter->first();
1266        if (j != 0)
1267            errln("ja line break failure: failed to start at 0");
1268        j = iter->next();
1269        if (j != 2)
1270            errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
1271                        + "' (" + ((int)(followingChars[i])) + ")");
1272        j = iter->next();
1273        if (j != 3)
1274            errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
1275                        + "' (" + ((int)(followingChars[i])) + ")");
1276    }
1277    delete iter;
1278#endif
1279}
1280
1281
1282//------------------------------------------------------------------------------
1283//
1284//   RBBITest::Extended    Run  RBBI Tests from an external test data file
1285//
1286//------------------------------------------------------------------------------
1287
1288struct TestParams {
1289    BreakIterator   *bi;
1290    UnicodeString    dataToBreak;
1291    UVector32       *expectedBreaks;
1292    UVector32       *srcLine;
1293    UVector32       *srcCol;
1294};
1295
1296void RBBITest::executeTest(TestParams *t) {
1297    int32_t    bp;
1298    int32_t    prevBP;
1299    int32_t    i;
1300
1301    if (t->bi == NULL) {
1302        return;
1303    }
1304
1305    t->bi->setText(t->dataToBreak);
1306    //
1307    //  Run the iterator forward
1308    //
1309    prevBP = -1;
1310    for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1311        if (prevBP ==  bp) {
1312            // Fail for lack of forward progress.
1313            errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
1314                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1315            break;
1316        }
1317
1318        // Check that there were we didn't miss an expected break between the last one
1319        //  and this one.
1320        for (i=prevBP+1; i<bp; i++) {
1321            if (t->expectedBreaks->elementAti(i) != 0) {
1322                int expected[] = {0, i};
1323                printStringBreaks(t->dataToBreak, expected, 2);
1324                errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1325                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1326            }
1327        }
1328
1329        // Check that the break we did find was expected
1330        if (t->expectedBreaks->elementAti(bp) == 0) {
1331            int expected[] = {0, bp};
1332            printStringBreaks(t->dataToBreak, expected, 2);
1333            errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1334                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1335        } else {
1336            // The break was expected.
1337            //   Check that the {nnn} tag value is correct.
1338            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1339            if (expectedTagVal == -1) {
1340                expectedTagVal = 0;
1341            }
1342            int32_t line = t->srcLine->elementAti(bp);
1343            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1344            if (rs != expectedTagVal) {
1345                errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1346                      "          Actual, Expected status = %4d, %4d",
1347                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1348            }
1349        }
1350
1351
1352        prevBP = bp;
1353    }
1354
1355    // Verify that there were no missed expected breaks after the last one found
1356    for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
1357        if (t->expectedBreaks->elementAti(i) != 0) {
1358            errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1359                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1360        }
1361    }
1362
1363    //
1364    //  Run the iterator backwards, verify that the same breaks are found.
1365    //
1366    prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
1367    for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1368        if (prevBP ==  bp) {
1369            // Fail for lack of progress.
1370            errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1371                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1372            break;
1373        }
1374
1375        // Check that there were we didn't miss an expected break between the last one
1376        //  and this one.  (UVector returns zeros for index out of bounds.)
1377        for (i=prevBP-1; i>bp; i--) {
1378            if (t->expectedBreaks->elementAti(i) != 0) {
1379                errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1380                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1381            }
1382        }
1383
1384        // Check that the break we did find was expected
1385        if (t->expectedBreaks->elementAti(bp) == 0) {
1386            errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1387                   bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1388        } else {
1389            // The break was expected.
1390            //   Check that the {nnn} tag value is correct.
1391            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1392            if (expectedTagVal == -1) {
1393                expectedTagVal = 0;
1394            }
1395            int line = t->srcLine->elementAti(bp);
1396            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1397            if (rs != expectedTagVal) {
1398                errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1399                      "          Actual, Expected status = %4d, %4d",
1400                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1401            }
1402        }
1403
1404        prevBP = bp;
1405    }
1406
1407    // Verify that there were no missed breaks prior to the last one found
1408    for (i=prevBP-1; i>=0; i--) {
1409        if (t->expectedBreaks->elementAti(i) != 0) {
1410            errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1411                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1412        }
1413    }
1414}
1415
1416
1417void RBBITest::TestExtended() {
1418#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1419    UErrorCode      status  = U_ZERO_ERROR;
1420    Locale          locale("");
1421
1422    UnicodeString       rules;
1423    TestParams          tp;
1424    tp.bi             = NULL;
1425    tp.expectedBreaks = new UVector32(status);
1426    tp.srcLine        = new UVector32(status);
1427    tp.srcCol         = new UVector32(status);
1428
1429    RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
1430    if (U_FAILURE(status)) {
1431        dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1432    }
1433
1434
1435    //
1436    //  Open and read the test data file.
1437    //
1438    const char *testDataDirectory = IntlTest::getSourceTestData(status);
1439    char testFileName[1000];
1440    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1441        errln("Can't open test data.  Path too long.");
1442        return;
1443    }
1444    strcpy(testFileName, testDataDirectory);
1445    strcat(testFileName, "rbbitst.txt");
1446
1447    int    len;
1448    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1449    if (U_FAILURE(status)) {
1450        return; /* something went wrong, error already output */
1451    }
1452
1453
1454
1455
1456    //
1457    //  Put the test data into a UnicodeString
1458    //
1459    UnicodeString testString(FALSE, testFile, len);
1460
1461    enum EParseState{
1462        PARSE_COMMENT,
1463        PARSE_TAG,
1464        PARSE_DATA,
1465        PARSE_NUM
1466    }
1467    parseState = PARSE_TAG;
1468
1469    EParseState savedState = PARSE_TAG;
1470
1471    static const UChar CH_LF        = 0x0a;
1472    static const UChar CH_CR        = 0x0d;
1473    static const UChar CH_HASH      = 0x23;
1474    /*static const UChar CH_PERIOD    = 0x2e;*/
1475    static const UChar CH_LT        = 0x3c;
1476    static const UChar CH_GT        = 0x3e;
1477    static const UChar CH_BACKSLASH = 0x5c;
1478    static const UChar CH_BULLET    = 0x2022;
1479
1480    int32_t    lineNum  = 1;
1481    int32_t    colStart = 0;
1482    int32_t    column   = 0;
1483    int32_t    charIdx  = 0;
1484
1485    int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1486
1487    for (charIdx = 0; charIdx < len; ) {
1488        status = U_ZERO_ERROR;
1489        UChar  c = testString.charAt(charIdx);
1490        charIdx++;
1491        if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1492            // treat CRLF as a unit
1493            c = CH_LF;
1494            charIdx++;
1495        }
1496        if (c == CH_LF || c == CH_CR) {
1497            lineNum++;
1498            colStart = charIdx;
1499        }
1500        column = charIdx - colStart + 1;
1501
1502        switch (parseState) {
1503        case PARSE_COMMENT:
1504            if (c == 0x0a || c == 0x0d) {
1505                parseState = savedState;
1506            }
1507            break;
1508
1509        case PARSE_TAG:
1510            {
1511            if (c == CH_HASH) {
1512                parseState = PARSE_COMMENT;
1513                savedState = PARSE_TAG;
1514                break;
1515            }
1516            if (u_isUWhiteSpace(c)) {
1517                break;
1518            }
1519            if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1520                delete tp.bi;
1521                tp.bi = BreakIterator::createWordInstance(locale,  status);
1522                charIdx += 5;
1523                break;
1524            }
1525            if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1526                delete tp.bi;
1527                tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1528                charIdx += 5;
1529                break;
1530            }
1531            if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1532                delete tp.bi;
1533                tp.bi = BreakIterator::createLineInstance(locale,  status);
1534                charIdx += 5;
1535                break;
1536            }
1537            if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1538                delete tp.bi;
1539                tp.bi = NULL;
1540                tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1541                charIdx += 5;
1542                break;
1543            }
1544            if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1545                delete tp.bi;
1546                tp.bi = BreakIterator::createTitleInstance(locale,  status);
1547                charIdx += 6;
1548                break;
1549            }
1550
1551            // <locale  loc_name>
1552            localeMatcher.reset(testString);
1553            if (localeMatcher.lookingAt(charIdx-1, status)) {
1554                UnicodeString localeName = localeMatcher.group(1, status);
1555                char localeName8[100];
1556                localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1557                locale = Locale::createFromName(localeName8);
1558                charIdx += localeMatcher.group(0, status).length();
1559                TEST_ASSERT_SUCCESS(status);
1560                break;
1561            }
1562            if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1563                parseState = PARSE_DATA;
1564                charIdx += 5;
1565                tp.dataToBreak = "";
1566                tp.expectedBreaks->removeAllElements();
1567                tp.srcCol ->removeAllElements();
1568                tp.srcLine->removeAllElements();
1569                break;
1570            }
1571
1572            errln("line %d: Tag expected in test file.", lineNum);
1573            parseState = PARSE_COMMENT;
1574            savedState = PARSE_DATA;
1575            goto end_test; // Stop the test.
1576            }
1577            break;
1578
1579        case PARSE_DATA:
1580            if (c == CH_BULLET) {
1581                int32_t  breakIdx = tp.dataToBreak.length();
1582                tp.expectedBreaks->setSize(breakIdx+1);
1583                tp.expectedBreaks->setElementAt(-1, breakIdx);
1584                tp.srcLine->setSize(breakIdx+1);
1585                tp.srcLine->setElementAt(lineNum, breakIdx);
1586                tp.srcCol ->setSize(breakIdx+1);
1587                tp.srcCol ->setElementAt(column, breakIdx);
1588                break;
1589            }
1590
1591            if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1592                // Add final entry to mappings from break location to source file position.
1593                //  Need one extra because last break position returned is after the
1594                //    last char in the data, not at the last char.
1595                tp.srcLine->addElement(lineNum, status);
1596                tp.srcCol ->addElement(column, status);
1597
1598                parseState = PARSE_TAG;
1599                charIdx += 6;
1600
1601                // RUN THE TEST!
1602                executeTest(&tp);
1603                break;
1604            }
1605
1606            if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1607                // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1608                // Get the code point from the name and insert it into the test data.
1609                //   (Damn, no API takes names in Unicode  !!!
1610                //    we've got to take it back to char *)
1611                int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1612                int32_t nameLength = nameEndIdx - (charIdx+2);
1613                char charNameBuf[200];
1614                UChar32 theChar = -1;
1615                if (nameEndIdx != -1) {
1616                    UErrorCode status = U_ZERO_ERROR;
1617                    testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1618                    charNameBuf[sizeof(charNameBuf)-1] = 0;
1619                    theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1620                    if (U_FAILURE(status)) {
1621                        theChar = -1;
1622                    }
1623                }
1624                if (theChar == -1) {
1625                    errln("Error in named character in test file at line %d, col %d",
1626                        lineNum, column);
1627                } else {
1628                    // Named code point was recognized.  Insert it
1629                    //   into the test data.
1630                    tp.dataToBreak.append(theChar);
1631                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
1632                        tp.srcLine->addElement(lineNum, status);
1633                        tp.srcCol ->addElement(column, status);
1634                    }
1635                }
1636                if (nameEndIdx > charIdx) {
1637                    charIdx = nameEndIdx+1;
1638
1639                }
1640                break;
1641            }
1642
1643
1644
1645
1646            if (testString.compare(charIdx-1, 2, "<>") == 0) {
1647                charIdx++;
1648                int32_t  breakIdx = tp.dataToBreak.length();
1649                tp.expectedBreaks->setSize(breakIdx+1);
1650                tp.expectedBreaks->setElementAt(-1, breakIdx);
1651                tp.srcLine->setSize(breakIdx+1);
1652                tp.srcLine->setElementAt(lineNum, breakIdx);
1653                tp.srcCol ->setSize(breakIdx+1);
1654                tp.srcCol ->setElementAt(column, breakIdx);
1655                break;
1656            }
1657
1658            if (c == CH_LT) {
1659                tagValue   = 0;
1660                parseState = PARSE_NUM;
1661                break;
1662            }
1663
1664            if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1665                parseState = PARSE_COMMENT;
1666                savedState = PARSE_DATA;
1667                break;
1668            }
1669
1670            if (c == CH_BACKSLASH) {
1671                // Check for \ at end of line, a line continuation.
1672                //     Advance over (discard) the newline
1673                UChar32 cp = testString.char32At(charIdx);
1674                if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1675                    // We have a CR LF
1676                    //  Need an extra increment of the input ptr to move over both of them
1677                    charIdx++;
1678                }
1679                if (cp == CH_LF || cp == CH_CR) {
1680                    lineNum++;
1681                    colStart = charIdx;
1682                    charIdx++;
1683                    break;
1684                }
1685
1686                // Let unescape handle the back slash.
1687                cp = testString.unescapeAt(charIdx);
1688                if (cp != -1) {
1689                    // Escape sequence was recognized.  Insert the char
1690                    //   into the test data.
1691                    tp.dataToBreak.append(cp);
1692                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
1693                        tp.srcLine->addElement(lineNum, status);
1694                        tp.srcCol ->addElement(column, status);
1695                    }
1696                    break;
1697                }
1698
1699
1700                // Not a recognized backslash escape sequence.
1701                // Take the next char as a literal.
1702                //  TODO:  Should this be an error?
1703                c = testString.charAt(charIdx);
1704                charIdx = testString.moveIndex32(charIdx, 1);
1705            }
1706
1707            // Normal, non-escaped data char.
1708            tp.dataToBreak.append(c);
1709
1710            // Save the mapping from offset in the data to line/column numbers in
1711            //   the original input file.  Will be used for better error messages only.
1712            //   If there's an expected break before this char, the slot in the mapping
1713            //     vector will already be set for this char; don't overwrite it.
1714            if (tp.dataToBreak.length() > tp.srcLine->size()) {
1715                tp.srcLine->addElement(lineNum, status);
1716                tp.srcCol ->addElement(column, status);
1717            }
1718            break;
1719
1720
1721        case PARSE_NUM:
1722            // We are parsing an expected numeric tag value, like <1234>,
1723            //   within a chunk of data.
1724            if (u_isUWhiteSpace(c)) {
1725                break;
1726            }
1727
1728            if (c == CH_GT) {
1729                // Finished the number.  Add the info to the expected break data,
1730                //   and switch parse state back to doing plain data.
1731                parseState = PARSE_DATA;
1732                if (tagValue == 0) {
1733                    tagValue = -1;
1734                }
1735                int32_t  breakIdx = tp.dataToBreak.length();
1736                tp.expectedBreaks->setSize(breakIdx+1);
1737                tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1738                tp.srcLine->setSize(breakIdx+1);
1739                tp.srcLine->setElementAt(lineNum, breakIdx);
1740                tp.srcCol ->setSize(breakIdx+1);
1741                tp.srcCol ->setElementAt(column, breakIdx);
1742                break;
1743            }
1744
1745            if (u_isdigit(c)) {
1746                tagValue = tagValue*10 + u_charDigitValue(c);
1747                break;
1748            }
1749
1750            errln("Syntax Error in test file at line %d, col %d",
1751                lineNum, column);
1752            parseState = PARSE_COMMENT;
1753            goto end_test; // Stop the test
1754            break;
1755        }
1756
1757
1758        if (U_FAILURE(status)) {
1759            errln("ICU Error %s while parsing test file at line %d.",
1760                u_errorName(status), lineNum);
1761            status = U_ZERO_ERROR;
1762            goto end_test; // Stop the test
1763        }
1764
1765    }
1766
1767end_test:
1768    delete tp.bi;
1769    delete tp.expectedBreaks;
1770    delete tp.srcLine;
1771    delete tp.srcCol;
1772    delete [] testFile;
1773#endif
1774}
1775
1776void RBBITest::TestThaiBreaks() {
1777    UErrorCode status=U_ZERO_ERROR;
1778    BreakIterator* b;
1779    Locale locale = Locale("th");
1780    int32_t p, index;
1781    UChar c[]= {
1782            0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
1783            0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
1784            0x0E16, 0x0E49, 0x0E33
1785    };
1786    int32_t expectedWordResult[] = {
1787            2, 3, 6, 10, 11, 15, 17, 20, 22
1788    };
1789    int32_t expectedLineResult[] = {
1790            3, 6, 11, 15, 17, 20, 22
1791    };
1792    int32_t size = sizeof(c)/sizeof(UChar);
1793    UnicodeString text=UnicodeString(c);
1794
1795    b = BreakIterator::createWordInstance(locale, status);
1796    if (U_FAILURE(status)) {
1797        errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
1798        return;
1799    }
1800    b->setText(text);
1801    p = index = 0;
1802    while ((p=b->next())!=BreakIterator::DONE && p < size) {
1803        if (p != expectedWordResult[index++]) {
1804            errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
1805        }
1806    }
1807    delete b;
1808
1809    b = BreakIterator::createLineInstance(locale, status);
1810    if (U_FAILURE(status)) {
1811        printf("Unable to create thai line break iterator.\n");
1812        return;
1813    }
1814    b->setText(text);
1815    p = index = 0;
1816    while ((p=b->next())!=BreakIterator::DONE && p < size) {
1817        if (p != expectedLineResult[index++]) {
1818            errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
1819        }
1820    }
1821
1822    delete b;
1823}
1824
//
//  Data for the tailored break iterator tests.
//
//  Each group consists of
//     xxxText      the text to break, as an invariant-character string containing
//                  \uXXXX escapes (it is unescaped before use),
//     xxxTOffsets  the break offsets expected from the tailored (locale specific) iterator,
//     xxxROffsets  the break offsets expected from the root iterator.
//

//  Type UBRK_WORD, locale "en_US_POSIX":
//     words don't include colon or period (cldrbug #1969).
static const char posxWordText[] =
    "Can't have breaks in xx:yy or struct.field for CS-types.";
static const int32_t posxWordTOffsets[] = {
    5, 6, 10, 11, 17, 18, 20, 21,
    23, 24,
    26, 27, 29, 30,
    36, 37,
    42, 43, 46, 47, 49, 50, 55, 56
};
static const int32_t posxWordROffsets[] = {
    5, 6, 10, 11, 17, 18, 20, 21,
    26, 27, 29, 30,
    42, 43, 46, 47, 49, 50, 55, 56
};

//  Type UBRK_WORD, locale "ja":
//     no breaks within runs of hiragana or runs of ideographs, where the latter
//     include \u3005 \u3007 \u303B (cldrbug #2009).
static const char jaWordText[] =
    "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
    "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
static const int32_t jaWordTOffsets[] = {
    2, 3,
    7, 8, 14,
    17, 18,
    20, 21, 24,
    27, 28
};
static const int32_t jaWordROffsets[] = {
    1, 2, 3, 4, 5, 6, 7, 8, 14,
    15, 16, 17, 18, 19, 20, 21, 24,
    25, 26, 27, 28
};

//  Type UBRK_SENTENCE, locale "el":
//     there is a break after the Greek question mark (cldrbug #2069).
static const char elSentText[] =
    "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
    "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
static const int32_t elSentTOffsets[] = {
    8, 14,
    20, 27, 35, 36
};
static const int32_t elSentROffsets[] = {
    20, 27, 35, 36
};

//  Type UBRK_CHARACTER, locale "th":
//     clusters should not include spacing Thai/Lao vowels (prefix or postfix),
//     except for [SARA] AM (cldrbug #2161).
static const char thCharText[] =
    "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 "
    "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) "
    "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 ";
static const int32_t thCharTOffsets[] = {
    1, 2, 3, 5, 6, 7, 8, 9, 10, 11,
    12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,
    29, 30, 32, 33, 35, 37, 38, 39, 40, 41
};
static const int32_t thCharROffsets[] = {
    1, 3, 5, 6, 7, 8, 9, 11,
    12, 13, 15, 17, 19, 20, 22, 24, 26, 27, 28,
    29, 32, 33, 35, 37, 38, 40, 41
};
1856
1857typedef struct {
1858    UBreakIteratorType  type;
1859    const char *        locale;
1860    const char *        escapedText;
1861    const int32_t *     tailoredOffsets;
1862    int32_t             tailoredOffsetsCount;
1863    const int32_t *     rootOffsets;
1864    int32_t             rootOffsetsCount;
1865} TailoredBreakItem;
1866
1867#define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0]))
1868
1869static const TailoredBreakItem tbItems[] = {
1870    { UBRK_WORD,      "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) },
1871    { UBRK_WORD,      "ja",          jaWordText,   ARRAY_PTR_LEN(jaWordTOffsets),   ARRAY_PTR_LEN(jaWordROffsets)   },
1872    { UBRK_SENTENCE,  "el",          elSentText,   ARRAY_PTR_LEN(elSentTOffsets),   ARRAY_PTR_LEN(elSentROffsets)   },
1873    { UBRK_CHARACTER, "th",          thCharText,   ARRAY_PTR_LEN(thCharTOffsets),   ARRAY_PTR_LEN(thCharROffsets)   },
1874    { UBRK_CHARACTER, NULL,          NULL,         NULL,0,                          NULL,0                          } // terminator
1875};
1876
//
//  formatOffsets    Format a list of break offsets as text, each one preceded by a
//                   single space (" 5 6 10"), for use in error messages.
//
//    buffer     receives the NUL terminated result.
//    buflen     capacity of buffer, in chars, including the terminating NUL.
//    count      number of entries in offsets.
//    offsets    the values to format.
//
//    The output is always NUL terminated when buflen > 0, including when count is zero.
//    If the values do not all fit, formatting stops after the last value that fits
//    completely; a partially written number is never left in the buffer.
//
static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) {
    if (buffer == NULL || buflen <= 0) {
        return;
    }
    buffer[0] = 0;
    while (count-- > 0) {
        int writeCount = snprintf(buffer, (size_t)buflen, " %d", (int)*offsets++);
        if (writeCount < 0 || writeCount >= buflen) {
            // Error, or this value did not fit.  Discard whatever part of it was written.
            buffer[0] = 0;
            break;
        }
        buffer += writeCount;
        buflen -= writeCount;
    }
}
1885
1886enum { kMaxOffsetCount = 128 };
1887
1888void RBBITest::TBTest(BreakIterator* brkitr, int type, const char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) {
1889    brkitr->setText( CharsToUnicodeString(escapedText) );
1890    int32_t foundOffsets[kMaxOffsetCount];
1891    int32_t offset, foundOffsetsCount = 0;
1892    // do forwards iteration test
1893    while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) {
1894        foundOffsets[foundOffsetsCount++] = offset;
1895    }
1896    if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) {
1897        // log error for forwards test
1898        char formatExpect[512], formatFound[512];
1899        formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
1900        formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets);
1901        errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n",
1902                type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound);
1903    } else {
1904        // do backwards iteration test
1905        --foundOffsetsCount; // back off one from the end offset
1906        while ( foundOffsetsCount > 0 ) {
1907            offset = brkitr->previous();
1908            if ( offset != foundOffsets[--foundOffsetsCount] ) {
1909                // log error for backwards test
1910                char formatExpect[512];
1911                formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
1912                errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n",
1913                        type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]);
1914                break;
1915            }
1916        }
1917    }
1918}
1919
1920void RBBITest::TestTailoredBreaks() {
1921    const TailoredBreakItem * tbItemPtr;
1922    Locale rootLocale = Locale("root");
1923    for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) {
1924        Locale testLocale = Locale(tbItemPtr->locale);
1925        BreakIterator * tailoredBrkiter;
1926        BreakIterator * rootBrkiter;
1927        UErrorCode status = U_ZERO_ERROR;
1928        switch (tbItemPtr->type) {
1929            case UBRK_CHARACTER:
1930                tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status);
1931                rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status);
1932                break;
1933            case UBRK_WORD:
1934                tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status);
1935                rootBrkiter = BreakIterator::createWordInstance(rootLocale, status);
1936                break;
1937            case UBRK_LINE:
1938                tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status);
1939                rootBrkiter = BreakIterator::createLineInstance(rootLocale, status);
1940                break;
1941            case UBRK_SENTENCE:
1942                tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status);
1943                rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status);
1944                break;
1945            default:
1946                status = U_UNSUPPORTED_ERROR;
1947                break;
1948        }
1949        if (U_FAILURE(status)) {
1950            errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status));
1951            continue;
1952        }
1953        TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount);
1954        TBTest(rootBrkiter,     (int)(tbItemPtr->type), "root",            tbItemPtr->escapedText, tbItemPtr->rootOffsets,     tbItemPtr->rootOffsetsCount);
1955
1956        delete rootBrkiter;
1957        delete tailoredBrkiter;
1958    }
1959}
1960
1961
1962//-------------------------------------------------------------------------------
1963//
1964//    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1965//    return the datain one big UChar * buffer, which the caller must delete.
1966//
1967//    parameters:
1968//          fileName:   the name of the file, with no directory part.  The test data directory
1969//                      is assumed.
1970//          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1971//          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1972//                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1973//                      Pass NULL for the system default encoding.
1974//          status
1975//    returns:
1976//                      The file data, converted to UChar.
1977//                      The caller must delete this when done with
1978//                           delete [] theBuffer;
1979//
1980//    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1981//           Move this function to some common place.
1982//
1983//--------------------------------------------------------------------------------
1984UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1985    UChar       *retPtr  = NULL;
1986    char        *fileBuf = NULL;
1987    UConverter* conv     = NULL;
1988    FILE        *f       = NULL;
1989
1990    ulen = 0;
1991    if (U_FAILURE(status)) {
1992        return retPtr;
1993    }
1994
1995    //
1996    //  Open the file.
1997    //
1998    f = fopen(fileName, "rb");
1999    if (f == 0) {
2000        dataerrln("Error opening test data file %s\n", fileName);
2001        status = U_FILE_ACCESS_ERROR;
2002        return NULL;
2003    }
2004    //
2005    //  Read it in
2006    //
2007    int   fileSize;
2008    int   amt_read;
2009
2010    fseek( f, 0, SEEK_END);
2011    fileSize = ftell(f);
2012    fileBuf = new char[fileSize];
2013    fseek(f, 0, SEEK_SET);
2014    amt_read = fread(fileBuf, 1, fileSize, f);
2015    if (amt_read != fileSize || fileSize <= 0) {
2016        errln("Error reading test data file.");
2017        goto cleanUpAndReturn;
2018    }
2019
2020    //
2021    // Look for a Unicode Signature (BOM) on the data just read
2022    //
2023    int32_t        signatureLength;
2024    const char *   fileBufC;
2025    const char*    bomEncoding;
2026
2027    fileBufC = fileBuf;
2028    bomEncoding = ucnv_detectUnicodeSignature(
2029        fileBuf, fileSize, &signatureLength, &status);
2030    if(bomEncoding!=NULL ){
2031        fileBufC  += signatureLength;
2032        fileSize  -= signatureLength;
2033        encoding = bomEncoding;
2034    }
2035
2036    //
2037    // Open a converter to take the rule file to UTF-16
2038    //
2039    conv = ucnv_open(encoding, &status);
2040    if (U_FAILURE(status)) {
2041        goto cleanUpAndReturn;
2042    }
2043
2044    //
2045    // Convert the rules to UChar.
2046    //  Preflight first to determine required buffer size.
2047    //
2048    ulen = ucnv_toUChars(conv,
2049        NULL,           //  dest,
2050        0,              //  destCapacity,
2051        fileBufC,
2052        fileSize,
2053        &status);
2054    if (status == U_BUFFER_OVERFLOW_ERROR) {
2055        // Buffer Overflow is expected from the preflight operation.
2056        status = U_ZERO_ERROR;
2057
2058        retPtr = new UChar[ulen+1];
2059        ucnv_toUChars(conv,
2060            retPtr,       //  dest,
2061            ulen+1,
2062            fileBufC,
2063            fileSize,
2064            &status);
2065    }
2066
2067cleanUpAndReturn:
2068    fclose(f);
2069    delete []fileBuf;
2070    ucnv_close(conv);
2071    if (U_FAILURE(status)) {
2072        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
2073        delete retPtr;
2074        retPtr = 0;
2075        ulen   = 0;
2076    };
2077    return retPtr;
2078}
2079
2080
2081
2082//--------------------------------------------------------------------------------------------
2083//
2084//   Run tests from each of the boundary test data files distributed by the Unicode Consortium
2085//
2086//-------------------------------------------------------------------------------------------
2087void RBBITest::TestUnicodeFiles() {
2088    RuleBasedBreakIterator  *bi;
2089    UErrorCode               status = U_ZERO_ERROR;
2090
2091    bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getDefault(), status);
2092    TEST_ASSERT_SUCCESS(status);
2093    if (U_SUCCESS(status)) {
2094        runUnicodeTestData("GraphemeBreakTest.txt", bi);
2095    }
2096    delete bi;
2097
2098    bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status);
2099    TEST_ASSERT_SUCCESS(status);
2100    if (U_SUCCESS(status)) {
2101        runUnicodeTestData("WordBreakTest.txt", bi);
2102    }
2103    delete bi;
2104
2105    bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
2106    TEST_ASSERT_SUCCESS(status);
2107    if (U_SUCCESS(status)) {
2108        runUnicodeTestData("SentenceBreakTest.txt", bi);
2109    }
2110    delete bi;
2111
2112    bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
2113    TEST_ASSERT_SUCCESS(status);
2114    if (U_SUCCESS(status)) {
2115        runUnicodeTestData("LineBreakTest.txt", bi);
2116    }
2117    delete bi;
2118}
2119
2120
2121//--------------------------------------------------------------------------------------------
2122//
2123//   Run tests from one of the boundary test data files distributed by the Unicode Consortium
2124//
2125//-------------------------------------------------------------------------------------------
2126void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
2127#if !UCONFIG_NO_REGULAR_EXPRESSIONS
2128    UErrorCode  status = U_ZERO_ERROR;
2129
2130    //
2131    //  Open and read the test data file, put it into a UnicodeString.
2132    //
2133    const char *testDataDirectory = IntlTest::getSourceTestData(status);
2134    char testFileName[1000];
2135    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
2136        dataerrln("Can't open test data.  Path too long.");
2137        return;
2138    }
2139    strcpy(testFileName, testDataDirectory);
2140    strcat(testFileName, fileName);
2141
2142    logln("Opening data file %s\n", fileName);
2143
2144    int    len;
2145    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
2146    if (status != U_FILE_ACCESS_ERROR) {
2147        TEST_ASSERT_SUCCESS(status);
2148        TEST_ASSERT(testFile != NULL);
2149    }
2150    if (U_FAILURE(status) || testFile == NULL) {
2151        return; /* something went wrong, error already output */
2152    }
2153    UnicodeString testFileAsString(TRUE, testFile, len);
2154
2155    //
2156    //  Parse the test data file using a regular expression.
2157    //  Each kind of token is recognized in its own capture group; what type of item was scanned
2158    //     is identified by which group had a match.
2159    //
2160    //    Caputure Group #                  1          2            3            4           5
2161    //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
2162    //
2163    UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
2164    RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
2165    UnicodeString   testString;
2166    UVector32       breakPositions(status);
2167    int             lineNumber = 1;
2168    TEST_ASSERT_SUCCESS(status);
2169    if (U_FAILURE(status)) {
2170        return;
2171    }
2172
2173    //
2174    //  Scan through each test case, building up the string to be broken in testString,
2175    //   and the positions that should be boundaries in the breakPositions vector.
2176    //
2177    while (tokenMatcher.find()) {
2178        if (tokenMatcher.start(1, status) >= 0) {
2179            // Scanned a divide sign, indicating a break position in the test data.
2180            if (testString.length()>0) {
2181                breakPositions.addElement(testString.length(), status);
2182            }
2183        }
2184        else if (tokenMatcher.start(2, status) >= 0) {
2185            // Scanned an 'x', meaning no break at this position in the test data
2186            //   Nothing to be done here.
2187            }
2188        else if (tokenMatcher.start(3, status) >= 0) {
2189            // Scanned Hex digits.  Convert them to binary, append to the character data string.
2190            const UnicodeString &hexNumber = tokenMatcher.group(3, status);
2191            int length = hexNumber.length();
2192            if (length<=8) {
2193                char buf[10];
2194                hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
2195                UChar32 c = (UChar32)strtol(buf, NULL, 16);
2196                if (c<=0x10ffff) {
2197                    testString.append(c);
2198                } else {
2199                    errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
2200                       fileName, lineNumber);
2201                }
2202            } else {
2203                errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
2204                       fileName, lineNumber);
2205             }
2206        }
2207        else if (tokenMatcher.start(4, status) >= 0) {
2208            // Scanned to end of a line, possibly skipping over a comment in the process.
2209            //   If the line from the file contained test data, run the test now.
2210            //
2211            if (testString.length() > 0) {
2212                checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
2213            }
2214
2215            // Clear out this test case.
2216            //    The string and breakPositions vector will be refilled as the next
2217            //       test case is parsed.
2218            testString.remove();
2219            breakPositions.removeAllElements();
2220            lineNumber++;
2221        } else {
2222            // Scanner catchall.  Something unrecognized appeared on the line.
2223            char token[16];
2224            UnicodeString uToken = tokenMatcher.group(0, status);
2225            uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
2226            token[sizeof(token)-1] = 0;
2227            errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
2228
2229            // Clean up, in preparation for continuing with the next line.
2230            testString.remove();
2231            breakPositions.removeAllElements();
2232            lineNumber++;
2233        }
2234        TEST_ASSERT_SUCCESS(status);
2235        if (U_FAILURE(status)) {
2236            break;
2237        }
2238    }
2239
2240    delete [] testFile;
2241 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
2242}
2243
2244//--------------------------------------------------------------------------------------------
2245//
2246//   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
2247//                            test data files.  Do only a simple, forward-only check -
2248//                            this test is mostly to check that ICU and the Unicode
2249//                            data agree with each other.
2250//
2251//--------------------------------------------------------------------------------------------
2252void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
2253                         const UnicodeString &testString,   // Text data to be broken
2254                         UVector32 *breakPositions,         // Positions where breaks should be found.
2255                         RuleBasedBreakIterator *bi) {
2256    int32_t pos;                 // Break Position in the test string
2257    int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
2258    int32_t expectedPos;         // Expected break position (index into test string)
2259
2260    bi->setText(testString);
2261    pos = bi->first();
2262    pos = bi->next();
2263
2264    while (pos != BreakIterator::DONE) {
2265        if (expectedI >= breakPositions->size()) {
2266            errln("Test file \"%s\", line %d, unexpected break found at position %d",
2267                testFileName, lineNumber, pos);
2268            break;
2269        }
2270        expectedPos = breakPositions->elementAti(expectedI);
2271        if (pos < expectedPos) {
2272            errln("Test file \"%s\", line %d, unexpected break found at position %d",
2273                testFileName, lineNumber, pos);
2274            break;
2275        }
2276        if (pos > expectedPos) {
2277            errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2278                testFileName, lineNumber, expectedPos);
2279            break;
2280        }
2281        pos = bi->next();
2282        expectedI++;
2283    }
2284
2285    if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
2286        errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2287            testFileName, lineNumber, breakPositions->elementAti(expectedI));
2288    }
2289}
2290
2291
2292
2293#if !UCONFIG_NO_REGULAR_EXPRESSIONS
2294//---------------------------------------------------------------------------------------
2295//
2296//   classs RBBIMonkeyKind
2297//
2298//      Monkey Test for Break Iteration
2299//      Abstract interface class.   Concrete derived classes independently
2300//      implement the break rules for different iterator types.
2301//
2302//      The Monkey Test itself uses doesn't know which type of break iterator it is
2303//      testing, but works purely in terms of the interface defined here.
2304//
2305//---------------------------------------------------------------------------------------
2306class RBBIMonkeyKind {
2307public:
2308    // Return a UVector of UnicodeSets, representing the character classes used
2309    //   for this type of iterator.
2310    virtual  UVector  *charClasses() = 0;
2311
2312    // Set the test text on which subsequent calls to next() will operate
2313    virtual  void      setText(const UnicodeString &s) = 0;
2314
2315    // Find the next break postion, starting from the prev break position, or from zero.
2316    // Return -1 after reaching end of string.
2317    virtual  int32_t   next(int32_t i) = 0;
2318
2319    virtual ~RBBIMonkeyKind();
2320    UErrorCode       deferredStatus;
2321
2322
2323protected:
2324    RBBIMonkeyKind();
2325
2326private:
2327};
2328
2329RBBIMonkeyKind::RBBIMonkeyKind() {
2330    deferredStatus = U_ZERO_ERROR;
2331}
2332
2333RBBIMonkeyKind::~RBBIMonkeyKind() {
2334}
2335
2336
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Work-alikes for the standard library rand() and srand().
//                    The library functions are not used, in order to
//                      1.  Get the same results on all platforms.
//                      2.  Have access to the current seed, so that failures
//                          can be reproduced more easily.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    // Unsigned arithmetic; wraps modulo 2^32.
    m_seed = m_seed * kMultiplier + kIncrement;

    // The result is bits 16..30 of the new seed.
    return (m_seed >> 16) & 0x7fff;
}
2352
2353
2354//------------------------------------------------------------------------------------------
2355//
2356//   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
2357//                             of RBBIMonkeyKind.
2358//
2359//------------------------------------------------------------------------------------------
2360class RBBICharMonkey: public RBBIMonkeyKind {
2361public:
2362    RBBICharMonkey();
2363    virtual          ~RBBICharMonkey();
2364    virtual  UVector *charClasses();
2365    virtual  void     setText(const UnicodeString &s);
2366    virtual  int32_t  next(int32_t i);
2367private:
2368    UVector   *fSets;
2369
2370    UnicodeSet  *fCRLFSet;
2371    UnicodeSet  *fControlSet;
2372    UnicodeSet  *fExtendSet;
2373    UnicodeSet  *fPrependSet;
2374    UnicodeSet  *fSpacingSet;
2375    UnicodeSet  *fLSet;
2376    UnicodeSet  *fVSet;
2377    UnicodeSet  *fTSet;
2378    UnicodeSet  *fLVSet;
2379    UnicodeSet  *fLVTSet;
2380    UnicodeSet  *fHangulSet;
2381    UnicodeSet  *fAnySet;
2382
2383    const UnicodeString *fText;
2384};
2385
2386
2387RBBICharMonkey::RBBICharMonkey() {
2388    UErrorCode  status = U_ZERO_ERROR;
2389
2390    fText = NULL;
2391
2392    fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2393    fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
2394    fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
2395    fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2396    fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2397    fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2398    fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2399    fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2400    fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2401    fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2402    fHangulSet  = new UnicodeSet();
2403    fHangulSet->addAll(*fLSet);
2404    fHangulSet->addAll(*fVSet);
2405    fHangulSet->addAll(*fTSet);
2406    fHangulSet->addAll(*fLVSet);
2407    fHangulSet->addAll(*fLVTSet);
2408    fAnySet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);
2409
2410    fSets       = new UVector(status);
2411    fSets->addElement(fCRLFSet,    status);
2412    fSets->addElement(fControlSet, status);
2413    fSets->addElement(fExtendSet,  status);
2414    fSets->addElement(fPrependSet, status);
2415    fSets->addElement(fSpacingSet, status);
2416    fSets->addElement(fHangulSet,  status);
2417    fSets->addElement(fAnySet,     status);
2418    if (U_FAILURE(status)) {
2419        deferredStatus = status;
2420    }
2421}
2422
2423
2424void RBBICharMonkey::setText(const UnicodeString &s) {
2425    fText = &s;
2426}
2427
2428
2429
2430int32_t RBBICharMonkey::next(int32_t prevPos) {
2431    int    p0, p1, p2, p3;    // Indices of the significant code points around the
2432                              //   break position being tested.  The candidate break
2433                              //   location is before p2.
2434
2435    int     breakPos = -1;
2436
2437    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2438
2439    if (U_FAILURE(deferredStatus)) {
2440        return -1;
2441    }
2442
2443    // Previous break at end of string.  return DONE.
2444    if (prevPos >= fText->length()) {
2445        return -1;
2446    }
2447    p0 = p1 = p2 = p3 = prevPos;
2448    c3 =  fText->char32At(prevPos);
2449    c0 = c1 = c2 = 0;
2450
2451    // Loop runs once per "significant" character position in the input text.
2452    for (;;) {
2453        // Move all of the positions forward in the input string.
2454        p0 = p1;  c0 = c1;
2455        p1 = p2;  c1 = c2;
2456        p2 = p3;  c2 = c3;
2457
2458        // Advancd p3 by one codepoint
2459        p3 = fText->moveIndex32(p3, 1);
2460        c3 = fText->char32At(p3);
2461
2462        if (p1 == p2) {
2463            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2464            continue;
2465        }
2466        if (p2 == fText->length()) {
2467            // Reached end of string.  Always a break position.
2468            break;
2469        }
2470
2471        // Rule  GB3   CR x LF
2472        //     No Extend or Format characters may appear between the CR and LF,
2473        //     which requires the additional check for p2 immediately following p1.
2474        //
2475        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2476            continue;
2477        }
2478
2479        // Rule (GB4).   ( Control | CR | LF ) <break>
2480        if (fControlSet->contains(c1) ||
2481            c1 == 0x0D ||
2482            c1 == 0x0A)  {
2483            break;
2484        }
2485
2486        // Rule (GB5)    <break>  ( Control | CR | LF )
2487        //
2488        if (fControlSet->contains(c2) ||
2489            c2 == 0x0D ||
2490            c2 == 0x0A)  {
2491            break;
2492        }
2493
2494
2495        // Rule (GB6)  L x ( L | V | LV | LVT )
2496        if (fLSet->contains(c1) &&
2497               (fLSet->contains(c2)  ||
2498                fVSet->contains(c2)  ||
2499                fLVSet->contains(c2) ||
2500                fLVTSet->contains(c2))) {
2501            continue;
2502        }
2503
2504        // Rule (GB7)    ( LV | V )  x  ( V | T )
2505        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2506            (fVSet->contains(c2) || fTSet->contains(c2)))  {
2507            continue;
2508        }
2509
2510        // Rule (GB8)    ( LVT | T)  x T
2511        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2512            fTSet->contains(c2))  {
2513            continue;
2514        }
2515
2516        // Rule (GB9)    Numeric x ALetter
2517        if (fExtendSet->contains(c2))  {
2518            continue;
2519        }
2520
2521        // Rule (GB9a)   x  SpacingMark
2522        if (fSpacingSet->contains(c2)) {
2523            continue;
2524        }
2525
2526        // Rule (GB9b)   Prepend x
2527        if (fPrependSet->contains(c1)) {
2528            continue;
2529        }
2530
2531        // Rule (GB10)  Any  <break>  Any
2532        break;
2533    }
2534
2535    breakPos = p2;
2536    return breakPos;
2537}
2538
2539
2540
2541UVector  *RBBICharMonkey::charClasses() {
2542    return fSets;
2543}
2544
2545
2546RBBICharMonkey::~RBBICharMonkey() {
2547    delete fSets;
2548    delete fCRLFSet;
2549    delete fControlSet;
2550    delete fExtendSet;
2551    delete fPrependSet;
2552    delete fSpacingSet;
2553    delete fLSet;
2554    delete fVSet;
2555    delete fTSet;
2556    delete fLVSet;
2557    delete fLVTSet;
2558    delete fHangulSet;
2559    delete fAnySet;
2560}
2561
2562//------------------------------------------------------------------------------------------
2563//
2564//   class RBBIWordMonkey      Word Break specific implementation
2565//                             of RBBIMonkeyKind.
2566//
2567//------------------------------------------------------------------------------------------
2568class RBBIWordMonkey: public RBBIMonkeyKind {
2569public:
2570    RBBIWordMonkey();
2571    virtual          ~RBBIWordMonkey();
2572    virtual  UVector *charClasses();
2573    virtual  void     setText(const UnicodeString &s);
2574    virtual int32_t   next(int32_t i);
2575private:
2576    UVector      *fSets;
2577
2578    UnicodeSet  *fCRSet;
2579    UnicodeSet  *fLFSet;
2580    UnicodeSet  *fNewlineSet;
2581    UnicodeSet  *fKatakanaSet;
2582    UnicodeSet  *fALetterSet;
2583    UnicodeSet  *fMidNumLetSet;
2584    UnicodeSet  *fMidLetterSet;
2585    UnicodeSet  *fMidNumSet;
2586    UnicodeSet  *fNumericSet;
2587    UnicodeSet  *fFormatSet;
2588    UnicodeSet  *fOtherSet;
2589    UnicodeSet  *fExtendSet;
2590    UnicodeSet  *fExtendNumLetSet;
2591
2592    RegexMatcher  *fMatcher;
2593
2594    const UnicodeString  *fText;
2595};
2596
2597
2598RBBIWordMonkey::RBBIWordMonkey()
2599{
2600    UErrorCode  status = U_ZERO_ERROR;
2601
2602    fSets            = new UVector(status);
2603
2604    fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2605    fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2606    fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2607    fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"),      status);
2608    fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2609    fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2610    fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2611    fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2612    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2613    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2614    fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2615    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2616
2617    fOtherSet        = new UnicodeSet();
2618    if(U_FAILURE(status)) {
2619      deferredStatus = status;
2620      return;
2621    }
2622
2623    fOtherSet->complement();
2624    fOtherSet->removeAll(*fCRSet);
2625    fOtherSet->removeAll(*fLFSet);
2626    fOtherSet->removeAll(*fNewlineSet);
2627    fOtherSet->removeAll(*fKatakanaSet);
2628    fOtherSet->removeAll(*fALetterSet);
2629    fOtherSet->removeAll(*fMidLetterSet);
2630    fOtherSet->removeAll(*fMidNumSet);
2631    fOtherSet->removeAll(*fNumericSet);
2632    fOtherSet->removeAll(*fExtendNumLetSet);
2633    fOtherSet->removeAll(*fFormatSet);
2634    fOtherSet->removeAll(*fExtendSet);
2635    // Inhibit dictionary characters from being tested at all.
2636    fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2637
2638    fSets->addElement(fCRSet,        status);
2639    fSets->addElement(fLFSet,        status);
2640    fSets->addElement(fNewlineSet,   status);
2641    fSets->addElement(fALetterSet,   status);
2642    fSets->addElement(fKatakanaSet,  status);
2643    fSets->addElement(fMidLetterSet, status);
2644    fSets->addElement(fMidNumLetSet, status);
2645    fSets->addElement(fMidNumSet,    status);
2646    fSets->addElement(fNumericSet,   status);
2647    fSets->addElement(fFormatSet,    status);
2648    fSets->addElement(fExtendSet,    status);
2649    fSets->addElement(fOtherSet,     status);
2650    fSets->addElement(fExtendNumLetSet, status);
2651
2652    if (U_FAILURE(status)) {
2653        deferredStatus = status;
2654    }
2655}
2656
2657void RBBIWordMonkey::setText(const UnicodeString &s) {
2658    fText       = &s;
2659}
2660
2661
2662int32_t RBBIWordMonkey::next(int32_t prevPos) {
2663    int    p0, p1, p2, p3;    // Indices of the significant code points around the
2664                              //   break position being tested.  The candidate break
2665                              //   location is before p2.
2666
2667    int     breakPos = -1;
2668
2669    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2670
2671    if (U_FAILURE(deferredStatus)) {
2672        return -1;
2673    }
2674
2675    // Prev break at end of string.  return DONE.
2676    if (prevPos >= fText->length()) {
2677        return -1;
2678    }
2679    p0 = p1 = p2 = p3 = prevPos;
2680    c3 =  fText->char32At(prevPos);
2681    c0 = c1 = c2 = 0;
2682
2683    // Loop runs once per "significant" character position in the input text.
2684    for (;;) {
2685        // Move all of the positions forward in the input string.
2686        p0 = p1;  c0 = c1;
2687        p1 = p2;  c1 = c2;
2688        p2 = p3;  c2 = c3;
2689
2690        // Advancd p3 by    X(Extend | Format)*   Rule 4
2691        //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2692        do {
2693            p3 = fText->moveIndex32(p3, 1);
2694            c3 = fText->char32At(p3);
2695            if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2696               break;
2697            };
2698        }
2699        while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2700
2701
2702        if (p1 == p2) {
2703            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2704            continue;
2705        }
2706        if (p2 == fText->length()) {
2707            // Reached end of string.  Always a break position.
2708            break;
2709        }
2710
2711        // Rule  (3)   CR x LF
2712        //     No Extend or Format characters may appear between the CR and LF,
2713        //     which requires the additional check for p2 immediately following p1.
2714        //
2715        if (c1==0x0D && c2==0x0A) {
2716            continue;
2717        }
2718
2719        // Rule (3a)  Break before and after newlines (including CR and LF)
2720        //
2721        if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2722            break;
2723        };
2724        if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2725            break;
2726        };
2727
2728        // Rule (5).   ALetter x ALetter
2729        if (fALetterSet->contains(c1) &&
2730            fALetterSet->contains(c2))  {
2731            continue;
2732        }
2733
2734        // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
2735        //
2736        if ( fALetterSet->contains(c1)   &&
2737             (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
2738             fALetterSet->contains(c3)) {
2739            continue;
2740        }
2741
2742
2743        // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
2744        if (fALetterSet->contains(c0) &&
2745            (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
2746            fALetterSet->contains(c2)) {
2747            continue;
2748        }
2749
2750        // Rule (8)    Numeric x Numeric
2751        if (fNumericSet->contains(c1) &&
2752            fNumericSet->contains(c2))  {
2753            continue;
2754        }
2755
2756        // Rule (9)    ALetter x Numeric
2757        if (fALetterSet->contains(c1) &&
2758            fNumericSet->contains(c2))  {
2759            continue;
2760        }
2761
2762        // Rule (10)    Numeric x ALetter
2763        if (fNumericSet->contains(c1) &&
2764            fALetterSet->contains(c2))  {
2765            continue;
2766        }
2767
2768        // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
2769        if (fNumericSet->contains(c0) &&
2770            (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
2771            fNumericSet->contains(c2)) {
2772            continue;
2773        }
2774
2775        // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
2776        if (fNumericSet->contains(c1) &&
2777            (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
2778            fNumericSet->contains(c3)) {
2779            continue;
2780        }
2781
2782        // Rule (13)  Katakana x Katakana
2783        if (fKatakanaSet->contains(c1) &&
2784            fKatakanaSet->contains(c2))  {
2785            continue;
2786        }
2787
2788        // Rule 13a
2789        if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2790             fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2791             fExtendNumLetSet->contains(c2)) {
2792                continue;
2793             }
2794
2795        // Rule 13b
2796        if (fExtendNumLetSet->contains(c1) &&
2797                (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2798                fKatakanaSet->contains(c2)))  {
2799                continue;
2800             }
2801
2802        // Rule 14.  Break found here.
2803        break;
2804    }
2805
2806    breakPos = p2;
2807    return breakPos;
2808}
2809
2810
2811UVector  *RBBIWordMonkey::charClasses() {
2812    return fSets;
2813}
2814
2815
2816RBBIWordMonkey::~RBBIWordMonkey() {
2817    delete fSets;
2818    delete fCRSet;
2819    delete fLFSet;
2820    delete fNewlineSet;
2821    delete fKatakanaSet;
2822    delete fALetterSet;
2823    delete fMidNumLetSet;
2824    delete fMidLetterSet;
2825    delete fMidNumSet;
2826    delete fNumericSet;
2827    delete fFormatSet;
2828    delete fExtendSet;
2829    delete fExtendNumLetSet;
2830    delete fOtherSet;
2831}
2832
2833
2834
2835
2836//------------------------------------------------------------------------------------------
2837//
2838//   class RBBISentMonkey      Sentence Break specific implementation
2839//                             of RBBIMonkeyKind.
2840//
2841//------------------------------------------------------------------------------------------
2842class RBBISentMonkey: public RBBIMonkeyKind {
2843public:
2844    RBBISentMonkey();
2845    virtual          ~RBBISentMonkey();
2846    virtual  UVector *charClasses();
2847    virtual  void     setText(const UnicodeString &s);
2848    virtual int32_t   next(int32_t i);
2849private:
2850    int               moveBack(int posFrom);
2851    int               moveForward(int posFrom);
2852    UChar32           cAt(int pos);
2853
2854    UVector      *fSets;
2855
2856    UnicodeSet  *fSepSet;
2857    UnicodeSet  *fFormatSet;
2858    UnicodeSet  *fSpSet;
2859    UnicodeSet  *fLowerSet;
2860    UnicodeSet  *fUpperSet;
2861    UnicodeSet  *fOLetterSet;
2862    UnicodeSet  *fNumericSet;
2863    UnicodeSet  *fATermSet;
2864    UnicodeSet  *fSContinueSet;
2865    UnicodeSet  *fSTermSet;
2866    UnicodeSet  *fCloseSet;
2867    UnicodeSet  *fOtherSet;
2868    UnicodeSet  *fExtendSet;
2869
2870    const UnicodeString  *fText;
2871
2872};
2873
2874RBBISentMonkey::RBBISentMonkey()
2875{
2876    UErrorCode  status = U_ZERO_ERROR;
2877
2878    fSets            = new UVector(status);
2879
2880    //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2881    //                       set and made into character classes of their own.  For the monkey impl,
2882    //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2883    fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2884    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2885    fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2886    fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2887    fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2888    fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2889    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2890    fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2891    fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2892    fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2893    fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2894    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2895    fOtherSet        = new UnicodeSet();
2896
2897    if(U_FAILURE(status)) {
2898      deferredStatus = status;
2899      return;
2900    }
2901
2902    fOtherSet->complement();
2903    fOtherSet->removeAll(*fSepSet);
2904    fOtherSet->removeAll(*fFormatSet);
2905    fOtherSet->removeAll(*fSpSet);
2906    fOtherSet->removeAll(*fLowerSet);
2907    fOtherSet->removeAll(*fUpperSet);
2908    fOtherSet->removeAll(*fOLetterSet);
2909    fOtherSet->removeAll(*fNumericSet);
2910    fOtherSet->removeAll(*fATermSet);
2911    fOtherSet->removeAll(*fSContinueSet);
2912    fOtherSet->removeAll(*fSTermSet);
2913    fOtherSet->removeAll(*fCloseSet);
2914    fOtherSet->removeAll(*fExtendSet);
2915
2916    fSets->addElement(fSepSet,       status);
2917    fSets->addElement(fFormatSet,    status);
2918    fSets->addElement(fSpSet,        status);
2919    fSets->addElement(fLowerSet,     status);
2920    fSets->addElement(fUpperSet,     status);
2921    fSets->addElement(fOLetterSet,   status);
2922    fSets->addElement(fNumericSet,   status);
2923    fSets->addElement(fATermSet,     status);
2924    fSets->addElement(fSContinueSet, status);
2925    fSets->addElement(fSTermSet,     status);
2926    fSets->addElement(fCloseSet,     status);
2927    fSets->addElement(fOtherSet,     status);
2928    fSets->addElement(fExtendSet,    status);
2929
2930    if (U_FAILURE(status)) {
2931        deferredStatus = status;
2932    }
2933}
2934
2935
2936
2937void RBBISentMonkey::setText(const UnicodeString &s) {
2938    fText       = &s;
2939}
2940
2941UVector  *RBBISentMonkey::charClasses() {
2942    return fSets;
2943}
2944
2945
2946//  moveBack()   Find the "significant" code point preceding the index i.
2947//               Skips over ($Extend | $Format)* .
2948//
2949int RBBISentMonkey::moveBack(int i) {
2950    if (i <= 0) {
2951        return -1;
2952    }
2953    UChar32   c;
2954    int32_t   j = i;
2955    do {
2956        j = fText->moveIndex32(j, -1);
2957        c = fText->char32At(j);
2958    }
2959    while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2960    return j;
2961
2962 }
2963
2964
2965int RBBISentMonkey::moveForward(int i) {
2966    if (i>=fText->length()) {
2967        return fText->length();
2968    }
2969    UChar32   c;
2970    int32_t   j = i;
2971    do {
2972        j = fText->moveIndex32(j, 1);
2973        c = cAt(j);
2974    }
2975    while (fFormatSet->contains(c) || fExtendSet->contains(c));
2976    return j;
2977}
2978
2979UChar32 RBBISentMonkey::cAt(int pos) {
2980    if (pos<0 || pos>=fText->length()) {
2981        return -1;
2982    } else {
2983        return fText->char32At(pos);
2984    }
2985}
2986
2987int32_t RBBISentMonkey::next(int32_t prevPos) {
2988    int    p0, p1, p2, p3;    // Indices of the significant code points around the
2989                              //   break position being tested.  The candidate break
2990                              //   location is before p2.
2991
2992    int     breakPos = -1;
2993
2994    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2995    UChar32 c;
2996
2997    if (U_FAILURE(deferredStatus)) {
2998        return -1;
2999    }
3000
3001    // Prev break at end of string.  return DONE.
3002    if (prevPos >= fText->length()) {
3003        return -1;
3004    }
3005    p0 = p1 = p2 = p3 = prevPos;
3006    c3 =  fText->char32At(prevPos);
3007    c0 = c1 = c2 = 0;
3008
3009    // Loop runs once per "significant" character position in the input text.
3010    for (;;) {
3011        // Move all of the positions forward in the input string.
3012        p0 = p1;  c0 = c1;
3013        p1 = p2;  c1 = c2;
3014        p2 = p3;  c2 = c3;
3015
3016        // Advancd p3 by    X(Extend | Format)*   Rule 4
3017        p3 = moveForward(p3);
3018        c3 = cAt(p3);
3019
3020        // Rule (3)  CR x LF
3021        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
3022            continue;
3023        }
3024
3025        // Rule (4).   Sep  <break>
3026        if (fSepSet->contains(c1)) {
3027            p2 = p1+1;   // Separators don't combine with Extend or Format.
3028            break;
3029        }
3030
3031        if (p2 >= fText->length()) {
3032            // Reached end of string.  Always a break position.
3033            break;
3034        }
3035
3036        if (p2 == prevPos) {
3037            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
3038            continue;
3039        }
3040
3041        // Rule (6).   ATerm x Numeric
3042        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
3043            continue;
3044        }
3045
3046        // Rule (7).  Upper ATerm  x  Uppper
3047        if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
3048            continue;
3049        }
3050
3051        // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
3052        //           Note:  STerm | ATerm are added to the negated part of the expression by a
3053        //                  note to the Unicode 5.0 documents.
3054        int p8 = p1;
3055        while (fSpSet->contains(cAt(p8))) {
3056            p8 = moveBack(p8);
3057        }
3058        while (fCloseSet->contains(cAt(p8))) {
3059            p8 = moveBack(p8);
3060        }
3061        if (fATermSet->contains(cAt(p8))) {
3062            p8=p2;
3063            for (;;) {
3064                c = cAt(p8);
3065                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
3066                    fLowerSet->contains(c) || fSepSet->contains(c) ||
3067                    fATermSet->contains(c) || fSTermSet->contains(c))  {
3068                    break;
3069                }
3070                p8 = moveForward(p8);
3071            }
3072            if (fLowerSet->contains(cAt(p8))) {
3073                continue;
3074            }
3075        }
3076
3077        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
3078        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
3079            p8 = p1;
3080            while (fSpSet->contains(cAt(p8))) {
3081                p8 = moveBack(p8);
3082            }
3083            while (fCloseSet->contains(cAt(p8))) {
3084                p8 = moveBack(p8);
3085            }
3086            c = cAt(p8);
3087            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
3088                continue;
3089            }
3090        }
3091
3092        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
3093        int p9 = p1;
3094        while (fCloseSet->contains(cAt(p9))) {
3095            p9 = moveBack(p9);
3096        }
3097        c = cAt(p9);
3098        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
3099            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
3100                continue;
3101            }
3102        }
3103
3104        // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
3105        int p10 = p1;
3106        while (fSpSet->contains(cAt(p10))) {
3107            p10 = moveBack(p10);
3108        }
3109        while (fCloseSet->contains(cAt(p10))) {
3110            p10 = moveBack(p10);
3111        }
3112        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
3113            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
3114                continue;
3115            }
3116        }
3117
3118        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
3119        int p11 = p1;
3120        if (fSepSet->contains(cAt(p11))) {
3121            p11 = moveBack(p11);
3122        }
3123        while (fSpSet->contains(cAt(p11))) {
3124            p11 = moveBack(p11);
3125        }
3126        while (fCloseSet->contains(cAt(p11))) {
3127            p11 = moveBack(p11);
3128        }
3129        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
3130            break;
3131        }
3132
3133        //  Rule (12)  Any x Any
3134        continue;
3135    }
3136    breakPos = p2;
3137    return breakPos;
3138}
3139
3140RBBISentMonkey::~RBBISentMonkey() {
3141    delete fSets;
3142    delete fSepSet;
3143    delete fFormatSet;
3144    delete fSpSet;
3145    delete fLowerSet;
3146    delete fUpperSet;
3147    delete fOLetterSet;
3148    delete fNumericSet;
3149    delete fATermSet;
3150    delete fSContinueSet;
3151    delete fSTermSet;
3152    delete fCloseSet;
3153    delete fOtherSet;
3154    delete fExtendSet;
3155}
3156
3157
3158
3159//-------------------------------------------------------------------------------------------
3160//
3161//  RBBILineMonkey
3162//
3163//-------------------------------------------------------------------------------------------
3164
3165class RBBILineMonkey: public RBBIMonkeyKind {
3166public:
3167    RBBILineMonkey();
3168    virtual          ~RBBILineMonkey();
3169    virtual  UVector *charClasses();
3170    virtual  void     setText(const UnicodeString &s);
3171    virtual  int32_t  next(int32_t i);
3172    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
3173private:
3174    UVector      *fSets;
3175
3176    UnicodeSet  *fBK;
3177    UnicodeSet  *fCR;
3178    UnicodeSet  *fLF;
3179    UnicodeSet  *fCM;
3180    UnicodeSet  *fNL;
3181    UnicodeSet  *fSG;
3182    UnicodeSet  *fWJ;
3183    UnicodeSet  *fZW;
3184    UnicodeSet  *fGL;
3185    UnicodeSet  *fCB;
3186    UnicodeSet  *fSP;
3187    UnicodeSet  *fB2;
3188    UnicodeSet  *fBA;
3189    UnicodeSet  *fBB;
3190    UnicodeSet  *fHY;
3191    UnicodeSet  *fH2;
3192    UnicodeSet  *fH3;
3193    UnicodeSet  *fCL;
3194    UnicodeSet  *fEX;
3195    UnicodeSet  *fIN;
3196    UnicodeSet  *fJL;
3197    UnicodeSet  *fJV;
3198    UnicodeSet  *fJT;
3199    UnicodeSet  *fNS;
3200    UnicodeSet  *fOP;
3201    UnicodeSet  *fQU;
3202    UnicodeSet  *fIS;
3203    UnicodeSet  *fNU;
3204    UnicodeSet  *fPO;
3205    UnicodeSet  *fPR;
3206    UnicodeSet  *fSY;
3207    UnicodeSet  *fAI;
3208    UnicodeSet  *fAL;
3209    UnicodeSet  *fID;
3210    UnicodeSet  *fSA;
3211    UnicodeSet  *fXX;
3212
3213    BreakIterator  *fCharBI;
3214
3215    const UnicodeString  *fText;
3216    int32_t              *fOrigPositions;
3217
3218    RegexMatcher         *fNumberMatcher;
3219    RegexMatcher         *fLB11Matcher;
3220};
3221
3222
3223RBBILineMonkey::RBBILineMonkey()
3224{
3225    UErrorCode  status = U_ZERO_ERROR;
3226
3227    fSets  = new UVector(status);
3228
3229    fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3230    fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3231    fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3232    fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3233    fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3234    fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3235    fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3236    fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3237    fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3238    fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3239    fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3240    fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3241    fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3242    fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3243    fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3244    fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3245    fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3246    fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3247    fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3248    fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3249    fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3250    fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3251    fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3252    fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3253    fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3254    fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3255    fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3256    fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3257    fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3258    fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3259    fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3260    fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3261    fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3262    fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
3263    fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3264    fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3265
3266    if (U_FAILURE(status)) {
3267        deferredStatus = status;
3268        fCharBI = NULL;
3269        fNumberMatcher = NULL;
3270        return;
3271    }
3272
3273    fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
3274    fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
3275    fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
3276    fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
3277
3278    fSets->addElement(fBK, status);
3279    fSets->addElement(fCR, status);
3280    fSets->addElement(fLF, status);
3281    fSets->addElement(fCM, status);
3282    fSets->addElement(fNL, status);
3283    fSets->addElement(fWJ, status);
3284    fSets->addElement(fZW, status);
3285    fSets->addElement(fGL, status);
3286    fSets->addElement(fCB, status);
3287    fSets->addElement(fSP, status);
3288    fSets->addElement(fB2, status);
3289    fSets->addElement(fBA, status);
3290    fSets->addElement(fBB, status);
3291    fSets->addElement(fHY, status);
3292    fSets->addElement(fH2, status);
3293    fSets->addElement(fH3, status);
3294    fSets->addElement(fCL, status);
3295    fSets->addElement(fEX, status);
3296    fSets->addElement(fIN, status);
3297    fSets->addElement(fJL, status);
3298    fSets->addElement(fJT, status);
3299    fSets->addElement(fJV, status);
3300    fSets->addElement(fNS, status);
3301    fSets->addElement(fOP, status);
3302    fSets->addElement(fQU, status);
3303    fSets->addElement(fIS, status);
3304    fSets->addElement(fNU, status);
3305    fSets->addElement(fPO, status);
3306    fSets->addElement(fPR, status);
3307    fSets->addElement(fSY, status);
3308    fSets->addElement(fAI, status);
3309    fSets->addElement(fAL, status);
3310    fSets->addElement(fID, status);
3311    fSets->addElement(fWJ, status);
3312    fSets->addElement(fSA, status);
3313    fSets->addElement(fSG, status);
3314
3315    const char *rules =
3316            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3317            "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3318            "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3319            "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3320            "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
3321            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3322
3323    fNumberMatcher = new RegexMatcher(
3324        UnicodeString(rules, -1, US_INV), 0, status);
3325
3326    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3327
3328    if (U_FAILURE(status)) {
3329        deferredStatus = status;
3330    }
3331}
3332
3333
3334void RBBILineMonkey::setText(const UnicodeString &s) {
3335    fText       = &s;
3336    fCharBI->setText(s);
3337    fNumberMatcher->reset(s);
3338}
3339
3340//
3341//  rule9Adjust
3342//     Line Break TR rules 9 and 10 implementation.
3343//     This deals with combining marks and other sequences that
3344//     that must be treated as if they were something other than what they actually are.
3345//
3346//     This is factored out into a separate function because it must be applied twice for
3347//     each potential break, once to the chars before the position being checked, then
3348//     again to the text following the possible break.
3349//
3350void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3351    if (pos == -1) {
3352        // Invalid initial position.  Happens during the warmup iteration of the
3353        //   main loop in next().
3354        return;
3355    }
3356
3357    int32_t  nPos = *nextPos;
3358
3359    // LB 9  Keep combining sequences together.
3360    //  advance over any CM class chars.  Note that Line Break CM is different
3361    //  from the normal Grapheme Extend property.
3362    if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3363          *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3364        for (;;) {
3365            *nextChar = fText->char32At(nPos);
3366            if (!fCM->contains(*nextChar)) {
3367                break;
3368            }
3369            nPos = fText->moveIndex32(nPos, 1);
3370        }
3371    }
3372
3373
3374    // LB 9 Treat X CM* as if it were x.
3375    //       No explicit action required.
3376
3377    // LB 10  Treat any remaining combining mark as AL
3378    if (fCM->contains(*posChar)) {
3379        *posChar = 0x41;   // thisChar = 'A';
3380    }
3381
3382    // Push the updated nextPos and nextChar back to our caller.
3383    // This only makes a difference if posChar got bigger by consuming a
3384    // combining sequence.
3385    *nextPos  = nPos;
3386    *nextChar = fText->char32At(nPos);
3387}
3388
3389
3390
3391int32_t RBBILineMonkey::next(int32_t startPos) {
3392    UErrorCode status = U_ZERO_ERROR;
3393    int32_t    pos;       //  Index of the char following a potential break position
3394    UChar32    thisChar;  //  Character at above position "pos"
3395
3396    int32_t    prevPos;   //  Index of the char preceding a potential break position
3397    UChar32    prevChar;  //  Character at above position.  Note that prevChar
3398                          //   and thisChar may not be adjacent because combining
3399                          //   characters between them will be ignored.
3400
3401    int32_t    nextPos;   //  Index of the next character following pos.
3402                          //     Usually skips over combining marks.
3403    int32_t    nextCPPos; //  Index of the code point following "pos."
3404                          //     May point to a combining mark.
3405    int32_t    tPos;      //  temp value.
3406    UChar32    c;
3407
3408    if (U_FAILURE(deferredStatus)) {
3409        return -1;
3410    }
3411
3412    if (startPos >= fText->length()) {
3413        return -1;
3414    }
3415
3416
3417    // Initial values for loop.  Loop will run the first time without finding breaks,
3418    //                           while the invalid values shift out and the "this" and
3419    //                           "prev" positions are filled in with good values.
3420    pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
3421    thisChar = prevChar  = 0;
3422    nextPos  = nextCPPos = startPos;
3423
3424
3425    // Loop runs once per position in the test text, until a break position
3426    //  is found.
3427    for (;;) {
3428        prevPos   = pos;
3429        prevChar  = thisChar;
3430
3431        pos       = nextPos;
3432        thisChar  = fText->char32At(pos);
3433
3434        nextCPPos = fText->moveIndex32(pos, 1);
3435        nextPos   = nextCPPos;
3436
3437        // Rule LB2 - Break at end of text.
3438        if (pos >= fText->length()) {
3439            break;
3440        }
3441
3442        // Rule LB 9 - adjust for combining sequences.
3443        //             We do this one out-of-order because the adjustment does not change anything
3444        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3445        //             be applied.
3446        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3447        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3448        c = fText->char32At(nextPos);
3449        rule9Adjust(pos,     &thisChar, &nextPos, &c);
3450
3451        // If the loop is still warming up - if we haven't shifted the initial
3452        //   -1 positions out of prevPos yet - loop back to advance the
3453        //    position in the input without any further looking for breaks.
3454        if (prevPos == -1) {
3455            continue;
3456        }
3457
3458        // LB 4  Always break after hard line breaks,
3459        if (fBK->contains(prevChar)) {
3460            break;
3461        }
3462
3463        // LB 5  Break after CR, LF, NL, but not inside CR LF
3464        if (prevChar == 0x0d && thisChar == 0x0a) {
3465            continue;
3466        }
3467        if (prevChar == 0x0d ||
3468            prevChar == 0x0a ||
3469            prevChar == 0x85)  {
3470            break;
3471        }
3472
3473        // LB 6  Don't break before hard line breaks
3474        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3475            fBK->contains(thisChar)) {
3476                continue;
3477        }
3478
3479
3480        // LB 7  Don't break before spaces or zero-width space.
3481        if (fSP->contains(thisChar)) {
3482            continue;
3483        }
3484
3485        if (fZW->contains(thisChar)) {
3486            continue;
3487        }
3488
3489        // LB 8  Break after zero width space
3490        if (fZW->contains(prevChar)) {
3491            break;
3492        }
3493
3494        // LB 9, 10  Already done, at top of loop.
3495        //
3496
3497
3498        // LB 11  Do not break before or after WORD JOINER and related characters.
3499        //    x  WJ
3500        //    WJ  x
3501        //
3502        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3503            continue;
3504        }
3505
3506        // LB 12
3507        //    GL  x
3508        if (fGL->contains(prevChar)) {
3509            continue;
3510        }
3511
3512        // LB 12a
3513        //    [^SP BA HY] x GL
3514        if (!(fSP->contains(prevChar) ||
3515              fBA->contains(prevChar) ||
3516              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3517            continue;
3518        }
3519
3520
3521
3522        // LB 13  Don't break before closings.
3523        //        NU x CL  and NU x IS are not matched here so that they will
3524        //        fall into LB 17 and the more general number regular expression.
3525        //
3526        if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
3527                                        fEX->contains(thisChar) ||
3528            !fNU->contains(prevChar) && fIS->contains(thisChar) ||
3529            !fNU->contains(prevChar) && fSY->contains(thisChar))    {
3530            continue;
3531        }
3532
3533        // LB 14 Don't break after OP SP*
3534        //       Scan backwards, checking for this sequence.
3535        //       The OP char could include combining marks, so we actually check for
3536        //           OP CM* SP*
3537        //       Another Twist: The Rule 67 fixes may have changed a SP CM
3538        //       sequence into a ID char, so before scanning back through spaces,
3539        //       verify that prevChar is indeed a space.  The prevChar variable
3540        //       may differ from fText[prevPos]
3541        tPos = prevPos;
3542        if (fSP->contains(prevChar)) {
3543            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3544                tPos=fText->moveIndex32(tPos, -1);
3545            }
3546        }
3547        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3548            tPos=fText->moveIndex32(tPos, -1);
3549        }
3550        if (fOP->contains(fText->char32At(tPos))) {
3551            continue;
3552        }
3553
3554
3555        // LB 15    QU SP* x OP
3556        if (fOP->contains(thisChar)) {
3557            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3558            int tPos = prevPos;
3559            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3560                tPos = fText->moveIndex32(tPos, -1);
3561            }
3562            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3563                tPos = fText->moveIndex32(tPos, -1);
3564            }
3565            if (fQU->contains(fText->char32At(tPos))) {
3566                continue;
3567            }
3568        }
3569
3570
3571
3572        // LB 16   CL SP* x NS
3573        //    Scan backwards for SP* CM* CL
3574        if (fNS->contains(thisChar)) {
3575            int tPos = prevPos;
3576            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3577                tPos = fText->moveIndex32(tPos, -1);
3578            }
3579            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3580                tPos = fText->moveIndex32(tPos, -1);
3581            }
3582            if (fCL->contains(fText->char32At(tPos))) {
3583                continue;
3584            }
3585        }
3586
3587
3588        // LB 17        B2 SP* x B2
3589        if (fB2->contains(thisChar)) {
3590            //  Scan backwards, checking for the B2 CM* SP* sequence.
3591            tPos = prevPos;
3592            if (fSP->contains(prevChar)) {
3593                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3594                    tPos=fText->moveIndex32(tPos, -1);
3595                }
3596            }
3597            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3598                tPos=fText->moveIndex32(tPos, -1);
3599            }
3600            if (fB2->contains(fText->char32At(tPos))) {
3601                continue;
3602            }
3603        }
3604
3605
3606        // LB 18    break after space
3607        if (fSP->contains(prevChar)) {
3608            break;
3609        }
3610
3611        // LB 19
3612        //    x   QU
3613        //    QU  x
3614        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3615            continue;
3616        }
3617
3618        // LB 20  Break around a CB
3619        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3620            break;
3621        }
3622
3623        // LB 21
3624        if (fBA->contains(thisChar) ||
3625            fHY->contains(thisChar) ||
3626            fNS->contains(thisChar) ||
3627            fBB->contains(prevChar) )   {
3628            continue;
3629        }
3630
3631        // LB 22
3632        if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
3633            fID->contains(prevChar) && fIN->contains(thisChar) ||
3634            fIN->contains(prevChar) && fIN->contains(thisChar) ||
3635            fNU->contains(prevChar) && fIN->contains(thisChar) )   {
3636            continue;
3637        }
3638
3639
3640        // LB 23    ID x PO
3641        //          AL x NU
3642        //          NU x AL
3643        if (fID->contains(prevChar) && fPO->contains(thisChar) ||
3644            fAL->contains(prevChar) && fNU->contains(thisChar) ||
3645            fNU->contains(prevChar) && fAL->contains(thisChar) )   {
3646            continue;
3647        }
3648
3649        // LB 24  Do not break between prefix and letters or ideographs.
3650        //        PR x ID
3651        //        PR x AL
3652        //        PO x AL
3653        if (fPR->contains(prevChar) && fID->contains(thisChar) ||
3654            fPR->contains(prevChar) && fAL->contains(thisChar) ||
3655            fPO->contains(prevChar) && fAL->contains(thisChar) )   {
3656            continue;
3657        }
3658
3659
3660
3661        // LB 25    Numbers
3662        if (fNumberMatcher->lookingAt(prevPos, status)) {
3663            if (U_FAILURE(status)) {
3664                break;
3665            }
3666            // Matched a number.  But could have been just a single digit, which would
3667            //    not represent a "no break here" between prevChar and thisChar
3668            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3669            if (numEndIdx > pos) {
3670                // Number match includes at least our two chars being checked
3671                if (numEndIdx > nextPos) {
3672                    // Number match includes additional chars.  Update pos and nextPos
3673                    //   so that next loop iteration will continue at the end of the number,
3674                    //   checking for breaks between last char in number & whatever follows.
3675                    pos = nextPos = numEndIdx;
3676                    do {
3677                        pos = fText->moveIndex32(pos, -1);
3678                        thisChar = fText->char32At(pos);
3679                    } while (fCM->contains(thisChar));
3680                }
3681                continue;
3682            }
3683        }
3684
3685
3686        // LB 26 Do not break a Korean syllable.
3687        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3688                                        fJV->contains(thisChar) ||
3689                                        fH2->contains(thisChar) ||
3690                                        fH3->contains(thisChar))) {
3691                                            continue;
3692                                        }
3693
3694        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3695            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3696                continue;
3697        }
3698
3699        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3700            fJT->contains(thisChar)) {
3701                continue;
3702        }
3703
3704        // LB 27 Treat a Korean Syllable Block the same as ID.
3705        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3706            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3707            fIN->contains(thisChar)) {
3708                continue;
3709            }
3710        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3711            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3712            fPO->contains(thisChar)) {
3713                continue;
3714            }
3715        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3716            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3717                continue;
3718            }
3719
3720
3721
3722        // LB 28  Do not break between alphabetics ("at").
3723        if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
3724            continue;
3725        }
3726
3727        // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3728        if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
3729            continue;
3730        }
3731
3732        // LB 31    Break everywhere else
3733        break;
3734
3735    }
3736
3737    return pos;
3738}
3739
3740
3741UVector  *RBBILineMonkey::charClasses() {
3742    return fSets;
3743}
3744
3745
3746RBBILineMonkey::~RBBILineMonkey() {
3747    delete fSets;
3748
3749    delete fBK;
3750    delete fCR;
3751    delete fLF;
3752    delete fCM;
3753    delete fNL;
3754    delete fWJ;
3755    delete fZW;
3756    delete fGL;
3757    delete fCB;
3758    delete fSP;
3759    delete fB2;
3760    delete fBA;
3761    delete fBB;
3762    delete fHY;
3763    delete fH2;
3764    delete fH3;
3765    delete fCL;
3766    delete fEX;
3767    delete fIN;
3768    delete fJL;
3769    delete fJV;
3770    delete fJT;
3771    delete fNS;
3772    delete fOP;
3773    delete fQU;
3774    delete fIS;
3775    delete fNU;
3776    delete fPO;
3777    delete fPR;
3778    delete fSY;
3779    delete fAI;
3780    delete fAL;
3781    delete fID;
3782    delete fSA;
3783    delete fSG;
3784    delete fXX;
3785
3786    delete fCharBI;
3787    delete fNumberMatcher;
3788}
3789
3790
3791//-------------------------------------------------------------------------------------------
3792//
3793//   TestMonkey
3794//
3795//     params
3796//       seed=nnnnn        Random number starting seed.
3797//                         Setting the seed allows errors to be reproduced.
3798//       loop=nnn          Looping count.  Controls running time.
3799//                         -1:  run forever.
3800//                          0 or greater:  run length.
3801//
3802//       type = char | word | line | sent | title
3803//
3804//-------------------------------------------------------------------------------------------
3805
3806static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3807    int32_t val = defaultVal;
3808    name.append(" *= *(-?\\d+)");
3809    UErrorCode status = U_ZERO_ERROR;
3810    RegexMatcher m(name, params, 0, status);
3811    if (m.find()) {
3812        // The param exists.  Convert the string to an int.
3813        char valString[100];
3814        int32_t paramLength = m.end(1, status) - m.start(1, status);
3815        if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3816            paramLength = (int32_t)(sizeof(valString)-2);
3817        }
3818        params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3819        val = strtol(valString,  NULL, 10);
3820
3821        // Delete this parameter from the params string.
3822        m.reset();
3823        params = m.replaceFirst("", status);
3824    }
3825    U_ASSERT(U_SUCCESS(status));
3826    return val;
3827}
3828#endif
3829
3830static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3831                                    BreakIterator *bi,
3832                                    int expected[],
3833                                    int expectedcount)
3834{
3835    int count = 0;
3836    int i = 0;
3837    int forward[50];
3838    bi->setText(ustr);
3839    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3840        forward[count] = i;
3841        if (count < expectedcount && expected[count] != i) {
3842            test->errln("break forward test failed: expected %d but got %d",
3843                        expected[count], i);
3844            break;
3845        }
3846        count ++;
3847    }
3848    if (count != expectedcount) {
3849        printStringBreaks(ustr, expected, expectedcount);
3850        test->errln("break forward test failed: missed %d match",
3851                    expectedcount - count);
3852        return;
3853    }
3854    // testing boundaries
3855    for (i = 1; i < expectedcount; i ++) {
3856        int j = expected[i - 1];
3857        if (!bi->isBoundary(j)) {
3858            printStringBreaks(ustr, expected, expectedcount);
3859            test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3860            return;
3861        }
3862        for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3863            if (bi->isBoundary(j)) {
3864                printStringBreaks(ustr, expected, expectedcount);
3865                test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3866                return;
3867            }
3868        }
3869    }
3870
3871    for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3872        count --;
3873        if (forward[count] != i) {
3874            test->errln("happy break test previous() failed: expected %d but got %d",
3875                        forward[count], i);
3876            break;
3877        }
3878    }
3879    if (count != 0) {
3880        printStringBreaks(ustr, expected, expectedcount);
3881        test->errln("break test previous() failed: missed a match");
3882        return;
3883    }
3884
3885    // testing preceding
3886    for (i = 0; i < expectedcount - 1; i ++) {
3887        // int j = expected[i] + 1;
3888        int j = ustr.moveIndex32(expected[i], 1);
3889        for (; j <= expected[i + 1]; j ++) {
3890            if (bi->preceding(j) != expected[i]) {
3891                printStringBreaks(ustr, expected, expectedcount);
3892                test->errln("preceding(): Not expecting boundary at position %d", j);
3893                return;
3894            }
3895        }
3896    }
3897}
3898
3899void RBBITest::TestWordBreaks(void)
3900{
3901#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3902
3903    Locale        locale("en");
3904    UErrorCode    status = U_ZERO_ERROR;
3905    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3906    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3907    static const char *strlist[] =
3908    {
3909    "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3910    "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
3911    "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3912    "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3913    "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3914    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3915    "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3916    "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3917    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3918    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3919    "\\u2027\\U000e0067\\u0a47\\u00b7",
3920    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3921    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3922    "\\u0589\\U000e006e\\u0a42\\U000104a5",
3923    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3924    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3925    "\\u0027\\u11af\\U000e0057\\u0602",
3926    "\\U0001d7f2\\U000e007\\u0004\\u0589",
3927    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3928    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3929    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3930    "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3931    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3932    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3933    "\\u0233\\U000e0020\\u0a69\\u0d6a",
3934    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3935    "\\u58f4\\U000e0049\\u20e7\\u2027",
3936    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3937    "\\ua183\\u102d\\u0bec\\u003a",
3938    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3939    "\\u003a\\u0e57\\u0fad\\u002e",
3940    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3941    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3942    "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3943    "\\u003a\\u0664\\u00b7\\u1fba",
3944    "\\u003b\\u0027\\u00b7\\u47a3",
3945    "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3946    "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3947    "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3948    };
3949    int loop;
3950    if (U_FAILURE(status)) {
3951        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3952        return;
3953    }
3954    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3955        // printf("looping %d\n", loop);
3956        UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3957        // RBBICharMonkey monkey;
3958        RBBIWordMonkey monkey;
3959
3960        int expected[50];
3961        int expectedcount = 0;
3962
3963        monkey.setText(ustr);
3964        int i;
3965        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3966            expected[expectedcount ++] = i;
3967        }
3968
3969        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3970    }
3971    delete bi;
3972#endif
3973}
3974
3975void RBBITest::TestWordBoundary(void)
3976{
3977    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3978    Locale        locale("en");
3979    UErrorCode    status = U_ZERO_ERROR;
3980    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3981    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3982    UChar         str[50];
3983    static const char *strlist[] =
3984    {
3985    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3986    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3987    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3988    "\\u2027\\U000e0067\\u0a47\\u00b7",
3989    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3990    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3991    "\\u0589\\U000e006e\\u0a42\\U000104a5",
3992    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3993    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3994    "\\u0027\\u11af\\U000e0057\\u0602",
3995    "\\U0001d7f2\\U000e007\\u0004\\u0589",
3996    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3997    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3998    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3999    "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
4000    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
4001    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
4002    "\\u0233\\U000e0020\\u0a69\\u0d6a",
4003    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
4004    "\\u58f4\\U000e0049\\u20e7\\u2027",
4005    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
4006    "\\ua183\\u102d\\u0bec\\u003a",
4007    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
4008    "\\u003a\\u0e57\\u0fad\\u002e",
4009    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
4010    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
4011    "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
4012    "\\u003a\\u0664\\u00b7\\u1fba",
4013    "\\u003b\\u0027\\u00b7\\u47a3",
4014    };
4015    int loop;
4016    if (U_FAILURE(status)) {
4017        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4018        return;
4019    }
4020    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4021        // printf("looping %d\n", loop);
4022        u_unescape(strlist[loop], str, 20);
4023        UnicodeString ustr(str);
4024        int forward[50];
4025        int count = 0;
4026
4027        bi->setText(ustr);
4028        int prev = 0;
4029        int i;
4030        for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
4031            forward[count ++] = i;
4032            if (i > prev) {
4033                int j;
4034                for (j = prev + 1; j < i; j ++) {
4035                    if (bi->isBoundary(j)) {
4036                        printStringBreaks(ustr, forward, count);
4037                        errln("happy boundary test failed: expected %d not a boundary",
4038                               j);
4039                        return;
4040                    }
4041                }
4042            }
4043            if (!bi->isBoundary(i)) {
4044                printStringBreaks(ustr, forward, count);
4045                errln("happy boundary test failed: expected %d a boundary",
4046                       i);
4047                return;
4048            }
4049            prev = i;
4050        }
4051    }
4052    delete bi;
4053}
4054
4055void RBBITest::TestLineBreaks(void)
4056{
4057#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4058    Locale        locale("en");
4059    UErrorCode    status = U_ZERO_ERROR;
4060    BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4061    const int32_t  STRSIZE = 50;
4062    UChar         str[STRSIZE];
4063    static const char *strlist[] =
4064    {
4065     "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
4066     "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
4067             "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
4068     "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
4069             "u2014\\U000e0105\\u118c\\u000a\\u07f8",
4070     "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
4071     "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4072     "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4073     "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4074     "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
4075     "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
4076     "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4077     "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4078     "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4079     "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4080     "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4081     "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4082     "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4083     "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4084     "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4085     "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4086     "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4087     "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4088     "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4089     "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4090     "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4091     "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
4092     "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4093     "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4094     "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4095     "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4096     "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4097     "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
4098     "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4099     "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4100     "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
4101     "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4102     "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4103     "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4104     "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4105     "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4106     "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4107         "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
4108         "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
4109         "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
4110     "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4111         "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4112    };
4113    int loop;
4114    TEST_ASSERT_SUCCESS(status);
4115    if (U_FAILURE(status)) {
4116        return;
4117    }
4118    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4119        // printf("looping %d\n", loop);
4120        int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4121        if (t >= STRSIZE) {
4122            TEST_ASSERT(FALSE);
4123            continue;
4124        }
4125
4126
4127        UnicodeString ustr(str);
4128        RBBILineMonkey monkey;
4129        if (U_FAILURE(monkey.deferredStatus)) {
4130            continue;
4131        }
4132
4133        const int EXPECTEDSIZE = 50;
4134        int expected[EXPECTEDSIZE];
4135        int expectedcount = 0;
4136
4137        monkey.setText(ustr);
4138        int i;
4139        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4140            if (expectedcount >= EXPECTEDSIZE) {
4141                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4142                return;
4143            }
4144            expected[expectedcount ++] = i;
4145        }
4146
4147        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4148    }
4149    delete bi;
4150#endif
4151}
4152
4153void RBBITest::TestSentBreaks(void)
4154{
4155#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4156    Locale        locale("en");
4157    UErrorCode    status = U_ZERO_ERROR;
4158    BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4159    UChar         str[200];
4160    static const char *strlist[] =
4161    {
4162     "Now\ris\nthe\r\ntime\n\rfor\r\r",
4163     "This\n",
4164     "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4165     "\"Sentence ending with a quote.\" Bye.",
4166     "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
4167     "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4168     "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4169     "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4170     "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4171     "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4172     "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4173             "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4174             "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4175             "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4176     "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4177             "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4178             "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4179             "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4180             "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4181             "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4182    };
4183    int loop;
4184    if (U_FAILURE(status)) {
4185        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4186        return;
4187    }
4188    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4189        u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
4190        UnicodeString ustr(str);
4191
4192        RBBISentMonkey monkey;
4193        if (U_FAILURE(monkey.deferredStatus)) {
4194            continue;
4195        }
4196
4197        const int EXPECTEDSIZE = 50;
4198        int expected[EXPECTEDSIZE];
4199        int expectedcount = 0;
4200
4201        monkey.setText(ustr);
4202        int i;
4203        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4204            if (expectedcount >= EXPECTEDSIZE) {
4205                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4206                return;
4207            }
4208            expected[expectedcount ++] = i;
4209        }
4210
4211        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4212    }
4213    delete bi;
4214#endif
4215}
4216
4217void RBBITest::TestMonkey(char *params) {
4218#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4219
4220    UErrorCode     status    = U_ZERO_ERROR;
4221    int32_t        loopCount = 500;
4222    int32_t        seed      = 1;
4223    UnicodeString  breakType = "all";
4224    Locale         locale("en");
4225    UBool          useUText  = FALSE;
4226
4227    if (quick == FALSE) {
4228        loopCount = 10000;
4229    }
4230
4231    if (params) {
4232        UnicodeString p(params);
4233        loopCount = getIntParam("loop", p, loopCount);
4234        seed      = getIntParam("seed", p, seed);
4235
4236        RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4237        if (m.find()) {
4238            breakType = m.group(1, status);
4239            m.reset();
4240            p = m.replaceFirst("", status);
4241        }
4242
4243        RegexMatcher u(" *utext", p, 0, status);
4244        if (u.find()) {
4245            useUText = TRUE;
4246            u.reset();
4247            p = u.replaceFirst("", status);
4248        }
4249
4250
4251        // m.reset(p);
4252        if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4253            // Each option is stripped out of the option string as it is processed.
4254            // All options have been checked.  The option string should have been completely emptied..
4255            char buf[100];
4256            p.extract(buf, sizeof(buf), NULL, status);
4257            buf[sizeof(buf)-1] = 0;
4258            errln("Unrecognized or extra parameter:  %s\n", buf);
4259            return;
4260        }
4261
4262    }
4263
4264    if (breakType == "char" || breakType == "all") {
4265        RBBICharMonkey  m;
4266        BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4267        if (U_SUCCESS(status)) {
4268            RunMonkey(bi, m, "char", seed, loopCount, useUText);
4269            if (breakType == "all" && useUText==FALSE) {
4270                // Also run a quick test with UText when "all" is specified
4271                RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4272            }
4273        }
4274        else {
4275            errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4276        }
4277        delete bi;
4278    }
4279
4280    if (breakType == "word" || breakType == "all") {
4281        logln("Word Break Monkey Test");
4282        RBBIWordMonkey  m;
4283        BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4284        if (U_SUCCESS(status)) {
4285            RunMonkey(bi, m, "word", seed, loopCount, useUText);
4286        }
4287        else {
4288            errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4289        }
4290        delete bi;
4291    }
4292
4293    if (breakType == "line" || breakType == "all") {
4294        logln("Line Break Monkey Test");
4295        RBBILineMonkey  m;
4296        BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4297        if (loopCount >= 10) {
4298            loopCount = loopCount / 5;   // Line break runs slower than the others.
4299        }
4300        if (U_SUCCESS(status)) {
4301            RunMonkey(bi, m, "line", seed, loopCount, useUText);
4302        }
4303        else {
4304            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4305        }
4306        delete bi;
4307    }
4308
4309    if (breakType == "sent" || breakType == "all"  ) {
4310        logln("Sentence Break Monkey Test");
4311        RBBISentMonkey  m;
4312        BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4313        if (loopCount >= 10) {
4314            loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4315        }
4316        if (U_SUCCESS(status)) {
4317            RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4318        }
4319        else {
4320            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4321        }
4322        delete bi;
4323    }
4324
4325#endif
4326}
4327
4328//
4329//  Run a RBBI monkey test.  Common routine, for all break iterator types.
4330//    Parameters:
4331//       bi      - the break iterator to use
4332//       mk      - MonkeyKind, abstraction for obtaining expected results
4333//       name    - Name of test (char, word, etc.) for use in error messages
4334//       seed    - Seed for starting random number generator (parameter from user)
4335//       numIterations
4336//
4337void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4338                         int32_t numIterations, UBool useUText) {
4339
4340#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4341
4342    const int32_t    TESTSTRINGLEN = 500;
4343    UnicodeString    testText;
4344    int32_t          numCharClasses;
4345    UVector          *chClasses;
4346    int              expected[TESTSTRINGLEN*2 + 1];
4347    int              expectedCount = 0;
4348    char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4349    char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4350    char             reverseBreaks[TESTSTRINGLEN*2+1];
4351    char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4352    char             followingBreaks[TESTSTRINGLEN*2+1];
4353    char             precedingBreaks[TESTSTRINGLEN*2+1];
4354    int              i;
4355    int              loopCount = 0;
4356
4357    m_seed = seed;
4358
4359    numCharClasses = mk.charClasses()->size();
4360    chClasses      = mk.charClasses();
4361
4362    // Check for errors that occured during the construction of the MonkeyKind object.
4363    //  Can't report them where they occured because errln() is a method coming from intlTest,
4364    //  and is not visible outside of RBBITest :-(
4365    if (U_FAILURE(mk.deferredStatus)) {
4366        errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4367        return;
4368    }
4369
4370    // Verify that the character classes all have at least one member.
4371    for (i=0; i<numCharClasses; i++) {
4372        UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4373        if (s == NULL || s->size() == 0) {
4374            errln("Character Class #%d is null or of zero size.", i);
4375            return;
4376        }
4377    }
4378
4379    while (loopCount < numIterations || numIterations == -1) {
4380        if (numIterations == -1 && loopCount % 10 == 0) {
4381            // If test is running in an infinite loop, display a periodic tic so
4382            //   we can tell that it is making progress.
4383            fprintf(stderr, ".");
4384        }
4385        // Save current random number seed, so that we can recreate the random numbers
4386        //   for this loop iteration in event of an error.
4387        seed = m_seed;
4388
4389        // Populate a test string with data.
4390        testText.truncate(0);
4391        for (i=0; i<TESTSTRINGLEN; i++) {
4392            int32_t  aClassNum = m_rand() % numCharClasses;
4393            UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4394            int32_t   charIdx = m_rand() % classSet->size();
4395            UChar32   c = classSet->charAt(charIdx);
4396            if (c < 0) {   // TODO:  deal with sets containing strings.
4397                errln("c < 0");
4398                break;
4399            }
4400            testText.append(c);
4401        }
4402
4403        // Calculate the expected results for this test string.
4404        mk.setText(testText);
4405        memset(expectedBreaks, 0, sizeof(expectedBreaks));
4406        expectedBreaks[0] = 1;
4407        int32_t breakPos = 0;
4408        expectedCount = 0;
4409        for (;;) {
4410            breakPos = mk.next(breakPos);
4411            if (breakPos == -1) {
4412                break;
4413            }
4414            if (breakPos > testText.length()) {
4415                errln("breakPos > testText.length()");
4416            }
4417            expectedBreaks[breakPos] = 1;
4418            U_ASSERT(expectedCount<testText.length());
4419            expected[expectedCount ++] = breakPos;
4420        }
4421
4422        // Find the break positions using forward iteration
4423        memset(forwardBreaks, 0, sizeof(forwardBreaks));
4424        if (useUText) {
4425            UErrorCode status = U_ZERO_ERROR;
4426            UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4427            // testUText = utext_openUnicodeString(testUText, &testText, &status);
4428            bi->setText(testUText, status);
4429            TEST_ASSERT_SUCCESS(status);
4430            utext_close(testUText);   // The break iterator does a shallow clone of the UText
4431                                      //  This UText can be closed immediately, so long as the
4432                                      //  testText string continues to exist.
4433        } else {
4434            bi->setText(testText);
4435        }
4436
4437        for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4438            if (i < 0 || i > testText.length()) {
4439                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4440                break;
4441            }
4442            forwardBreaks[i] = 1;
4443        }
4444
4445        // Find the break positions using reverse iteration
4446        memset(reverseBreaks, 0, sizeof(reverseBreaks));
4447        for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4448            if (i < 0 || i > testText.length()) {
4449                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4450                break;
4451            }
4452            reverseBreaks[i] = 1;
4453        }
4454
4455        // Find the break positions using isBoundary() tests.
4456        memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4457        U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4458        for (i=0; i<=testText.length(); i++) {
4459            isBoundaryBreaks[i] = bi->isBoundary(i);
4460        }
4461
4462
4463        // Find the break positions using the following() function.
4464        // printf(".");
4465        memset(followingBreaks, 0, sizeof(followingBreaks));
4466        int32_t   lastBreakPos = 0;
4467        followingBreaks[0] = 1;
4468        for (i=0; i<testText.length(); i++) {
4469            breakPos = bi->following(i);
4470            if (breakPos <= i ||
4471                breakPos < lastBreakPos ||
4472                breakPos > testText.length() ||
4473                breakPos > lastBreakPos && lastBreakPos > i ) {
4474                errln("%s break monkey test: "
4475                    "Out of range value returned by BreakIterator::following().\n"
4476                        "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4477                         name, seed, i, breakPos, lastBreakPos);
4478                break;
4479            }
4480            followingBreaks[breakPos] = 1;
4481            lastBreakPos = breakPos;
4482        }
4483
4484        // Find the break positions using the preceding() function.
4485        memset(precedingBreaks, 0, sizeof(precedingBreaks));
4486        lastBreakPos = testText.length();
4487        precedingBreaks[testText.length()] = 1;
4488        for (i=testText.length(); i>0; i--) {
4489            breakPos = bi->preceding(i);
4490            if (breakPos >= i ||
4491                breakPos > lastBreakPos ||
4492                breakPos < 0 && testText.getChar32Start(i)>0 ||
4493                breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i) ) {
4494                errln("%s break monkey test: "
4495                    "Out of range value returned by BreakIterator::preceding().\n"
4496                    "index=%d;  prev returned %d; lastBreak=%d" ,
4497                    name,  i, breakPos, lastBreakPos);
4498                if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4499                    precedingBreaks[i] = 2;   // Forces an error.
4500                }
4501            } else {
4502                if (breakPos >= 0) {
4503                    precedingBreaks[breakPos] = 1;
4504                }
4505                lastBreakPos = breakPos;
4506            }
4507        }
4508
4509        // Compare the expected and actual results.
4510        for (i=0; i<=testText.length(); i++) {
4511            const char *errorType = NULL;
4512            if  (forwardBreaks[i] != expectedBreaks[i]) {
4513                errorType = "next()";
4514            } else if (reverseBreaks[i] != forwardBreaks[i]) {
4515                errorType = "previous()";
4516            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4517                errorType = "isBoundary()";
4518            } else if (followingBreaks[i] != expectedBreaks[i]) {
4519                errorType = "following()";
4520            } else if (precedingBreaks[i] != expectedBreaks[i]) {
4521                errorType = "preceding()";
4522            }
4523
4524
4525            if (errorType != NULL) {
4526                // Format a range of the test text that includes the failure as
4527                //  a data item that can be included in the rbbi test data file.
4528
4529                // Start of the range is the last point where expected and actual results
4530                //   both agreed that there was a break position.
4531                int startContext = i;
4532                int32_t count = 0;
4533                for (;;) {
4534                    if (startContext==0) { break; }
4535                    startContext --;
4536                    if (expectedBreaks[startContext] != 0) {
4537                        if (count == 2) break;
4538                        count ++;
4539                    }
4540                }
4541
4542                // End of range is two expected breaks past the start position.
4543                int endContext = i + 1;
4544                int ci;
4545                for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4546                    for (;;) {
4547                        if (endContext >= testText.length()) {break;}
4548                        if (expectedBreaks[endContext-1] != 0) {
4549                            if (count == 0) break;
4550                            count --;
4551                        }
4552                        endContext ++;
4553                    }
4554                }
4555
4556                // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4557                UnicodeString errorText = "<data>";
4558                /***if (strcmp(errorType, "next()") == 0) {
4559                    startContext = 0;
4560                    endContext = testText.length();
4561
4562                    printStringBreaks(testText, expected, expectedCount);
4563                }***/
4564
4565                for (ci=startContext; ci<endContext;) {
4566                    UnicodeString hexChars("0123456789abcdef");
4567                    UChar32  c;
4568                    int      bn;
4569                    c = testText.char32At(ci);
4570                    if (ci == i) {
4571                        // This is the location of the error.
4572                        errorText.append("<?>");
4573                    } else if (expectedBreaks[ci] != 0) {
4574                        // This a non-error expected break position.
4575                        errorText.append("\\");
4576                    }
4577                    if (c < 0x10000) {
4578                        errorText.append("\\u");
4579                        for (bn=12; bn>=0; bn-=4) {
4580                            errorText.append(hexChars.charAt((c>>bn)&0xf));
4581                        }
4582                    } else {
4583                        errorText.append("\\U");
4584                        for (bn=28; bn>=0; bn-=4) {
4585                            errorText.append(hexChars.charAt((c>>bn)&0xf));
4586                        }
4587                    }
4588                    ci = testText.moveIndex32(ci, 1);
4589                }
4590                errorText.append("\\");
4591                errorText.append("</data>\n");
4592
4593                // Output the error
4594                char  charErrorTxt[500];
4595                UErrorCode status = U_ZERO_ERROR;
4596                errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4597                charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4598                errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4599                    name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4600                    errorType, seed, i, charErrorTxt);
4601                break;
4602            }
4603        }
4604
4605        loopCount++;
4606    }
4607#endif
4608}
4609
4610//
4611//  TestDebug    -  A place-holder test for debugging purposes.
4612//                  For putting in fragments of other tests that can be invoked
4613//                  for tracing  without a lot of unwanted extra stuff happening.
4614//
4615void RBBITest::TestDebug(void) {
4616#if 0
4617    UErrorCode   status = U_ZERO_ERROR;
4618    int pos = 0;
4619    int ruleStatus = 0;
4620
4621    RuleBasedBreakIterator* bi =
4622       // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4623       // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4624       (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4625    UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4626    // UnicodeString s("Aaa.  Bcd");
4627    s = s.unescape();
4628    bi->setText(s);
4629    UBool r = bi->isBoundary(8);
4630    printf("%s", r?"true":"false");
4631    return;
4632    pos = bi->last();
4633    do {
4634        // ruleStatus = bi->getRuleStatus();
4635        printf("%d\t%d\n", pos, ruleStatus);
4636        pos = bi->previous();
4637    } while (pos != BreakIterator::DONE);
4638#endif
4639}
4640
4641#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
4642