1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2010, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6/************************************************************************
7*   Date        Name        Description
8*   12/15/99    Madhu        Creation.
9*   01/12/2000  Madhu        Updated for changed API and added new tests
10************************************************************************/
11
12#include "unicode/utypeinfo.h"  // for 'typeid' to work
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/utypes.h"
19#include "unicode/brkiter.h"
20#include "unicode/rbbi.h"
21#include "unicode/uchar.h"
22#include "unicode/utf16.h"
23#include "unicode/ucnv.h"
24#include "unicode/schriter.h"
25#include "unicode/uniset.h"
26#include "unicode/regex.h"        // TODO: make conditional on regexp being built.
27#include "unicode/ustring.h"
28#include "unicode/utext.h"
29#include "intltest.h"
30#include "rbbitst.h"
31#include <string.h>
32#include "uvector.h"
33#include "uvectr32.h"
34#include "triedict.h"
35#include <string.h>
36#include <stdio.h>
37#include <stdlib.h>
38#include "unicode/numfmt.h"
39#include "unicode/uscript.h"
40
// TEST_ASSERT(x)
//   Reports a test failure, tagged with the current source file and line,
//   when the expression x evaluates to false.  Expands to a braced block
//   that calls errln(), so it is only usable where errln() is in scope.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}
43
// TEST_ASSERT_SUCCESS(errcode)
//   Reports a failure through errcheckln() when errcode satisfies U_FAILURE().
//   The message carries the current source file, line and the name of the
//   error code as returned by u_errorName().
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
46
47
48//---------------------------------------------
49// runIndexedTest
50//---------------------------------------------
51
52void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
53{
54    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
55
56    switch (index) {
57#if !UCONFIG_NO_FILE_IO
58        case 0: name = "TestBug4153072";
59            if(exec) TestBug4153072();                         break;
60#else
61        case 0: name = "skip";
62            break;
63#endif
64
65        case 1: name = "TestJapaneseLineBreak";
66            if(exec) TestJapaneseLineBreak();                  break;
67        case 2: name = "TestStatusReturn";
68            if(exec) TestStatusReturn();                       break;
69
70#if !UCONFIG_NO_FILE_IO
71        case 3: name = "TestUnicodeFiles";
72            if(exec) TestUnicodeFiles();                       break;
73        case 4: name = "TestEmptyString";
74            if(exec) TestEmptyString();                        break;
75#else
76        case 3: case 4: name = "skip";
77            break;
78#endif
79
80        case 5: name = "TestGetAvailableLocales";
81            if(exec) TestGetAvailableLocales();                break;
82
83        case 6: name = "TestGetDisplayName";
84            if(exec) TestGetDisplayName();                     break;
85
86#if !UCONFIG_NO_FILE_IO
87        case 7: name = "TestEndBehaviour";
88            if(exec) TestEndBehaviour();                       break;
89        case 8: name = "TestMixedThaiLineBreak";
90             if(exec) TestMixedThaiLineBreak();                break;
91        case 9: name = "TestThaiLineBreak";
92             if(exec) TestThaiLineBreak();                     break;
93        case 10: name = "TestMaiyamok";
94             if(exec) TestMaiyamok();                          break;
95        case 11: name = "TestWordBreaks";
96             if(exec) TestWordBreaks();                        break;
97        case 12: name = "TestWordBoundary";
98             if(exec) TestWordBoundary();                      break;
99        case 13: name = "TestLineBreaks";
100             if(exec) TestLineBreaks();                        break;
101        case 14: name = "TestSentBreaks";
102             if(exec) TestSentBreaks();                        break;
103        case 15: name = "TestExtended";
104             if(exec) TestExtended();                          break;
105#else
106        case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
107             break;
108#endif
109
110        case 16:
111             if(exec) {
112 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
113               name = "TestMonkey";
114               TestMonkey(params);
115 #else
116               name = "skip";
117 #endif
118             }
119                                                               break;
120
121#if !UCONFIG_NO_FILE_IO
122        case 17: name = "TestBug3818";
123            if(exec) TestBug3818();                            break;
124        case 18: name = "TestJapaneseWordBreak";
125            if(exec) TestJapaneseWordBreak();                  break;
126#else
127        case 17: case 18: name = "skip";
128            break;
129#endif
130
131        case 19: name = "TestDebug";
132            if(exec) TestDebug();                              break;
133        case 20: name = "TestTrieDict";
134            if(exec) TestTrieDict();                           break;
135
136#if !UCONFIG_NO_FILE_IO
137        case 21: name = "TestBug5775";
138            if (exec) TestBug5775();                           break;
139        case 22: name = "TestThaiBreaks";
140            if (exec) TestThaiBreaks();                        break;
141        case 23: name = "TestTailoredBreaks";
142            if (exec) TestTailoredBreaks();                    break;
143        case 24: name = "TestTrieDictWithValue";
144            if(exec) TestTrieDictWithValue();                  break;
145#else
146        case 21: case 22: case 23: case 24: name = "skip";
147            break;
148#endif
149        case 25: name = "TestDictRules";
150            if (exec) TestDictRules();                         break;
151        case 25: name = "TestBug5532";
152            if (exec) TestBug5532();                           break;
153        default: name = ""; break; //needed to end loop
154    }
155}
156
157
158//---------------------------------------------------------------------------
159//
160//   class BITestData   Holds a set of Break iterator test data and results
161//                      Includes
162//                         - the string data to be broken
163//                         - a vector of the expected break positions.
164//                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
166//                         - The expected break tag values.
167//                         - Vectors of actual break positions and tag values.
168//                         - Functions for comparing actual with expected and
169//                            reporting errors.
170//
171//----------------------------------------------------------------------------
172class BITestData {
173public:
174    UnicodeString    fDataToBreak;
175    UVector          fExpectedBreakPositions;
176    UVector          fExpectedTags;
177    UVector          fLineNum;
178    UVector          fActualBreakPositions;   // Test Results.
179    UVector          fActualTags;
180
181    BITestData(UErrorCode &status);
182    void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
183    void             checkResults(const char *heading, RBBITest *test);
184    void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
185    void             clearResults();
186};
187
188//
189// Constructor.
190//
191BITestData::BITestData(UErrorCode &status)
192: fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
193  fActualTags(status)
194{
195}
196
197//
// addDataChunk.   Add a section (non-breaking) piece of data to the test data.
199//                 The macro form collects the line number, which is helpful
200//                 when tracking down failures.
201//
202//                 A null data item is inserted at the start of each test's data
203//                  to put the starting zero into the data list.  The position saved for
204//                  each non-null item is its ending position.
205//
206#define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
207void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
208    if (U_FAILURE(status)) {return;}
209    if (data != NULL) {
210        fDataToBreak.append(CharsToUnicodeString(data));
211    }
212    fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
213    fExpectedTags.addElement(tag, status);
214    fLineNum.addElement(lineNum, status);
215}
216
217
218//
219//  checkResults.   Compare the actual and expected break positions, report any differences.
220//
221void BITestData::checkResults(const char *heading, RBBITest *test) {
222    int32_t   expectedIndex = 0;
223    int32_t   actualIndex = 0;
224
225    for (;;) {
226        // If we've run through both the expected and actual results vectors, we're done.
227        //   break out of the loop.
228        if (expectedIndex >= fExpectedBreakPositions.size() &&
229            actualIndex   >= fActualBreakPositions.size()) {
230            break;
231        }
232
233
234        if (expectedIndex >= fExpectedBreakPositions.size()) {
235            err(heading, test, expectedIndex-1, actualIndex);
236            actualIndex++;
237            continue;
238        }
239
240        if (actualIndex >= fActualBreakPositions.size()) {
241            err(heading, test, expectedIndex, actualIndex-1);
242            expectedIndex++;
243            continue;
244        }
245
246        if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
247            err(heading, test, expectedIndex, actualIndex);
248            // Try to resync the positions of the indices, to avoid a rash of spurious erros.
249            if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
250                actualIndex++;
251            } else {
252                expectedIndex++;
253            }
254            continue;
255        }
256
257        if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
258            test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
259                heading, fLineNum.elementAt(expectedIndex),
260                fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
261        }
262
263        actualIndex++;
264        expectedIndex++;
265    }
266}
267
268//
269//  err   -  An error was found.  Report it, along with information about where the
270//                                incorrectly broken test data appeared in the source file.
271//
272void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
273{
274    int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
275    int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
276    int32_t   o        = 0;
277    int32_t   line     = fLineNum.elementAti(expectedIdx);
278    if (expectedIdx > 0) {
279        // The line numbers are off by one because a premature break occurs somewhere
280        //    within the previous item, rather than at the start of the current (expected) item.
281        //    We want to report the offset of the unexpected break from the start of
282        //      this previous item.
283        o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
284    }
285    if (actual < expected) {
286        test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
287    } else {
288        test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
289    }
290}
291
292
293void BITestData::clearResults() {
294    fActualBreakPositions.removeAllElements();
295    fActualTags.removeAllElements();
296}
297
298
299//-----------------------------------------------------------------------------------
300//
//    Canned Test Characters
302//
303//-----------------------------------------------------------------------------------
304
305static const UChar cannedTestArray[] = {
306    0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
307    0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
308    0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
309    0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
310    0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
311    0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
312    0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
313    0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
314};
315
316static UnicodeString* cannedTestChars = 0;
317
// Devanagari sequences.  Each literal holds \uXXXX escape text rather than the
// characters themselves, so it has to be unescaped before use (presumably with
// CharsToUnicodeString(), as addDataChunk() does -- confirm at the use sites).
// The half-form macros end in U+200D after the U+094D virama; deadTA has none.
#define  halfNA     "\\u0928\\u094d\\u200d"   /* U+0928 U+094D U+200D */
#define  halfSA     "\\u0938\\u094d\\u200d"   /* U+0938 U+094D U+200D */
#define  halfCHA    "\\u091a\\u094d\\u200d"   /* U+091A U+094D U+200D */
#define  halfKA     "\\u0915\\u094d\\u200d"   /* U+0915 U+094D U+200D */
#define  deadTA     "\\u0924\\u094d"          /* U+0924 U+094D        */
323
324//--------------------------------------------------------------------------------------
325//
326//    RBBITest    constructor and destructor
327//
328//--------------------------------------------------------------------------------------
329
330RBBITest::RBBITest() {
331    UnicodeString temp(cannedTestArray);
332    cannedTestChars = new UnicodeString();
333    *cannedTestChars += (UChar)0x0000;
334    *cannedTestChars += temp;
335}
336
337
338RBBITest::~RBBITest() {
339    delete cannedTestChars;
340}
341
342
// Tag constants for the break iterator tests.
// NOTE(review): the values look like the UBRK_WORD_NUMBER / _LETTER / _KANA /
// _IDEO rule status tags -- confirm against ubrk.h before relying on that.
static const int T_NUMBER = 100;   // numbers
static const int T_LETTER = 200;   // letters
static const int T_H_OR_K = 300;   // presumably Hiragana or Katakana -- confirm
static const int T_IDEO   = 400;   // ideographs
347
348
349
350
351
352
353//--------------------------------------------------------------------
354//Testing the BreakIterator for devanagari script
355//--------------------------------------------------------------------
356
// Further Devanagari sequences, again as \uXXXX escape text.  Each "dead" form
// is a consonant followed by the virama U+094D.
#define deadRA   "\\u0930\\u094d"         /* dead form RA   = Devanagari RA   + virama */
#define deadPHA  "\\u092b\\u094d"         /* dead form PHA  = Devanagari PHA  + virama */
#define deadTTHA "\\u0920\\u094d"         /* dead form TTHA = Devanagari TTHA + virama */
#define deadPA   "\\u092a\\u094d"         /* dead form PA   = Devanagari PA   + virama */
#define deadSA   "\\u0938\\u094d"         /* dead form SA   = Devanagari SA   + virama */
#define visarga  "\\u0903"                /* Devanagari visarga; looks like an English colon */
363
364
365
366
367
368
369//-----------------------------------------------------------------------------------
370//
371//   Test for status {tag} return value from break rules.
372//        TODO:  a more thorough test.
373//
374//-----------------------------------------------------------------------------------
375void RBBITest::TestStatusReturn() {
376     UnicodeString rulesString1("$Letters = [:L:];\n"
377                                  "$Numbers = [:N:];\n"
378                                  "$Letters+{1};\n"
379                                  "$Numbers+{2};\n"
380                                  "Help\\ {4}/me\\!;\n"
381                                  "[^$Letters $Numbers];\n"
382                                  "!.*;\n", -1, US_INV);
383     UnicodeString testString1  = "abc123..abc Help me Help me!";
384                                // 01234567890123456789012345678
385     int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
386     int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
387
388     UErrorCode status=U_ZERO_ERROR;
389     UParseError    parseError;
390
391     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
392     if(U_FAILURE(status)) {
393         dataerrln("FAIL : in construction - %s", u_errorName(status));
394     } else {
395         int32_t  pos;
396         int32_t  i = 0;
397         bi->setText(testString1);
398         for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
399             if (pos != bounds1[i]) {
400                 errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
401                 break;
402             }
403
404             int tag = bi->getRuleStatus();
405             if (tag != brkStatus[i]) {
406                 errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
407                 break;
408             }
409             i++;
410         }
411     }
412     delete bi;
413}
414
415
416static void printStringBreaks(UnicodeString ustr, int expected[],
417                              int expectedcount)
418{
419    UErrorCode status = U_ZERO_ERROR;
420    char name[100];
421    printf("code    alpha extend alphanum type word sent line name\n");
422    int j;
423    for (j = 0; j < ustr.length(); j ++) {
424        if (expectedcount > 0) {
425            int k;
426            for (k = 0; k < expectedcount; k ++) {
427                if (j == expected[k]) {
428                    printf("------------------------------------------------ %d\n",
429                           j);
430                }
431            }
432        }
433        UChar32 c = ustr.char32At(j);
434        if (c > 0xffff) {
435            j ++;
436        }
437        u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
438        printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
439                           u_isUAlphabetic(c),
440                           u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
441                           u_isalnum(c),
442                           u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
443                                                  u_charType(c),
444                                                  U_SHORT_PROPERTY_NAME),
445                           u_getPropertyValueName(UCHAR_WORD_BREAK,
446                                                  u_getIntPropertyValue(c,
447                                                          UCHAR_WORD_BREAK),
448                                                  U_SHORT_PROPERTY_NAME),
449                           u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
450                                   u_getIntPropertyValue(c,
451                                           UCHAR_SENTENCE_BREAK),
452                                   U_SHORT_PROPERTY_NAME),
453                           u_getPropertyValueName(UCHAR_LINE_BREAK,
454                                   u_getIntPropertyValue(c,
455                                           UCHAR_LINE_BREAK),
456                                   U_SHORT_PROPERTY_NAME),
457                           name);
458    }
459}
460
461void RBBITest::TestThaiLineBreak() {
462    UErrorCode status = U_ZERO_ERROR;
463    BITestData thaiLineSelection(status);
464
465    // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
466    // represents elided letters at the end of a long word.  It should be bound to
467    // the end of the word and not treated as an independent punctuation mark.
468
469
470    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
471    ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
472    ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
473    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
474    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
475//        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
476//        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
477    ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
478    // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
479    ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
480    ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
481    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
482    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
483    ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
484    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
485
486    // the one time where the paiyannoi occurs somewhere other than at the end
487    // of a word is in the Thai abbrevation for "etc.", which both begins and
488    // ends with a paiyannoi
489    ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
490    ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
491    ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
492
493    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
494        Locale("th"), status);
495    if (U_FAILURE(status))
496    {
497        errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status));
498        return;
499    }
500
501    generalIteratorTest(*e, thaiLineSelection);
502    delete e;
503}
504
505
506
507void RBBITest::TestMixedThaiLineBreak()
508{
509    UErrorCode   status = U_ZERO_ERROR;
510    BITestData   thaiLineSelection(status);
511
512    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
513
514
515    // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
516    // start
517
518    ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
519    ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
520    ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
521    ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
522    ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
523    ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
524    ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
525    ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
526    ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
527    ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
528    ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
529    ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
530    ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
531    ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
532    ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
533    ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
534
535    // @suwit - end of changes
536
537
538    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
539    if (U_FAILURE(status))
540    {
541        errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status));
542        return;
543    }
544
545
546    generalIteratorTest(*e, thaiLineSelection);
547    delete e;
548}
549
550
551void RBBITest::TestMaiyamok()
552{
553    UErrorCode status = U_ZERO_ERROR;
554    BITestData   thaiLineSelection(status);
555    ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
556    // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
557    // word".  Instead of appearing as a word unto itself, however, it's kept together
558    // with the word before it
559    ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
560    ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
561    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
562    ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
563    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
564    ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
565    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
566    ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
567    ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
568
569    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
570        Locale("th"), status);
571
572    if (U_FAILURE(status))
573    {
574        errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status));
575        return;
576    }
577    generalIteratorTest(*e, thaiLineSelection);
578    delete e;
579}
580
581
582
583void RBBITest::TestBug3818() {
584    UErrorCode  status = U_ZERO_ERROR;
585
586    // Four Thai words...
587    static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
588                                           0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
589    UnicodeString  thaiStr(thaiWordData);
590
591    RuleBasedBreakIterator* bi =
592        (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
593    if (U_FAILURE(status) || bi == NULL) {
594        errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
595        return;
596    }
597    bi->setText(thaiStr);
598
599    int32_t  startOfSecondWord = bi->following(1);
600    if (startOfSecondWord != 4) {
601        errln("Fail at file %s, line %d expected start of word at 4, got %d",
602            __FILE__, __LINE__, startOfSecondWord);
603    }
604    startOfSecondWord = bi->following(0);
605    if (startOfSecondWord != 4) {
606        errln("Fail at file %s, line %d expected start of word at 4, got %d",
607            __FILE__, __LINE__, startOfSecondWord);
608    }
609    delete bi;
610}
611
612
613void RBBITest::TestJapaneseWordBreak() {
614// TODO: Rewrite this test for a dictionary-based word breaking.
615#if 0
616    UErrorCode status = U_ZERO_ERROR;
617    BITestData   japaneseWordSelection(status);
618
619    ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
620    ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
621    ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
622    ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
623    ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
624    ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
625    ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
626
627    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
628        Locale("ja"), status);
629    if (U_FAILURE(status))
630    {
631        errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
632        return;
633    }
634
635    generalIteratorTest(*e, japaneseWordSelection);
636    delete e;
637#endif
638}
639
640void RBBITest::TestTrieDict() {
641    UErrorCode      status  = U_ZERO_ERROR;
642
643    //
644    //  Open and read the test data file.
645    //
646    const char *testDataDirectory = IntlTest::getSourceTestData(status);
647    char testFileName[1000];
648    if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
649        errln("Can't open test data.  Path too long.");
650        return;
651    }
652    strcpy(testFileName, testDataDirectory);
653    strcat(testFileName, "riwords.txt");
654
655    // Items needing deleting at the end
656    MutableTrieDictionary *mutableDict = NULL;
657    CompactTrieDictionary *compactDict = NULL;
658    UnicodeSet            *breaks      = NULL;
659    UChar                 *testFile    = NULL;
660    StringEnumeration     *enumer1     = NULL;
661    StringEnumeration     *enumer2     = NULL;
662    MutableTrieDictionary *mutable2    = NULL;
663    StringEnumeration     *cloneEnum   = NULL;
664    CompactTrieDictionary *compact2    = NULL;
665
666
667    const UnicodeString *originalWord = NULL;
668    const UnicodeString *cloneWord    = NULL;
669    UChar *current;
670    UChar *word;
671    UChar uc;
672    int32_t wordLen;
673    int32_t wordCount;
674    int32_t testCount;
675
676    int    len;
677    testFile = ReadAndConvertFile(testFileName, len, NULL, status);
678    if (U_FAILURE(status)) {
679        goto cleanup; /* something went wrong, error already output */
680    }
681
682    mutableDict = new MutableTrieDictionary(0x0E1C, status);
683    if (U_FAILURE(status)) {
684        errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
685        goto cleanup;
686    }
687
688    breaks = new UnicodeSet;
689    breaks->add(0x000A);     // Line Feed
690    breaks->add(0x000D);     // Carriage Return
691    breaks->add(0x2028);     // Line Separator
692    breaks->add(0x2029);     // Paragraph Separator
693
694    // Now add each non-comment line of the file as a word.
695    current = testFile;
696    word = current;
697    uc = *current++;
698    wordLen = 0;
699    wordCount = 0;
700
701    while (uc) {
702        if (uc == 0x0023) {     // #comment line, skip
703            while (uc && !breaks->contains(uc)) {
704                uc = *current++;
705            }
706        }
707        else while (uc && !breaks->contains(uc)) {
708            ++wordLen;
709            uc = *current++;
710        }
711        if (wordLen > 0) {
712            mutableDict->addWord(word, wordLen, status);
713            if (U_FAILURE(status)) {
714                errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
715                goto cleanup;
716            }
717            wordCount += 1;
718        }
719
720        // Find beginning of next line
721        while (uc && breaks->contains(uc)) {
722            uc = *current++;
723        }
724        word = current-1;
725        wordLen = 0;
726    }
727
728    if (wordCount < 50) {
729        errln("Word count (%d) unreasonably small\n", wordCount);
730        goto cleanup;
731    }
732
733    enumer1 = mutableDict->openWords(status);
734    if (U_FAILURE(status)) {
735        errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
736        goto cleanup;
737    }
738
739    testCount = 0;
740    if (wordCount != (testCount = enumer1->count(status))) {
741        errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
742            testCount, wordCount, u_errorName(status));
743        goto cleanup;
744    }
745
746    // Now compact it
747    compactDict = new CompactTrieDictionary(*mutableDict, status);
748    if (U_FAILURE(status)) {
749        errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
750        goto cleanup;
751    }
752
753    enumer2 = compactDict->openWords(status);
754    if (U_FAILURE(status)) {
755        errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
756        goto cleanup;
757    }
758
759    if (wordCount != (testCount = enumer2->count(status))) {
760        errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
761            testCount, wordCount, u_errorName(status));
762        goto cleanup;
763    }
764
765    if (typeid(*enumer1) == typeid(*enumer2)) {
766        errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same");
767    }
768    delete enumer1;
769    enumer1 = NULL;
770    delete enumer2;
771    enumer2 = NULL;
772
773    // Now un-compact it
774    mutable2 = compactDict->cloneMutable(status);
775    if (U_FAILURE(status)) {
776        errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
777        goto cleanup;
778    }
779
780    cloneEnum = mutable2->openWords(status);
781    if (U_FAILURE(status)) {
782        errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
783        goto cleanup;
784    }
785
786    if (wordCount != (testCount = cloneEnum->count(status))) {
787        errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
788            testCount, wordCount, u_errorName(status));
789        goto cleanup;
790    }
791
792    // Compact original dictionary to clone. Note that we can only compare the same kind of
793    // dictionary as the order of the enumerators is not guaranteed to be the same between
794    // different kinds
795    enumer1 = mutableDict->openWords(status);
796    if (U_FAILURE(status)) {
797        errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
798        goto cleanup;
799     }
800
801    originalWord = enumer1->snext(status);
802    cloneWord = cloneEnum->snext(status);
803    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
804        if (*originalWord != *cloneWord) {
805            errln("Original and cloned MutableTrieDictionary word mismatch\n");
806            goto cleanup;
807        }
808        originalWord = enumer1->snext(status);
809        cloneWord = cloneEnum->snext(status);
810    }
811
812    if (U_FAILURE(status)) {
813        errln("Enumeration failed: %s\n", u_errorName(status));
814        goto cleanup;
815    }
816
817    if (originalWord != cloneWord) {
818        errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
819        goto cleanup;
820    }
821
822    // Test the data copying constructor for CompactTrieDict, and the data access APIs.
823    compact2 = new CompactTrieDictionary(compactDict->data(), status);
824    if (U_FAILURE(status)) {
825        errln("CompactTrieDictionary(const void *,...) failed\n");
826        goto cleanup;
827    }
828
829    if (compact2->dataSize() == 0) {
830        errln("CompactTrieDictionary->dataSize() == 0\n");
831        goto cleanup;
832    }
833
834    // Now count the words via the second dictionary
835    delete enumer1;
836    enumer1 = compact2->openWords(status);
837    if (U_FAILURE(status)) {
838        errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
839        goto cleanup;
840    }
841
842    if (wordCount != (testCount = enumer1->count(status))) {
843        errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
844            testCount, wordCount, u_errorName(status));
845        goto cleanup;
846    }
847
848cleanup:
849    delete compactDict;
850    delete mutableDict;
851    delete breaks;
852    delete[] testFile;
853    delete enumer1;
854    delete mutable2;
855    delete cloneEnum;
856    delete compact2;
857}
858
859/*TODO: delete later*/
860inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){
861    UErrorCode      status  = U_ZERO_ERROR;
862    FILE *outfile = fopen(filename,"w");
863    UConverter *cvt = ucnv_open("UTF-8", &status);
864    if (U_FAILURE(status))
865        return;
866    if(outfile != NULL){
867        status = U_ZERO_ERROR;
868        const UnicodeString *word = enumer->snext(status);
869        while (word != NULL && U_SUCCESS(status)) {
870            char u8word[500];
871            status = U_ZERO_ERROR;
872            ucnv_fromUChars(cvt, u8word, 500, word->getBuffer(), word->length(),
873                    &status);
874            fprintf(outfile,"%s\n", u8word);
875            status = U_ZERO_ERROR;
876            word = enumer->snext(status);
877        }
878        fclose(outfile);
879    }
880    ucnv_close(cvt);
881}
882
// A very simple helper class to streamline the buffer handling in
// TestTrieDictWithValue
//
// Provides storage for `size` elements of type T.  Requests of up to N
// elements are served from an array embedded in the object; larger requests
// are served from an array allocated with new[] that is released by the
// destructor.
//
// The class is deliberately non-copyable: a copy would either alias the
// other object's embedded array or delete the heap array twice.
template<class T, size_t N>
class AutoBuffer {
 public:
  // Reserve room for `size` elements; only allocates when size > N.
  AutoBuffer(size_t size) : buffer(stackBuffer) {
    if (size > N)
      buffer = new T[size];
  }
  // Release the heap array, if one was needed.
  ~AutoBuffer() {
    if (buffer != stackBuffer)
      delete [] buffer;
  }
  // Pointer to the first element of the active storage.
  T* elems() {
    return buffer;
  }
  // Unchecked element access.
  const T& operator[] (size_t i) const {
    return buffer[i];
  }
  T& operator[] (size_t i) {
    return buffer[i];
  }
 private:
  T stackBuffer[N];   // embedded storage used when size <= N
  T* buffer;          // points at stackBuffer or at a new[] allocation
  AutoBuffer();
  // Declared but never defined: copying is not supported.
  AutoBuffer(const AutoBuffer&);
  AutoBuffer& operator=(const AutoBuffer&);
};
910
911//----------------------------------------------------------------------------
912//
913// TestTrieDictWithValue    Test trie dictionaries with logprob values and
914// more than 2^16 nodes after compaction.
915//
916//----------------------------------------------------------------------------
917void RBBITest::TestTrieDictWithValue() {
918    UErrorCode      status  = U_ZERO_ERROR;
919
920    //
921    //  Open and read the test data file.
922    //
923    const char *testDataDirectory = IntlTest::getSourceTestData(status);
924    const char *filename = "cjdict-truncated.txt";
925    char testFileName[1000];
926    if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename) + 10 >= sizeof(testFileName)) {
927        errln("Can't open test data.  Path too long.");
928        return;
929    }
930    strcpy(testFileName, testDataDirectory);
931    strcat(testFileName, filename);
932
933    // Items needing deleting at the end
934    MutableTrieDictionary *mutableDict = NULL;
935    CompactTrieDictionary *compactDict = NULL;
936    UnicodeSet            *breaks      = NULL;
937    UChar                 *testFile    = NULL;
938    StringEnumeration     *enumer1     = NULL;
939    StringEnumeration     *enumer2     = NULL;
940    MutableTrieDictionary *mutable2    = NULL;
941    StringEnumeration     *cloneEnum   = NULL;
942    CompactTrieDictionary *compact2    = NULL;
943    NumberFormat          *nf           = NULL;
944    UText *originalText = NULL, *cloneText = NULL;
945
946    const UnicodeString *originalWord = NULL;
947    const UnicodeString *cloneWord    = NULL;
948    UChar *current;
949    UChar *word;
950    UChar uc;
951    int32_t wordLen;
952    int32_t wordCount;
953    int32_t testCount;
954    int32_t valueLen;
955    int counter = 0;
956
957    int    len;
958    testFile = ReadAndConvertFile(testFileName, len, NULL, status);
959    if (U_FAILURE(status)) {
960        goto cleanup; /* something went wrong, error already output */
961    }
962
963    mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE);
964    if (U_FAILURE(status)) {
965        errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
966        goto cleanup;
967    }
968
969    breaks = new UnicodeSet;
970    breaks->add(0x000A);     // Line Feed
971    breaks->add(0x000D);     // Carriage Return
972    breaks->add(0x2028);     // Line Separator
973    breaks->add(0x2029);     // Paragraph Separator
974    breaks->add(0x0009);     // Tab character
975
976    // Now add each non-comment line of the file as a word.
977    current = testFile;
978    word = current;
979    uc = *current++;
980    wordLen = 0;
981    wordCount = 0;
982    nf = NumberFormat::createInstance(status);
983
984    while (uc) {
985        UnicodeString ucharValue;
986        valueLen = 0;
987
988        if (uc == 0x0023) {     // #comment line, skip
989            while (uc && !breaks->contains(uc)) {
990                uc = *current++;
991            }
992        }
993        else{
994            while (uc && !breaks->contains(uc)) {
995                ++wordLen;
996                uc = *current++;
997            }
998            if(uc == 0x0009){ //separator is a tab char, read in num after tab
999                uc = *current++;
1000                while (uc && !breaks->contains(uc)) {
1001                    ucharValue.append(uc);
1002                    uc = *current++;
1003                }
1004            }
1005        }
1006        if (wordLen > 0) {
1007            Formattable value((int32_t)0);
1008            nf->parse(ucharValue.getTerminatedBuffer(), value, status);
1009
1010            if(U_FAILURE(status)){
1011                errln("parsing of value failed when reading in dictionary\n");
1012                goto cleanup;
1013            }
1014            mutableDict->addWord(word, wordLen, status, value.getLong());
1015            if (U_FAILURE(status)) {
1016                errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
1017                goto cleanup;
1018            }
1019            wordCount += 1;
1020        }
1021
1022        // Find beginning of next line
1023        while (uc && breaks->contains(uc)) {
1024            uc = *current++;
1025        }
1026        word = current-1;
1027        wordLen = 0;
1028    }
1029
1030    if (wordCount < 50) {
1031        errln("Word count (%d) unreasonably small\n", wordCount);
1032        goto cleanup;
1033    }
1034
1035    enumer1 = mutableDict->openWords(status);
1036    if (U_FAILURE(status)) {
1037        errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
1038        goto cleanup;
1039    }
1040
1041    testCount = 0;
1042    if (wordCount != (testCount = enumer1->count(status))) {
1043        errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
1044                testCount, wordCount, u_errorName(status));
1045        goto cleanup;
1046    }
1047
1048    // Now compact it
1049    compactDict = new CompactTrieDictionary(*mutableDict, status);
1050    if (U_FAILURE(status)) {
1051        errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
1052        goto cleanup;
1053    }
1054
1055    enumer2 = compactDict->openWords(status);
1056    if (U_FAILURE(status)) {
1057        errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
1058        goto cleanup;
1059    }
1060
1061
1062    //delete later
1063//    writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt");
1064//    writeEnumerationToFile(enumer2, "/home/jchye/compact.txt");
1065
1066    enumer1->reset(status);
1067    enumer2->reset(status);
1068
1069    originalWord = enumer1->snext(status);
1070    cloneWord = enumer2->snext(status);
1071    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
1072        if (*originalWord != *cloneWord) {
1073            errln("MutableTrieDictionary and CompactTrieDictionary word mismatch at %d, lengths are %d and %d\n",
1074                    counter, originalWord->length(), cloneWord->length());
1075            goto cleanup;
1076        }
1077
1078        // check if attached values of the same word in both dictionaries tally
1079#if 0
1080        int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()];
1081        uint16_t values1[originalWord->length()], values2[cloneWord->length()];
1082#endif
1083        AutoBuffer<int32_t, 20> lengths1(originalWord->length());
1084        AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
1085        AutoBuffer<uint16_t, 20> values1(originalWord->length());
1086        AutoBuffer<uint16_t, 20> values2(cloneWord->length());
1087
1088        originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
1089        cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
1090
1091        int count1, count2;
1092        mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
1093        compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
1094
1095        if(values1[count1-1] != values2[count2-1]){
1096            errln("Values of word %d in MutableTrieDictionary and CompactTrieDictionary do not match, with values %d and %d\n",
1097                  counter, values1[count1-1], values2[count2-1]);
1098            goto cleanup;
1099        }
1100
1101        counter++;
1102        originalWord = enumer1->snext(status);
1103        cloneWord = enumer2->snext(status);
1104    }
1105    if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
1106        errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
1107    }
1108
1109    delete enumer1;
1110    enumer1 = NULL;
1111    delete enumer2;
1112    enumer2 = NULL;
1113
1114    // Now un-compact it
1115    mutable2 = compactDict->cloneMutable(status);
1116    if (U_FAILURE(status)) {
1117        errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
1118        goto cleanup;
1119    }
1120
1121    cloneEnum = mutable2->openWords(status);
1122    if (U_FAILURE(status)) {
1123        errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
1124        goto cleanup;
1125    }
1126
1127    if (wordCount != (testCount = cloneEnum->count(status))) {
1128        errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
1129                testCount, wordCount, u_errorName(status));
1130        goto cleanup;
1131    }
1132
1133    // Compact original dictionary to clone. Note that we can only compare the same kind of
1134    // dictionary as the order of the enumerators is not guaranteed to be the same between
1135    // different kinds
1136    enumer1 = mutableDict->openWords(status);
1137    if (U_FAILURE(status)) {
1138        errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
1139        goto cleanup;
1140    }
1141
1142    counter = 0;
1143    originalWord = enumer1->snext(status);
1144    cloneWord = cloneEnum->snext(status);
1145    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
1146        if (*originalWord != *cloneWord) {
1147            errln("Original and cloned MutableTrieDictionary word mismatch\n");
1148            goto cleanup;
1149        }
1150
1151        // check if attached values of the same word in both dictionaries tally
1152        AutoBuffer<int32_t, 20> lengths1(originalWord->length());
1153        AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
1154        AutoBuffer<uint16_t, 20> values1(originalWord->length());
1155        AutoBuffer<uint16_t, 20> values2(cloneWord->length());
1156        originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
1157        cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
1158
1159        int count1, count2;
1160        mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
1161        mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
1162
1163        if(values1[count1-1] != values2[count2-1]){
1164            errln("Values of word %d in original and cloned MutableTrieDictionary do not match, with values %d and %d\n",
1165                  counter, values1[count1-1], values2[count2-1]);
1166            goto cleanup;
1167        }
1168
1169        counter++;
1170
1171        originalWord = enumer1->snext(status);
1172        cloneWord = cloneEnum->snext(status);
1173    }
1174
1175    if (U_FAILURE(status)) {
1176        errln("Enumeration failed: %s\n", u_errorName(status));
1177        goto cleanup;
1178    }
1179
1180    if (originalWord != cloneWord) {
1181        errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
1182        goto cleanup;
1183    }
1184
1185    // Test the data copying constructor for CompactTrieDict, and the data access APIs.
1186    compact2 = new CompactTrieDictionary(compactDict->data(), status);
1187    if (U_FAILURE(status)) {
1188        errln("CompactTrieDictionary(const void *,...) failed\n");
1189        goto cleanup;
1190    }
1191
1192    if (compact2->dataSize() == 0) {
1193        errln("CompactTrieDictionary->dataSize() == 0\n");
1194        goto cleanup;
1195    }
1196
1197    // Now count the words via the second dictionary
1198    delete enumer1;
1199    enumer1 = compact2->openWords(status);
1200    if (U_FAILURE(status)) {
1201        errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
1202        goto cleanup;
1203    }
1204
1205    if (wordCount != (testCount = enumer1->count(status))) {
1206        errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
1207                testCount, wordCount, u_errorName(status));
1208        goto cleanup;
1209    }
1210
1211    cleanup:
1212    delete compactDict;
1213    delete mutableDict;
1214    delete breaks;
1215    delete[] testFile;
1216    delete enumer1;
1217    delete mutable2;
1218    delete cloneEnum;
1219    delete compact2;
1220    utext_close(originalText);
1221    utext_close(cloneText);
1222
1223
1224}
1225
1226//----------------------------------------------------------------------------
1227//
1228// generalIteratorTest      Given a break iterator and a set of test data,
1229//                          Run the tests and report the results.
1230//
1231//----------------------------------------------------------------------------
1232void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
1233{
1234
1235    bi.setText(td.fDataToBreak);
1236
1237    testFirstAndNext(bi, td);
1238
1239    testLastAndPrevious(bi, td);
1240
1241    testFollowing(bi, td);
1242    testPreceding(bi, td);
1243    testIsBoundary(bi, td);
1244    doMultipleSelectionTest(bi, td);
1245}
1246
1247
1248//
1249//   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
1250//                       kind of loop.
1251//
1252void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
1253{
1254    UErrorCode  status = U_ZERO_ERROR;
1255    int32_t     p;
1256    int32_t     lastP = -1;
1257    int32_t     tag;
1258
1259    logln("Test first and next");
1260    bi.setText(td.fDataToBreak);
1261    td.clearResults();
1262
1263    for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
1264        td.fActualBreakPositions.addElement(p, status);  // Save result.
1265        tag = bi.getRuleStatus();
1266        td.fActualTags.addElement(tag, status);
1267        if (p <= lastP) {
1268            // If the iterator is not making forward progress, stop.
1269            //  No need to raise an error here, it'll be detected in the normal check of results.
1270            break;
1271        }
1272        lastP = p;
1273    }
1274    td.checkResults("testFirstAndNext", this);
1275}
1276
1277
1278//
1279//  TestLastAndPrevious.   Run the iterator backwards, starting with last().
1280//
1281void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
1282{
1283    UErrorCode  status = U_ZERO_ERROR;
1284    int32_t     p;
1285    int32_t     lastP  = 0x7ffffffe;
1286    int32_t     tag;
1287
1288    logln("Test last and previous");
1289    bi.setText(td.fDataToBreak);
1290    td.clearResults();
1291
1292    for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
1293        // Save break position.  Insert it at start of vector of results, shoving
1294        //    already-saved results further towards the end.
1295        td.fActualBreakPositions.insertElementAt(p, 0, status);
1296        // bi.previous();   // TODO:  Why does this fix things up????
1297        // bi.next();
1298        tag = bi.getRuleStatus();
1299        td.fActualTags.insertElementAt(tag, 0, status);
1300        if (p >= lastP) {
1301            // If the iterator is not making progress, stop.
1302            //  No need to raise an error here, it'll be detected in the normal check of results.
1303            break;
1304        }
1305        lastP = p;
1306    }
1307    td.checkResults("testLastAndPrevious", this);
1308}
1309
1310
1311void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
1312{
1313    UErrorCode  status = U_ZERO_ERROR;
1314    int32_t     p;
1315    int32_t     tag;
1316    int32_t     lastP  = -2;     // A value that will never be returned as a break position.
1317                                 //   cannot be -1; that is returned for DONE.
1318    int         i;
1319
1320    logln("testFollowing():");
1321    bi.setText(td.fDataToBreak);
1322    td.clearResults();
1323
1324    // Save the starting point, since we won't get that out of following.
1325    p = bi.first();
1326    td.fActualBreakPositions.addElement(p, status);  // Save result.
1327    tag = bi.getRuleStatus();
1328    td.fActualTags.addElement(tag, status);
1329
1330    for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
1331        p = bi.following(i);
1332        if (p != lastP) {
1333            if (p == RuleBasedBreakIterator::DONE) {
1334                break;
1335            }
1336            // We've reached a new break position.  Save it.
1337            td.fActualBreakPositions.addElement(p, status);  // Save result.
1338            tag = bi.getRuleStatus();
1339            td.fActualTags.addElement(tag, status);
1340            lastP = p;
1341        }
1342    }
1343    // The loop normally exits by means of the break in the middle.
1344    // Make sure that the index was at the correct position for the break iterator to have
1345    //   returned DONE.
1346    if (i != td.fDataToBreak.length()) {
1347        errln("testFollowing():  iterator returned DONE prematurely.");
1348    }
1349
1350    // Full check of all results.
1351    td.checkResults("testFollowing", this);
1352}
1353
1354
1355
1356void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
1357    UErrorCode  status = U_ZERO_ERROR;
1358    int32_t     p;
1359    int32_t     tag;
1360    int32_t     lastP  = 0x7ffffffe;
1361    int         i;
1362
1363    logln("testPreceding():");
1364    bi.setText(td.fDataToBreak);
1365    td.clearResults();
1366
1367    p = bi.last();
1368    td.fActualBreakPositions.addElement(p, status);
1369    tag = bi.getRuleStatus();
1370    td.fActualTags.addElement(tag, status);
1371
1372    for (i = td.fDataToBreak.length(); i>=-1; i--) {
1373        p = bi.preceding(i);
1374        if (p != lastP) {
1375            if (p == RuleBasedBreakIterator::DONE) {
1376                break;
1377            }
1378            // We've reached a new break position.  Save it.
1379            td.fActualBreakPositions.insertElementAt(p, 0, status);
1380            lastP = p;
1381            tag = bi.getRuleStatus();
1382            td.fActualTags.insertElementAt(tag, 0, status);
1383        }
1384    }
1385    // The loop normally exits by means of the break in the middle.
1386    // Make sure that the index was at the correct position for the break iterator to have
1387    //   returned DONE.
1388    if (i != 0) {
1389        errln("testPreceding():  iterator returned DONE prematurely.");
1390    }
1391
1392    // Full check of all results.
1393    td.checkResults("testPreceding", this);
1394}
1395
1396
1397
1398void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
1399    UErrorCode  status = U_ZERO_ERROR;
1400    int         i;
1401    int32_t     tag;
1402
1403    logln("testIsBoundary():");
1404    bi.setText(td.fDataToBreak);
1405    td.clearResults();
1406
1407    for (i = 0; i <= td.fDataToBreak.length(); i++) {
1408        if (bi.isBoundary(i)) {
1409            td.fActualBreakPositions.addElement(i, status);  // Save result.
1410            tag = bi.getRuleStatus();
1411            td.fActualTags.addElement(tag, status);
1412        }
1413    }
1414    td.checkResults("testIsBoundary: ", this);
1415}
1416
1417
1418
1419void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
1420{
1421    iterator.setText(td.fDataToBreak);
1422
1423    RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
1424    int32_t offset = iterator.first();
1425    int32_t testOffset;
1426    int32_t count = 0;
1427
1428    logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
1429
1430    if (*testIterator != iterator)
1431        errln("clone() or operator!= failed: two clones compared unequal");
1432
1433    do {
1434        testOffset = testIterator->first();
1435        testOffset = testIterator->next(count);
1436        if (offset != testOffset)
1437            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1438
1439        if (offset != RuleBasedBreakIterator::DONE) {
1440            count++;
1441            offset = iterator.next();
1442
1443            if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
1444                errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
1445                if (count > 10000 || offset == -1) {
1446                    errln("operator== failed too many times. Stopping test.");
1447                    if (offset == -1) {
1448                        errln("Does (RuleBasedBreakIterator::DONE == -1)?");
1449                    }
1450                    return;
1451                }
1452            }
1453        }
1454    } while (offset != RuleBasedBreakIterator::DONE);
1455
1456    // now do it backwards...
1457    offset = iterator.last();
1458    count = 0;
1459
1460    do {
1461        testOffset = testIterator->last();
1462        testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
1463        if (offset != testOffset)
1464            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1465
1466        if (offset != RuleBasedBreakIterator::DONE) {
1467            count--;
1468            offset = iterator.previous();
1469        }
1470    } while (offset != RuleBasedBreakIterator::DONE);
1471
1472    delete testIterator;
1473}
1474
1475
1476//---------------------------------------------
1477//
1478//     other tests
1479//
1480//---------------------------------------------
1481void RBBITest::TestEmptyString()
1482{
1483    UnicodeString text = "";
1484    UErrorCode status = U_ZERO_ERROR;
1485
1486    BITestData x(status);
1487    ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
1488    RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
1489    if (U_FAILURE(status))
1490    {
1491        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
1492        return;
1493    }
1494    generalIteratorTest(*bi, x);
1495    delete bi;
1496}
1497
1498void RBBITest::TestGetAvailableLocales()
1499{
1500    int32_t locCount = 0;
1501    const Locale* locList = BreakIterator::getAvailableLocales(locCount);
1502
1503    if (locCount == 0)
1504        dataerrln("getAvailableLocales() returned an empty list!");
1505    // Just make sure that it's returning good memory.
1506    int32_t i;
1507    for (i = 0; i < locCount; ++i) {
1508        logln(locList[i].getName());
1509    }
1510}
1511
1512//Testing the BreakIterator::getDisplayName() function
1513void RBBITest::TestGetDisplayName()
1514{
1515    UnicodeString   result;
1516
1517    BreakIterator::getDisplayName(Locale::getUS(), result);
1518    if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
1519        dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
1520                + result);
1521
1522    BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
1523    if (result != "French (France)")
1524        dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1525                + result);
1526}
1527/**
1528 * Test End Behaviour
1529 * @bug 4068137
1530 */
1531void RBBITest::TestEndBehaviour()
1532{
1533    UErrorCode status = U_ZERO_ERROR;
1534    UnicodeString testString("boo.");
1535    BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
1536    if (U_FAILURE(status))
1537    {
1538        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
1539        return;
1540    }
1541    wb->setText(testString);
1542
1543    if (wb->first() != 0)
1544        errln("Didn't get break at beginning of string.");
1545    if (wb->next() != 3)
1546        errln("Didn't get break before period in \"boo.\"");
1547    if (wb->current() != 4 && wb->next() != 4)
1548        errln("Didn't get break at end of string.");
1549    delete wb;
1550}
1551/*
1552 * @bug 4153072
1553 */
1554void RBBITest::TestBug4153072() {
1555    UErrorCode status = U_ZERO_ERROR;
1556    BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
1557    if (U_FAILURE(status))
1558    {
1559        errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
1560        return;
1561    }
1562    UnicodeString str("...Hello, World!...");
1563    int32_t begin = 3;
1564    int32_t end = str.length() - 3;
1565    UBool onBoundary;
1566
1567    StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
1568    iter->adoptText(textIterator);
1569    int index;
1570    // Note: with the switch to UText, there is no way to restrict the
1571    //       iteration range to begin at an index other than zero.
1572    //       String character iterators created with a non-zero bound are
1573    //         treated by RBBI as being empty.
1574    for (index = -1; index < begin + 1; ++index) {
1575        onBoundary = iter->isBoundary(index);
1576        if (index == 0?  !onBoundary : onBoundary) {
1577            errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
1578                            " and begin index = " + begin);
1579        }
1580    }
1581    delete iter;
1582}
1583
1584
1585//
1586// Test for problem reported by Ashok Matoria on 9 July 2007
1587//    One.<kSoftHyphen><kSpace>Two.
1588//
1589//    Sentence break at start (0) and then on calling next() it breaks at
1590//   'T' of "Two". Now, at this point if I do next() and
1591//    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
1592//
1593void RBBITest::TestBug5775() {
1594    UErrorCode status = U_ZERO_ERROR;
1595    BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1596    TEST_ASSERT_SUCCESS(status);
1597    if (U_FAILURE(status)) {
1598        return;
1599    }
1600// Check for status first for better handling of no data errors.
1601    TEST_ASSERT(bi != NULL);
1602    if (bi == NULL) {
1603        return;
1604    }
1605
1606    UnicodeString s("One.\\u00ad Two.", -1, US_INV);
1607    //               01234      56789
1608    s = s.unescape();
1609    bi->setText(s);
1610    int pos = bi->next();
1611    TEST_ASSERT(pos == 6);
1612    pos = bi->next();
1613    TEST_ASSERT(pos == 10);
1614    pos = bi->previous();
1615    TEST_ASSERT(pos == 6);
1616    delete bi;
1617}
1618
1619
1620
1621/**
1622 * Test Japanese Line Break
1623 * @bug 4095322
1624 */
1625void RBBITest::TestJapaneseLineBreak()
1626{
1627#if 0
1628    // Test needs updating some more...   Dump it for now.
1629
1630
1631    // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
1632    //        as opening and closing punctuation for line breaking.
1633    //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
1634    //        from these tests.    6-13-2002
1635    //
1636    UErrorCode status = U_ZERO_ERROR;
1637    UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
1638    UnicodeString precedingChars = CharsToUnicodeString(
1639        //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
1640        "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
1641    UnicodeString followingChars = CharsToUnicodeString(
1642        // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
1643        ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
1644        // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
1645        ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
1646        "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
1647    BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
1648
1649    int32_t i;
1650    if (U_FAILURE(status))
1651    {
1652        errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
1653        return;
1654    }
1655
1656    for (i = 0; i < precedingChars.length(); i++) {
1657        testString.setCharAt(1, precedingChars[i]);
1658        iter->setText(testString);
1659        int32_t j = iter->first();
1660        if (j != 0)
1661            errln("ja line break failure: failed to start at 0");
1662        j = iter->next();
1663        if (j != 1)
1664            errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
1665                        + "' (" + ((int)(precedingChars[i])) + ")");
1666        j = iter->next();
1667        if (j != 3)
1668            errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
1669                        + "' (" + ((int)(precedingChars[i])) + ")");
1670    }
1671
1672    for (i = 0; i < followingChars.length(); i++) {
1673        testString.setCharAt(1, followingChars[i]);
1674        iter->setText(testString);
1675        int j = iter->first();
1676        if (j != 0)
1677            errln("ja line break failure: failed to start at 0");
1678        j = iter->next();
1679        if (j != 2)
1680            errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
1681                        + "' (" + ((int)(followingChars[i])) + ")");
1682        j = iter->next();
1683        if (j != 3)
1684            errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
1685                        + "' (" + ((int)(followingChars[i])) + ")");
1686    }
1687    delete iter;
1688#endif
1689}
1690
1691
1692//------------------------------------------------------------------------------
1693//
//   RBBITest::TestExtended    Run RBBI tests from an external test data file
1695//
1696//------------------------------------------------------------------------------
1697
1698struct TestParams {
1699    BreakIterator   *bi;
1700    UnicodeString    dataToBreak;
1701    UVector32       *expectedBreaks;
1702    UVector32       *srcLine;
1703    UVector32       *srcCol;
1704};
1705
1706void RBBITest::executeTest(TestParams *t) {
1707    int32_t    bp;
1708    int32_t    prevBP;
1709    int32_t    i;
1710
1711    if (t->bi == NULL) {
1712        return;
1713    }
1714
1715    t->bi->setText(t->dataToBreak);
1716    //
1717    //  Run the iterator forward
1718    //
1719    prevBP = -1;
1720    for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1721        if (prevBP ==  bp) {
1722            // Fail for lack of forward progress.
1723            errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
1724                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1725            break;
1726        }
1727
1728        // Check that there were we didn't miss an expected break between the last one
1729        //  and this one.
1730        for (i=prevBP+1; i<bp; i++) {
1731            if (t->expectedBreaks->elementAti(i) != 0) {
1732                int expected[] = {0, i};
1733                printStringBreaks(t->dataToBreak, expected, 2);
1734                errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1735                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1736            }
1737        }
1738
1739        // Check that the break we did find was expected
1740        if (t->expectedBreaks->elementAti(bp) == 0) {
1741            int expected[] = {0, bp};
1742            printStringBreaks(t->dataToBreak, expected, 2);
1743            errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1744                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1745        } else {
1746            // The break was expected.
1747            //   Check that the {nnn} tag value is correct.
1748            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1749            if (expectedTagVal == -1) {
1750                expectedTagVal = 0;
1751            }
1752            int32_t line = t->srcLine->elementAti(bp);
1753            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1754            if (rs != expectedTagVal) {
1755                errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1756                      "          Actual, Expected status = %4d, %4d",
1757                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1758            }
1759        }
1760
1761
1762        prevBP = bp;
1763    }
1764
1765    // Verify that there were no missed expected breaks after the last one found
1766    for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
1767        if (t->expectedBreaks->elementAti(i) != 0) {
1768            errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1769                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1770        }
1771    }
1772
1773    //
1774    //  Run the iterator backwards, verify that the same breaks are found.
1775    //
1776    prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
1777    for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1778        if (prevBP ==  bp) {
1779            // Fail for lack of progress.
1780            errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1781                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1782            break;
1783        }
1784
1785        // Check that there were we didn't miss an expected break between the last one
1786        //  and this one.  (UVector returns zeros for index out of bounds.)
1787        for (i=prevBP-1; i>bp; i--) {
1788            if (t->expectedBreaks->elementAti(i) != 0) {
1789                errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1790                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1791            }
1792        }
1793
1794        // Check that the break we did find was expected
1795        if (t->expectedBreaks->elementAti(bp) == 0) {
1796            errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1797                   bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1798        } else {
1799            // The break was expected.
1800            //   Check that the {nnn} tag value is correct.
1801            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1802            if (expectedTagVal == -1) {
1803                expectedTagVal = 0;
1804            }
1805            int line = t->srcLine->elementAti(bp);
1806            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1807            if (rs != expectedTagVal) {
1808                errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1809                      "          Actual, Expected status = %4d, %4d",
1810                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1811            }
1812        }
1813
1814        prevBP = bp;
1815    }
1816
1817    // Verify that there were no missed breaks prior to the last one found
1818    for (i=prevBP-1; i>=0; i--) {
1819        if (t->expectedBreaks->elementAti(i) != 0) {
1820            errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1821                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1822        }
1823    }
1824}
1825
1826
1827void RBBITest::TestExtended() {
1828#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1829    UErrorCode      status  = U_ZERO_ERROR;
1830    Locale          locale("");
1831
1832    UnicodeString       rules;
1833    TestParams          tp;
1834    tp.bi             = NULL;
1835    tp.expectedBreaks = new UVector32(status);
1836    tp.srcLine        = new UVector32(status);
1837    tp.srcCol         = new UVector32(status);
1838
1839    RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
1840    if (U_FAILURE(status)) {
1841        dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1842    }
1843
1844
1845    //
1846    //  Open and read the test data file.
1847    //
1848    const char *testDataDirectory = IntlTest::getSourceTestData(status);
1849    char testFileName[1000];
1850    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1851        errln("Can't open test data.  Path too long.");
1852        return;
1853    }
1854    strcpy(testFileName, testDataDirectory);
1855    strcat(testFileName, "rbbitst.txt");
1856
1857    int    len;
1858    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1859    if (U_FAILURE(status)) {
1860        return; /* something went wrong, error already output */
1861    }
1862
1863
1864
1865
1866    //
1867    //  Put the test data into a UnicodeString
1868    //
1869    UnicodeString testString(FALSE, testFile, len);
1870
1871    enum EParseState{
1872        PARSE_COMMENT,
1873        PARSE_TAG,
1874        PARSE_DATA,
1875        PARSE_NUM
1876    }
1877    parseState = PARSE_TAG;
1878
1879    EParseState savedState = PARSE_TAG;
1880
1881    static const UChar CH_LF        = 0x0a;
1882    static const UChar CH_CR        = 0x0d;
1883    static const UChar CH_HASH      = 0x23;
1884    /*static const UChar CH_PERIOD    = 0x2e;*/
1885    static const UChar CH_LT        = 0x3c;
1886    static const UChar CH_GT        = 0x3e;
1887    static const UChar CH_BACKSLASH = 0x5c;
1888    static const UChar CH_BULLET    = 0x2022;
1889
1890    int32_t    lineNum  = 1;
1891    int32_t    colStart = 0;
1892    int32_t    column   = 0;
1893    int32_t    charIdx  = 0;
1894
1895    int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1896
1897    for (charIdx = 0; charIdx < len; ) {
1898        status = U_ZERO_ERROR;
1899        UChar  c = testString.charAt(charIdx);
1900        charIdx++;
1901        if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1902            // treat CRLF as a unit
1903            c = CH_LF;
1904            charIdx++;
1905        }
1906        if (c == CH_LF || c == CH_CR) {
1907            lineNum++;
1908            colStart = charIdx;
1909        }
1910        column = charIdx - colStart + 1;
1911
1912        switch (parseState) {
1913        case PARSE_COMMENT:
1914            if (c == 0x0a || c == 0x0d) {
1915                parseState = savedState;
1916            }
1917            break;
1918
1919        case PARSE_TAG:
1920            {
1921            if (c == CH_HASH) {
1922                parseState = PARSE_COMMENT;
1923                savedState = PARSE_TAG;
1924                break;
1925            }
1926            if (u_isUWhiteSpace(c)) {
1927                break;
1928            }
1929            if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1930                delete tp.bi;
1931                tp.bi = BreakIterator::createWordInstance(locale,  status);
1932                charIdx += 5;
1933                break;
1934            }
1935            if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1936                delete tp.bi;
1937                tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1938                charIdx += 5;
1939                break;
1940            }
1941            if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1942                delete tp.bi;
1943                tp.bi = BreakIterator::createLineInstance(locale,  status);
1944                charIdx += 5;
1945                break;
1946            }
1947            if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1948                delete tp.bi;
1949                tp.bi = NULL;
1950                tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1951                charIdx += 5;
1952                break;
1953            }
1954            if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1955                delete tp.bi;
1956                tp.bi = BreakIterator::createTitleInstance(locale,  status);
1957                charIdx += 6;
1958                break;
1959            }
1960
1961            // <locale  loc_name>
1962            localeMatcher.reset(testString);
1963            if (localeMatcher.lookingAt(charIdx-1, status)) {
1964                UnicodeString localeName = localeMatcher.group(1, status);
1965                char localeName8[100];
1966                localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1967                locale = Locale::createFromName(localeName8);
1968                charIdx += localeMatcher.group(0, status).length();
1969                TEST_ASSERT_SUCCESS(status);
1970                break;
1971            }
1972            if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1973                parseState = PARSE_DATA;
1974                charIdx += 5;
1975                tp.dataToBreak = "";
1976                tp.expectedBreaks->removeAllElements();
1977                tp.srcCol ->removeAllElements();
1978                tp.srcLine->removeAllElements();
1979                break;
1980            }
1981
1982            errln("line %d: Tag expected in test file.", lineNum);
1983            parseState = PARSE_COMMENT;
1984            savedState = PARSE_DATA;
1985            goto end_test; // Stop the test.
1986            }
1987            break;
1988
1989        case PARSE_DATA:
1990            if (c == CH_BULLET) {
1991                int32_t  breakIdx = tp.dataToBreak.length();
1992                tp.expectedBreaks->setSize(breakIdx+1);
1993                tp.expectedBreaks->setElementAt(-1, breakIdx);
1994                tp.srcLine->setSize(breakIdx+1);
1995                tp.srcLine->setElementAt(lineNum, breakIdx);
1996                tp.srcCol ->setSize(breakIdx+1);
1997                tp.srcCol ->setElementAt(column, breakIdx);
1998                break;
1999            }
2000
2001            if (testString.compare(charIdx-1, 7, "</data>") == 0) {
2002                // Add final entry to mappings from break location to source file position.
2003                //  Need one extra because last break position returned is after the
2004                //    last char in the data, not at the last char.
2005                tp.srcLine->addElement(lineNum, status);
2006                tp.srcCol ->addElement(column, status);
2007
2008                parseState = PARSE_TAG;
2009                charIdx += 6;
2010
2011                // RUN THE TEST!
2012                executeTest(&tp);
2013                break;
2014            }
2015
2016            if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
2017                // Named character, e.g. \N{COMBINING GRAVE ACCENT}
2018                // Get the code point from the name and insert it into the test data.
2019                //   (Damn, no API takes names in Unicode  !!!
2020                //    we've got to take it back to char *)
2021                int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
2022                int32_t nameLength = nameEndIdx - (charIdx+2);
2023                char charNameBuf[200];
2024                UChar32 theChar = -1;
2025                if (nameEndIdx != -1) {
2026                    UErrorCode status = U_ZERO_ERROR;
2027                    testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
2028                    charNameBuf[sizeof(charNameBuf)-1] = 0;
2029                    theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
2030                    if (U_FAILURE(status)) {
2031                        theChar = -1;
2032                    }
2033                }
2034                if (theChar == -1) {
2035                    errln("Error in named character in test file at line %d, col %d",
2036                        lineNum, column);
2037                } else {
2038                    // Named code point was recognized.  Insert it
2039                    //   into the test data.
2040                    tp.dataToBreak.append(theChar);
2041                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
2042                        tp.srcLine->addElement(lineNum, status);
2043                        tp.srcCol ->addElement(column, status);
2044                    }
2045                }
2046                if (nameEndIdx > charIdx) {
2047                    charIdx = nameEndIdx+1;
2048
2049                }
2050                break;
2051            }
2052
2053
2054
2055
2056            if (testString.compare(charIdx-1, 2, "<>") == 0) {
2057                charIdx++;
2058                int32_t  breakIdx = tp.dataToBreak.length();
2059                tp.expectedBreaks->setSize(breakIdx+1);
2060                tp.expectedBreaks->setElementAt(-1, breakIdx);
2061                tp.srcLine->setSize(breakIdx+1);
2062                tp.srcLine->setElementAt(lineNum, breakIdx);
2063                tp.srcCol ->setSize(breakIdx+1);
2064                tp.srcCol ->setElementAt(column, breakIdx);
2065                break;
2066            }
2067
2068            if (c == CH_LT) {
2069                tagValue   = 0;
2070                parseState = PARSE_NUM;
2071                break;
2072            }
2073
2074            if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
2075                parseState = PARSE_COMMENT;
2076                savedState = PARSE_DATA;
2077                break;
2078            }
2079
2080            if (c == CH_BACKSLASH) {
2081                // Check for \ at end of line, a line continuation.
2082                //     Advance over (discard) the newline
2083                UChar32 cp = testString.char32At(charIdx);
2084                if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
2085                    // We have a CR LF
2086                    //  Need an extra increment of the input ptr to move over both of them
2087                    charIdx++;
2088                }
2089                if (cp == CH_LF || cp == CH_CR) {
2090                    lineNum++;
2091                    colStart = charIdx;
2092                    charIdx++;
2093                    break;
2094                }
2095
2096                // Let unescape handle the back slash.
2097                cp = testString.unescapeAt(charIdx);
2098                if (cp != -1) {
2099                    // Escape sequence was recognized.  Insert the char
2100                    //   into the test data.
2101                    tp.dataToBreak.append(cp);
2102                    while (tp.dataToBreak.length() > tp.srcLine->size()) {
2103                        tp.srcLine->addElement(lineNum, status);
2104                        tp.srcCol ->addElement(column, status);
2105                    }
2106                    break;
2107                }
2108
2109
2110                // Not a recognized backslash escape sequence.
2111                // Take the next char as a literal.
2112                //  TODO:  Should this be an error?
2113                c = testString.charAt(charIdx);
2114                charIdx = testString.moveIndex32(charIdx, 1);
2115            }
2116
2117            // Normal, non-escaped data char.
2118            tp.dataToBreak.append(c);
2119
2120            // Save the mapping from offset in the data to line/column numbers in
2121            //   the original input file.  Will be used for better error messages only.
2122            //   If there's an expected break before this char, the slot in the mapping
2123            //     vector will already be set for this char; don't overwrite it.
2124            if (tp.dataToBreak.length() > tp.srcLine->size()) {
2125                tp.srcLine->addElement(lineNum, status);
2126                tp.srcCol ->addElement(column, status);
2127            }
2128            break;
2129
2130
2131        case PARSE_NUM:
2132            // We are parsing an expected numeric tag value, like <1234>,
2133            //   within a chunk of data.
2134            if (u_isUWhiteSpace(c)) {
2135                break;
2136            }
2137
2138            if (c == CH_GT) {
2139                // Finished the number.  Add the info to the expected break data,
2140                //   and switch parse state back to doing plain data.
2141                parseState = PARSE_DATA;
2142                if (tagValue == 0) {
2143                    tagValue = -1;
2144                }
2145                int32_t  breakIdx = tp.dataToBreak.length();
2146                tp.expectedBreaks->setSize(breakIdx+1);
2147                tp.expectedBreaks->setElementAt(tagValue, breakIdx);
2148                tp.srcLine->setSize(breakIdx+1);
2149                tp.srcLine->setElementAt(lineNum, breakIdx);
2150                tp.srcCol ->setSize(breakIdx+1);
2151                tp.srcCol ->setElementAt(column, breakIdx);
2152                break;
2153            }
2154
2155            if (u_isdigit(c)) {
2156                tagValue = tagValue*10 + u_charDigitValue(c);
2157                break;
2158            }
2159
2160            errln("Syntax Error in test file at line %d, col %d",
2161                lineNum, column);
2162            parseState = PARSE_COMMENT;
2163            goto end_test; // Stop the test
2164            break;
2165        }
2166
2167
2168        if (U_FAILURE(status)) {
2169            errln("ICU Error %s while parsing test file at line %d.",
2170                u_errorName(status), lineNum);
2171            status = U_ZERO_ERROR;
2172            goto end_test; // Stop the test
2173        }
2174
2175    }
2176
2177end_test:
2178    delete tp.bi;
2179    delete tp.expectedBreaks;
2180    delete tp.srcLine;
2181    delete tp.srcCol;
2182    delete [] testFile;
2183#endif
2184}
2185
2186void RBBITest::TestThaiBreaks() {
2187    UErrorCode status=U_ZERO_ERROR;
2188    BreakIterator* b;
2189    Locale locale = Locale("th");
2190    int32_t p, index;
2191    UChar c[]= {
2192            0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
2193            0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
2194            0x0E16, 0x0E49, 0x0E33, 0x0000
2195    };
2196    int32_t expectedWordResult[] = {
2197            2, 3, 6, 10, 11, 15, 17, 20, 22
2198    };
2199    int32_t expectedLineResult[] = {
2200            3, 6, 11, 15, 17, 20, 22
2201    };
2202
2203    int32_t size = u_strlen(c);
2204    UnicodeString text=UnicodeString(c);
2205
2206    b = BreakIterator::createWordInstance(locale, status);
2207    if (U_FAILURE(status)) {
2208        errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
2209        return;
2210    }
2211    b->setText(text);
2212    p = index = 0;
2213    while ((p=b->next())!=BreakIterator::DONE && p < size) {
2214        if (p != expectedWordResult[index++]) {
2215            errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
2216        }
2217    }
2218    delete b;
2219
2220    b = BreakIterator::createLineInstance(locale, status);
2221    if (U_FAILURE(status)) {
2222        printf("Unable to create thai line break iterator.\n");
2223        return;
2224    }
2225    b->setText(text);
2226    p = index = 0;
2227    while ((p=b->next())!=BreakIterator::DONE && p < size) {
2228        if (p != expectedLineResult[index++]) {
2229            errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
2230        }
2231    }
2232
2233    delete b;
2234}
2235
// Word breaks (UBRK_WORD) for locale "en_US_POSIX".
// The tailoring keeps colon and period out of words (cldrbug #1969), so it
// has boundaries around ':' and '.' that root does not have.
// TOffsets = boundaries expected from the tailored iterator,
// ROffsets = boundaries expected from root; gaps mark tailored-only boundaries.
static const char posxWordText[] =
    "Can't have breaks in xx:yy or struct.field for CS-types.";
static const int32_t posxWordTOffsets[] = {
     5,  6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27,
    29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56
};
static const int32_t posxWordROffsets[] = {
     5,  6, 10, 11, 17, 18, 20, 21,         26, 27,
    29, 30,         42, 43, 46, 47, 49, 50, 55, 56
};
2241
// Word breaks (UBRK_WORD) for locale "ja".
// Goal of the tailoring (cldrbug #2009): no breaks inside runs of hiragana or
// runs of ideographs, where ideographs include U+3005, U+3007 and U+303B.
static const char jaWordText[] =
    "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007"
    "\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
    "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F"
    "\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
#if 0
// Offsets for a separate Japanese tailoring (T) versus root (R); not in use.
static const int32_t jaWordTOffsets[] = {    2, 3,          7, 8, 14,         17, 18,     20, 21, 24,         27, 28 };
static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
#endif
// There is no separate Japanese word break iterator; root behaves the same as
// Japanese, so both expectation lists are identical.
// The dictionary-based iterator still has to be tweaked to better handle the
// characters named above and some other cases.
static const int32_t jaWordTOffsets[] = {
     1,  2,  3,  4,  5,  7,  8, 12, 13, 14, 15,
    17, 18, 20, 21, 22, 23, 24, 25, 27, 28
};
static const int32_t jaWordROffsets[] = {
     1,  2,  3,  4,  5,  7,  8, 12, 13, 14, 15,
    17, 18, 20, 21, 22, 23, 24, 25, 27, 28
};
2255
// Sentence breaks (UBRK_SENTENCE) for locale "el".
// The tailoring adds a break after the Greek question mark (cldrbug #2069).
// The text is split below at the boundaries expected from the tailored
// iterator; root is expected to omit the first two of them.
static const char elSentText[] =
    "\\u0391\\u03B2, \\u03B3\\u03B4; "
    "\\u0395 \\u03B6\\u03B7\\u037E "
    "\\u0398 \\u03B9\\u03BA. "
    "\\u039B\\u03BC \\u03BD\\u03BE! "
    "\\u039F\\u03C0, \\u03A1\\u03C2? "
    "\\u03A3";
static const int32_t elSentTOffsets[] = {
    8, 14, 20, 27, 35, 36
};
static const int32_t elSentROffsets[] = {
           20, 27, 35, 36
};
2262
// Character breaks (UBRK_CHARACTER) for locale "th".
// With the tailoring, clusters should not include spacing Thai/Lao vowels
// (prefix or postfix), except for [SARA] AM (cldrbug #2161).
// TOffsets = boundaries expected from the tailored iterator,
// ROffsets = boundaries expected from root; gaps mark tailored-only boundaries.
static const char thCharText[] =
    "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 "
    "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-"
    "\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) "
    "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 ";
static const int32_t thCharTOffsets[] = {
     1,  2,  3,  5,  6,  7,  8,  9, 10, 11,
    12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,
    29, 30, 32, 33, 35, 37, 38, 39, 40, 41
};
static const int32_t thCharROffsets[] = {
     1,      3,  5,  6,  7,  8,  9,     11,
    12, 13, 15,     17, 19, 20, 22,     24,     26, 27, 28,
    29,     32, 33, 35, 37, 38,     40, 41
};
2274
2275typedef struct {
2276    UBreakIteratorType  type;
2277    const char *        locale;
2278    const char *        escapedText;
2279    const int32_t *     tailoredOffsets;
2280    int32_t             tailoredOffsetsCount;
2281    const int32_t *     rootOffsets;
2282    int32_t             rootOffsetsCount;
2283} TailoredBreakItem;
2284
2285#define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0]))
2286
2287static const TailoredBreakItem tbItems[] = {
2288    { UBRK_WORD,      "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) },
2289    { UBRK_WORD,      "ja",          jaWordText,   ARRAY_PTR_LEN(jaWordTOffsets),   ARRAY_PTR_LEN(jaWordROffsets)   },
2290    { UBRK_SENTENCE,  "el",          elSentText,   ARRAY_PTR_LEN(elSentTOffsets),   ARRAY_PTR_LEN(elSentROffsets)   },
2291    { UBRK_CHARACTER, "th",          thCharText,   ARRAY_PTR_LEN(thCharTOffsets),   ARRAY_PTR_LEN(thCharROffsets)   },
2292    { UBRK_CHARACTER, NULL,          NULL,         NULL,0,                          NULL,0                          } // terminator
2293};
2294
// Format 'count' offsets into 'buffer' as " n1 n2 n3 ..." (each value is
// preceded by one space).
//
//   buffer   destination; always NUL-terminated on return when buflen > 0
//   buflen   capacity of buffer in bytes, including the terminating NUL
//   count    number of values to format; 0 yields an empty string
//   offsets  the values to format
//
// Output that does not fit is dropped: formatting stops before the first
// value that cannot be written completely, so the result never ends in a
// partial number.
static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) {
    if (buffer == NULL || buflen <= 0) {
        return;
    }
    *buffer = 0;    // callers print the buffer with %s even when count is 0
    while (count-- > 0) {
        int writeCount = snprintf(buffer, static_cast<size_t>(buflen), " %d", static_cast<int>(*offsets++));
        if (writeCount < 0 || writeCount >= buflen) {
            // Error or truncation: remove the partial entry and stop.
            *buffer = 0;
            break;
        }
        buffer += writeCount;
        buflen -= writeCount;
    }
}
2303
2304enum { kMaxOffsetCount = 128 };
2305
2306void RBBITest::TBTest(BreakIterator* brkitr, int type, const char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) {
2307    brkitr->setText( CharsToUnicodeString(escapedText) );
2308    int32_t foundOffsets[kMaxOffsetCount];
2309    int32_t offset, foundOffsetsCount = 0;
2310    // do forwards iteration test
2311    while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) {
2312        foundOffsets[foundOffsetsCount++] = offset;
2313    }
2314    if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) {
2315        // log error for forwards test
2316        char formatExpect[512], formatFound[512];
2317        formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
2318        formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets);
2319        errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n",
2320                type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound);
2321    } else {
2322        // do backwards iteration test
2323        --foundOffsetsCount; // back off one from the end offset
2324        while ( foundOffsetsCount > 0 ) {
2325            offset = brkitr->previous();
2326            if ( offset != foundOffsets[--foundOffsetsCount] ) {
2327                // log error for backwards test
2328                char formatExpect[512];
2329                formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
2330                errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n",
2331                        type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]);
2332                break;
2333            }
2334        }
2335    }
2336}
2337
2338void RBBITest::TestTailoredBreaks() {
2339    const TailoredBreakItem * tbItemPtr;
2340    Locale rootLocale = Locale("root");
2341    for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) {
2342        Locale testLocale = Locale(tbItemPtr->locale);
2343        BreakIterator * tailoredBrkiter = NULL;
2344        BreakIterator * rootBrkiter = NULL;
2345        UErrorCode status = U_ZERO_ERROR;
2346        switch (tbItemPtr->type) {
2347            case UBRK_CHARACTER:
2348                tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status);
2349                rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status);
2350                break;
2351            case UBRK_WORD:
2352                tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status);
2353                rootBrkiter = BreakIterator::createWordInstance(rootLocale, status);
2354                break;
2355            case UBRK_LINE:
2356                tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status);
2357                rootBrkiter = BreakIterator::createLineInstance(rootLocale, status);
2358                break;
2359            case UBRK_SENTENCE:
2360                tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status);
2361                rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status);
2362                break;
2363            default:
2364                status = U_UNSUPPORTED_ERROR;
2365                break;
2366        }
2367        if (U_FAILURE(status)) {
2368            errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status));
2369            continue;
2370        }
2371        TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount);
2372        TBTest(rootBrkiter,     (int)(tbItemPtr->type), "root",            tbItemPtr->escapedText, tbItemPtr->rootOffsets,     tbItemPtr->rootOffsetsCount);
2373
2374        delete rootBrkiter;
2375        delete tailoredBrkiter;
2376    }
2377}
2378
2379
2380//-------------------------------------------------------------------------------
2381//
2382//  TestDictRules   create a break iterator from source rules that includes a
2383//                  dictionary range.   Regression for bug #7130.  Source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
//                  but the dictionary code, without a type, would loop.
2386//
2387//-------------------------------------------------------------------------------
2388void RBBITest::TestDictRules() {
2389    const char *rules =  "$dictionary = [a-z]; \n"
2390                         "!!forward; \n"
2391                         "$dictionary $dictionary; \n"
2392                         "!!reverse; \n"
2393                         "$dictionary $dictionary; \n";
2394    const char *text = "aa";
2395    UErrorCode status = U_ZERO_ERROR;
2396    UParseError parseError;
2397
2398    RuleBasedBreakIterator bi(rules, parseError, status);
2399    if (U_SUCCESS(status)) {
2400        UnicodeString utext = text;
2401        bi.setText(utext);
2402        int32_t position;
2403        int32_t loops;
2404        for (loops = 0; loops<10; loops++) {
2405            position = bi.next();
2406            if (position == RuleBasedBreakIterator::DONE) {
2407                break;
2408            }
2409        }
2410        TEST_ASSERT(loops == 1);
2411    } else {
2412        dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
2413    }
2414}
2415
2416
2417
2418//-------------------------------------------------------------------------------
2419//
2420//    ReadAndConvertFile   Read a text data file, convert it to UChars, and
//    return the data in one big UChar * buffer, which the caller must delete.
2422//
2423//    parameters:
2424//          fileName:   the name of the file, with no directory part.  The test data directory
2425//                      is assumed.
2426//          ulen        an out parameter, receives the actual length (in UChars) of the file data.
2427//          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
2428//                      specified here.  The BOM, if it exists, will be stripped from the returned data.
2429//                      Pass NULL for the system default encoding.
2430//          status
2431//    returns:
2432//                      The file data, converted to UChar.
2433//                      The caller must delete this when done with
2434//                           delete [] theBuffer;
2435//
2436//    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
2437//           Move this function to some common place.
2438//
2439//--------------------------------------------------------------------------------
2440UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
2441    UChar       *retPtr  = NULL;
2442    char        *fileBuf = NULL;
2443    UConverter* conv     = NULL;
2444    FILE        *f       = NULL;
2445
2446    ulen = 0;
2447    if (U_FAILURE(status)) {
2448        return retPtr;
2449    }
2450
2451    //
2452    //  Open the file.
2453    //
2454    f = fopen(fileName, "rb");
2455    if (f == 0) {
2456        dataerrln("Error opening test data file %s\n", fileName);
2457        status = U_FILE_ACCESS_ERROR;
2458        return NULL;
2459    }
2460    //
2461    //  Read it in
2462    //
2463    int   fileSize;
2464    int   amt_read;
2465
2466    fseek( f, 0, SEEK_END);
2467    fileSize = ftell(f);
2468    fileBuf = new char[fileSize];
2469    fseek(f, 0, SEEK_SET);
2470    amt_read = fread(fileBuf, 1, fileSize, f);
2471    if (amt_read != fileSize || fileSize <= 0) {
2472        errln("Error reading test data file.");
2473        goto cleanUpAndReturn;
2474    }
2475
2476    //
2477    // Look for a Unicode Signature (BOM) on the data just read
2478    //
2479    int32_t        signatureLength;
2480    const char *   fileBufC;
2481    const char*    bomEncoding;
2482
2483    fileBufC = fileBuf;
2484    bomEncoding = ucnv_detectUnicodeSignature(
2485        fileBuf, fileSize, &signatureLength, &status);
2486    if(bomEncoding!=NULL ){
2487        fileBufC  += signatureLength;
2488        fileSize  -= signatureLength;
2489        encoding = bomEncoding;
2490    }
2491
2492    //
2493    // Open a converter to take the rule file to UTF-16
2494    //
2495    conv = ucnv_open(encoding, &status);
2496    if (U_FAILURE(status)) {
2497        goto cleanUpAndReturn;
2498    }
2499
2500    //
2501    // Convert the rules to UChar.
2502    //  Preflight first to determine required buffer size.
2503    //
2504    ulen = ucnv_toUChars(conv,
2505        NULL,           //  dest,
2506        0,              //  destCapacity,
2507        fileBufC,
2508        fileSize,
2509        &status);
2510    if (status == U_BUFFER_OVERFLOW_ERROR) {
2511        // Buffer Overflow is expected from the preflight operation.
2512        status = U_ZERO_ERROR;
2513
2514        retPtr = new UChar[ulen+1];
2515        ucnv_toUChars(conv,
2516            retPtr,       //  dest,
2517            ulen+1,
2518            fileBufC,
2519            fileSize,
2520            &status);
2521    }
2522
2523cleanUpAndReturn:
2524    fclose(f);
2525    delete []fileBuf;
2526    ucnv_close(conv);
2527    if (U_FAILURE(status)) {
2528        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
2529        delete retPtr;
2530        retPtr = 0;
2531        ulen   = 0;
2532    };
2533    return retPtr;
2534}
2535
2536
2537
2538//--------------------------------------------------------------------------------------------
2539//
2540//   Run tests from each of the boundary test data files distributed by the Unicode Consortium
2541//
2542//-------------------------------------------------------------------------------------------
2543void RBBITest::TestUnicodeFiles() {
2544    RuleBasedBreakIterator  *bi;
2545    UErrorCode               status = U_ZERO_ERROR;
2546
2547    bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2548    TEST_ASSERT_SUCCESS(status);
2549    if (U_SUCCESS(status)) {
2550        runUnicodeTestData("GraphemeBreakTest.txt", bi);
2551    }
2552    delete bi;
2553
2554    bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
2555    TEST_ASSERT_SUCCESS(status);
2556    if (U_SUCCESS(status)) {
2557        runUnicodeTestData("WordBreakTest.txt", bi);
2558    }
2559    delete bi;
2560
2561    bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
2562    TEST_ASSERT_SUCCESS(status);
2563    if (U_SUCCESS(status)) {
2564        runUnicodeTestData("SentenceBreakTest.txt", bi);
2565    }
2566    delete bi;
2567
2568    bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
2569    TEST_ASSERT_SUCCESS(status);
2570    if (U_SUCCESS(status)) {
2571        runUnicodeTestData("LineBreakTest.txt", bi);
2572    }
2573    delete bi;
2574}
2575
2576
2577//--------------------------------------------------------------------------------------------
2578//
2579//   Run tests from one of the boundary test data files distributed by the Unicode Consortium
2580//
2581//-------------------------------------------------------------------------------------------
2582void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
2583#if !UCONFIG_NO_REGULAR_EXPRESSIONS
2584// TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb.
2585  UVersionInfo icu4601 = { 4, 6, 0, 1 };
2586UBool isICUVersionPast46 = isICUVersionAtLeast(icu4601);
2587UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
2588    UErrorCode  status = U_ZERO_ERROR;
2589
2590    //
2591    //  Open and read the test data file, put it into a UnicodeString.
2592    //
2593    const char *testDataDirectory = IntlTest::getSourceTestData(status);
2594    char testFileName[1000];
2595    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
2596        dataerrln("Can't open test data.  Path too long.");
2597        return;
2598    }
2599    strcpy(testFileName, testDataDirectory);
2600    strcat(testFileName, fileName);
2601
2602    logln("Opening data file %s\n", fileName);
2603
2604    int    len;
2605    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
2606    if (status != U_FILE_ACCESS_ERROR) {
2607        TEST_ASSERT_SUCCESS(status);
2608        TEST_ASSERT(testFile != NULL);
2609    }
2610    if (U_FAILURE(status) || testFile == NULL) {
2611        return; /* something went wrong, error already output */
2612    }
2613    UnicodeString testFileAsString(TRUE, testFile, len);
2614
2615    //
2616    //  Parse the test data file using a regular expression.
2617    //  Each kind of token is recognized in its own capture group; what type of item was scanned
2618    //     is identified by which group had a match.
2619    //
2620    //    Caputure Group #                  1          2            3            4           5
2621    //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
2622    //
2623    UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
2624    RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
2625    UnicodeString   testString;
2626    UVector32       breakPositions(status);
2627    int             lineNumber = 1;
2628    TEST_ASSERT_SUCCESS(status);
2629    if (U_FAILURE(status)) {
2630        return;
2631    }
2632
2633    //
2634    //  Scan through each test case, building up the string to be broken in testString,
2635    //   and the positions that should be boundaries in the breakPositions vector.
2636    //
2637    int spin = 0;
2638    while (tokenMatcher.find()) {
2639      	if(tokenMatcher.hitEnd()) {
2640          /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
2641             This occurred when the text file was corrupt (wasn't marked as UTF-8)
2642             and caused an infinite loop here on EBCDIC systems!
2643          */
2644          fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
2645          //	   return;
2646      	}
2647        if (tokenMatcher.start(1, status) >= 0) {
2648            // Scanned a divide sign, indicating a break position in the test data.
2649            if (testString.length()>0) {
2650                breakPositions.addElement(testString.length(), status);
2651            }
2652        }
2653        else if (tokenMatcher.start(2, status) >= 0) {
2654            // Scanned an 'x', meaning no break at this position in the test data
2655            //   Nothing to be done here.
2656            }
2657        else if (tokenMatcher.start(3, status) >= 0) {
2658            // Scanned Hex digits.  Convert them to binary, append to the character data string.
2659            const UnicodeString &hexNumber = tokenMatcher.group(3, status);
2660            int length = hexNumber.length();
2661            if (length<=8) {
2662                char buf[10];
2663                hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
2664                UChar32 c = (UChar32)strtol(buf, NULL, 16);
2665                if (c<=0x10ffff) {
2666                    testString.append(c);
2667                } else {
2668                    errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
2669                       fileName, lineNumber);
2670                }
2671            } else {
2672                errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
2673                       fileName, lineNumber);
2674             }
2675        }
2676        else if (tokenMatcher.start(4, status) >= 0) {
2677            // Scanned to end of a line, possibly skipping over a comment in the process.
2678            //   If the line from the file contained test data, run the test now.
2679            //
2680            if (testString.length() > 0) {
2681// TODO(andy): Remove this time bomb code.
2682if (!isLineBreak || isICUVersionPast46 || !(4658 <= lineNumber && lineNumber <= 4758)) {
2683                checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
2684}
2685            }
2686
2687            // Clear out this test case.
2688            //    The string and breakPositions vector will be refilled as the next
2689            //       test case is parsed.
2690            testString.remove();
2691            breakPositions.removeAllElements();
2692            lineNumber++;
2693        } else {
2694            // Scanner catchall.  Something unrecognized appeared on the line.
2695            char token[16];
2696            UnicodeString uToken = tokenMatcher.group(0, status);
2697            uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
2698            token[sizeof(token)-1] = 0;
2699            errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
2700
2701            // Clean up, in preparation for continuing with the next line.
2702            testString.remove();
2703            breakPositions.removeAllElements();
2704            lineNumber++;
2705        }
2706        TEST_ASSERT_SUCCESS(status);
2707        if (U_FAILURE(status)) {
2708            break;
2709        }
2710    }
2711
2712    delete [] testFile;
2713 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
2714}
2715
2716//--------------------------------------------------------------------------------------------
2717//
2718//   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
2719//                            test data files.  Do only a simple, forward-only check -
2720//                            this test is mostly to check that ICU and the Unicode
2721//                            data agree with each other.
2722//
2723//--------------------------------------------------------------------------------------------
2724void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
2725                         const UnicodeString &testString,   // Text data to be broken
2726                         UVector32 *breakPositions,         // Positions where breaks should be found.
2727                         RuleBasedBreakIterator *bi) {
2728    int32_t pos;                 // Break Position in the test string
2729    int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
2730    int32_t expectedPos;         // Expected break position (index into test string)
2731
2732    bi->setText(testString);
2733    pos = bi->first();
2734    pos = bi->next();
2735
2736    while (pos != BreakIterator::DONE) {
2737        if (expectedI >= breakPositions->size()) {
2738            errln("Test file \"%s\", line %d, unexpected break found at position %d",
2739                testFileName, lineNumber, pos);
2740            break;
2741        }
2742        expectedPos = breakPositions->elementAti(expectedI);
2743        if (pos < expectedPos) {
2744            errln("Test file \"%s\", line %d, unexpected break found at position %d",
2745                testFileName, lineNumber, pos);
2746            break;
2747        }
2748        if (pos > expectedPos) {
2749            errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2750                testFileName, lineNumber, expectedPos);
2751            break;
2752        }
2753        pos = bi->next();
2754        expectedI++;
2755    }
2756
2757    if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
2758        errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2759            testFileName, lineNumber, breakPositions->elementAti(expectedI));
2760    }
2761}
2762
2763
2764
2765#if !UCONFIG_NO_REGULAR_EXPRESSIONS
2766//---------------------------------------------------------------------------------------
2767//
//   class RBBIMonkeyKind
2769//
2770//      Monkey Test for Break Iteration
2771//      Abstract interface class.   Concrete derived classes independently
2772//      implement the break rules for different iterator types.
2773//
//      The Monkey Test itself doesn't know which type of break iterator it is
2775//      testing, but works purely in terms of the interface defined here.
2776//
2777//---------------------------------------------------------------------------------------
2778class RBBIMonkeyKind {
2779public:
2780    // Return a UVector of UnicodeSets, representing the character classes used
2781    //   for this type of iterator.
2782    virtual  UVector  *charClasses() = 0;
2783
2784    // Set the test text on which subsequent calls to next() will operate
2785    virtual  void      setText(const UnicodeString &s) = 0;
2786
2787    // Find the next break postion, starting from the prev break position, or from zero.
2788    // Return -1 after reaching end of string.
2789    virtual  int32_t   next(int32_t i) = 0;
2790
2791    virtual ~RBBIMonkeyKind();
2792    UErrorCode       deferredStatus;
2793
2794
2795protected:
2796    RBBIMonkeyKind();
2797
2798private:
2799};
2800
2801RBBIMonkeyKind::RBBIMonkeyKind() {
2802    deferredStatus = U_ZERO_ERROR;
2803}
2804
2805RBBIMonkeyKind::~RBBIMonkeyKind() {
2806}
2807
2808
2809//----------------------------------------------------------------------------------------
2810//
2811//   Random Numbers.  Similar to standard lib rand() and srand()
2812//                    Not using library to
2813//                      1.  Get same results on all platforms.
2814//                      2.  Get access to current seed, to more easily reproduce failures.
2815//
2816//---------------------------------------------------------------------------------------
// Current generator state; assign to it to reproduce a particular sequence.
static uint32_t m_seed = 1;

// Advance the generator by one step and return the next value, in the range 0..32767.
static uint32_t m_rand()
{
    const uint32_t nextState = m_seed * 1103515245 + 12345;
    m_seed = nextState;
    // Bits 16..30 of the state: the same value as (state / 65536) % 32768.
    return (nextState >> 16) & 0x7fff;
}
2824
2825
2826//------------------------------------------------------------------------------------------
2827//
2828//   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
2829//                             of RBBIMonkeyKind.
2830//
2831//------------------------------------------------------------------------------------------
// Monkey-test reference implementation for character (grapheme cluster) boundaries.
// The member functions are defined out of line below.
class RBBICharMonkey: public RBBIMonkeyKind {
public:
    RBBICharMonkey();
    virtual          ~RBBICharMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual  int32_t  next(int32_t i);
private:
    // The list returned by charClasses(); filled in by the constructor.
    UVector   *fSets;

    // Character classes consulted by next().  All are allocated by the
    // constructor and deleted by the destructor.
    UnicodeSet  *fCRLFSet;
    UnicodeSet  *fControlSet;
    UnicodeSet  *fExtendSet;
    UnicodeSet  *fPrependSet;
    UnicodeSet  *fSpacingSet;
    UnicodeSet  *fLSet;
    UnicodeSet  *fVSet;
    UnicodeSet  *fTSet;
    UnicodeSet  *fLVSet;
    UnicodeSet  *fLVTSet;
    UnicodeSet  *fHangulSet;    // union of the L, V, T, LV and LVT sets
    UnicodeSet  *fAnySet;       // every code point, U+0000..U+10FFFF

    // Text under test.  Points at the caller's string passed to setText(); not owned.
    const UnicodeString *fText;
};
2857
2858
2859RBBICharMonkey::RBBICharMonkey() {
2860    UErrorCode  status = U_ZERO_ERROR;
2861
2862    fText = NULL;
2863
2864    fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2865    fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
2866    fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
2867    fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2868    fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2869    fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2870    fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2871    fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2872    fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2873    fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2874    fHangulSet  = new UnicodeSet();
2875    fHangulSet->addAll(*fLSet);
2876    fHangulSet->addAll(*fVSet);
2877    fHangulSet->addAll(*fTSet);
2878    fHangulSet->addAll(*fLVSet);
2879    fHangulSet->addAll(*fLVTSet);
2880    fAnySet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);
2881
2882    fSets       = new UVector(status);
2883    fSets->addElement(fCRLFSet,    status);
2884    fSets->addElement(fControlSet, status);
2885    fSets->addElement(fExtendSet,  status);
2886    fSets->addElement(fPrependSet, status);
2887    fSets->addElement(fSpacingSet, status);
2888    fSets->addElement(fHangulSet,  status);
2889    fSets->addElement(fAnySet,     status);
2890    if (U_FAILURE(status)) {
2891        deferredStatus = status;
2892    }
2893}
2894
2895
// Set the text examined by next().  Only the address of s is stored, not a copy,
// so s must remain valid for as long as next() is called on it.
void RBBICharMonkey::setText(const UnicodeString &s) {
    fText = &s;
}
2899
2900
2901
// Return the first break position after prevPos in the text given to setText(),
// found by applying the grapheme cluster rules below in order.
// Returns -1 if the constructor recorded a failure in deferredStatus, or if
// prevPos is at or beyond the end of the text.
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;

    // Loop runs once per "significant" character position in the input text.
    //   'continue' means "no break before p2, keep scanning";
    //   'break' means "p2 is the break position".
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one codepoint
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  GB3   CR x LF
        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            continue;
        }

        // Rule (GB4).   ( Control | CR | LF ) <break>
        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
            break;
        }

        // Rule (GB5)    <break>  ( Control | CR | LF )
        //
        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            break;
        }


        // Rule (GB6)  L x ( L | V | LV | LVT )
        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            continue;
        }

        // Rule (GB7)    ( LV | V )  x  ( V | T )
        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            continue;
        }

        // Rule (GB8)    ( LVT | T)  x T
        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            continue;
        }

        // Rule (GB9)    x  Extend
        if (fExtendSet->contains(c2))  {
            continue;
        }

        // Rule (GB9a)   x  SpacingMark
        if (fSpacingSet->contains(c2)) {
            continue;
        }

        // Rule (GB9b)   Prepend x
        if (fPrependSet->contains(c1)) {
            continue;
        }

        // Rule (GB10)  Any  <break>  Any
        break;
    }

    breakPos = p2;
    return breakPos;
}
3010
3011
3012
// Return the list of character classes built by the constructor.  The returned
// pointer is the object's own fSets member, which the destructor deletes.
UVector  *RBBICharMonkey::charClasses() {
    return fSets;
}
3016
3017
3018RBBICharMonkey::~RBBICharMonkey() {
3019    delete fSets;
3020    delete fCRLFSet;
3021    delete fControlSet;
3022    delete fExtendSet;
3023    delete fPrependSet;
3024    delete fSpacingSet;
3025    delete fLSet;
3026    delete fVSet;
3027    delete fTSet;
3028    delete fLVSet;
3029    delete fLVTSet;
3030    delete fHangulSet;
3031    delete fAnySet;
3032}
3033
3034//------------------------------------------------------------------------------------------
3035//
3036//   class RBBIWordMonkey      Word Break specific implementation
3037//                             of RBBIMonkeyKind.
3038//
3039//------------------------------------------------------------------------------------------
// Monkey-test reference implementation for word boundaries.
// The member functions are defined out of line below.
class RBBIWordMonkey: public RBBIMonkeyKind {
public:
    RBBIWordMonkey();
    virtual          ~RBBIWordMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual int32_t   next(int32_t i);
private:
    // The list of character classes; filled in by the constructor.
    UVector      *fSets;

    // Word_Break character classes, allocated by the constructor.
    UnicodeSet  *fCRSet;
    UnicodeSet  *fLFSet;
    UnicodeSet  *fNewlineSet;
    UnicodeSet  *fKatakanaSet;
    UnicodeSet  *fALetterSet;
    // TODO(jungshik): Do we still need this change?
    // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
    UnicodeSet  *fMidNumLetSet;
    UnicodeSet  *fMidLetterSet;
    UnicodeSet  *fMidNumSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fOtherSet;
    UnicodeSet  *fExtendSet;
    UnicodeSet  *fExtendNumLetSet;
    // Hangul syllables, Han and Hiragana; the constructor removes these from
    // fALetterSet and fOtherSet.
    UnicodeSet  *fDictionaryCjkSet;

    // NOTE(review): not assigned anywhere in the constructor -- confirm whether
    // this member is still used.
    RegexMatcher  *fMatcher;

    // Text under test.  Points at the caller's string passed to setText(); not owned.
    const UnicodeString  *fText;
};
3071
3072
3073RBBIWordMonkey::RBBIWordMonkey()
3074{
3075    UErrorCode  status = U_ZERO_ERROR;
3076
3077    fSets            = new UVector(status);
3078
3079    fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
3080    fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
3081    fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
3082    fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
3083    // Exclude Hangul syllables from ALetterSet during testing.
3084    // Leave CJK dictionary characters out from the monkey tests!
3085#if 0
3086    fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
3087                                      "[\\p{Line_Break = Complex_Context}"
3088                                      "-\\p{Grapheme_Cluster_Break = Extend}"
3089                                      "-\\p{Grapheme_Cluster_Break = Control}"
3090                                      "]]",
3091                                      status);
3092#endif
3093    fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
3094    fALetterSet->removeAll(*fDictionaryCjkSet);
3095    fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
3096    fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
3097    fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
3098    fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
3099    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"),      status);
3100    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
3101    fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
3102    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
3103
3104    fOtherSet        = new UnicodeSet();
3105    if(U_FAILURE(status)) {
3106      deferredStatus = status;
3107      return;
3108    }
3109
3110    fOtherSet->complement();
3111    fOtherSet->removeAll(*fCRSet);
3112    fOtherSet->removeAll(*fLFSet);
3113    fOtherSet->removeAll(*fNewlineSet);
3114    fOtherSet->removeAll(*fKatakanaSet);
3115    fOtherSet->removeAll(*fALetterSet);
3116    fOtherSet->removeAll(*fMidLetterSet);
3117    fOtherSet->removeAll(*fMidNumSet);
3118    fOtherSet->removeAll(*fNumericSet);
3119    fOtherSet->removeAll(*fExtendNumLetSet);
3120    fOtherSet->removeAll(*fFormatSet);
3121    fOtherSet->removeAll(*fExtendSet);
3122    // Inhibit dictionary characters from being tested at all.
3123    fOtherSet->removeAll(*fDictionaryCjkSet);
3124    fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
3125
3126    fSets->addElement(fCRSet,        status);
3127    fSets->addElement(fLFSet,        status);
3128    fSets->addElement(fNewlineSet,   status);
3129    fSets->addElement(fALetterSet,   status);
3130    //fSets->addElement(fKatakanaSet,  status); //TODO: work out how to test katakana
3131    fSets->addElement(fMidLetterSet, status);
3132    fSets->addElement(fMidNumLetSet, status);
3133    fSets->addElement(fMidNumSet,    status);
3134    fSets->addElement(fNumericSet,   status);
3135    fSets->addElement(fFormatSet,    status);
3136    fSets->addElement(fExtendSet,    status);
3137    fSets->addElement(fOtherSet,     status);
3138    fSets->addElement(fExtendNumLetSet, status);
3139
3140    if (U_FAILURE(status)) {
3141        deferredStatus = status;
3142    }
3143}
3144
3145void RBBIWordMonkey::setText(const UnicodeString &s) {
3146    fText       = &s;
3147}
3148
3149
3150int32_t RBBIWordMonkey::next(int32_t prevPos) {
3151    int    p0, p1, p2, p3;    // Indices of the significant code points around the
3152                              //   break position being tested.  The candidate break
3153                              //   location is before p2.
3154
3155    int     breakPos = -1;
3156
3157    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
3158
3159    if (U_FAILURE(deferredStatus)) {
3160        return -1;
3161    }
3162
3163    // Prev break at end of string.  return DONE.
3164    if (prevPos >= fText->length()) {
3165        return -1;
3166    }
3167    p0 = p1 = p2 = p3 = prevPos;
3168    c3 =  fText->char32At(prevPos);
3169    c0 = c1 = c2 = 0;
3170
3171    // Loop runs once per "significant" character position in the input text.
3172    for (;;) {
3173        // Move all of the positions forward in the input string.
3174        p0 = p1;  c0 = c1;
3175        p1 = p2;  c1 = c2;
3176        p2 = p3;  c2 = c3;
3177
3178        // Advancd p3 by    X(Extend | Format)*   Rule 4
3179        //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
3180        do {
3181            p3 = fText->moveIndex32(p3, 1);
3182            c3 = fText->char32At(p3);
3183            if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
3184               break;
3185            };
3186        }
3187        while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
3188
3189
3190        if (p1 == p2) {
3191            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
3192            continue;
3193        }
3194        if (p2 == fText->length()) {
3195            // Reached end of string.  Always a break position.
3196            break;
3197        }
3198
3199        // Rule  (3)   CR x LF
3200        //     No Extend or Format characters may appear between the CR and LF,
3201        //     which requires the additional check for p2 immediately following p1.
3202        //
3203        if (c1==0x0D && c2==0x0A) {
3204            continue;
3205        }
3206
3207        // Rule (3a)  Break before and after newlines (including CR and LF)
3208        //
3209        if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
3210            break;
3211        };
3212        if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
3213            break;
3214        };
3215
3216        // Rule (5).   ALetter x ALetter
3217        if (fALetterSet->contains(c1) &&
3218            fALetterSet->contains(c2))  {
3219            continue;
3220        }
3221
3222        // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
3223        //
3224        if ( fALetterSet->contains(c1)   &&
3225             (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
3226             fALetterSet->contains(c3)) {
3227            continue;
3228        }
3229
3230
3231        // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
3232        if (fALetterSet->contains(c0) &&
3233            (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
3234            fALetterSet->contains(c2)) {
3235            continue;
3236        }
3237
3238        // Rule (8)    Numeric x Numeric
3239        if (fNumericSet->contains(c1) &&
3240            fNumericSet->contains(c2))  {
3241            continue;
3242        }
3243
3244        // Rule (9)    ALetter x Numeric
3245        if (fALetterSet->contains(c1) &&
3246            fNumericSet->contains(c2))  {
3247            continue;
3248        }
3249
3250        // Rule (10)    Numeric x ALetter
3251        if (fNumericSet->contains(c1) &&
3252            fALetterSet->contains(c2))  {
3253            continue;
3254        }
3255
3256        // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
3257        if (fNumericSet->contains(c0) &&
3258            (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
3259            fNumericSet->contains(c2)) {
3260            continue;
3261        }
3262
3263        // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
3264        if (fNumericSet->contains(c1) &&
3265            (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
3266            fNumericSet->contains(c3)) {
3267            continue;
3268        }
3269
3270        // Rule (13)  Katakana x Katakana
3271        if (fKatakanaSet->contains(c1) &&
3272            fKatakanaSet->contains(c2))  {
3273            continue;
3274        }
3275
3276        // Rule 13a
3277        if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
3278             fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
3279             fExtendNumLetSet->contains(c2)) {
3280                continue;
3281             }
3282
3283        // Rule 13b
3284        if (fExtendNumLetSet->contains(c1) &&
3285                (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
3286                fKatakanaSet->contains(c2)))  {
3287                continue;
3288             }
3289
3290        // Rule 14.  Break found here.
3291        break;
3292    }
3293
3294    breakPos = p2;
3295    return breakPos;
3296}
3297
3298
3299UVector  *RBBIWordMonkey::charClasses() {
3300    return fSets;
3301}
3302
3303
3304RBBIWordMonkey::~RBBIWordMonkey() {
3305    delete fSets;
3306    delete fCRSet;
3307    delete fLFSet;
3308    delete fNewlineSet;
3309    delete fKatakanaSet;
3310    delete fALetterSet;
3311    delete fMidNumLetSet;
3312    delete fMidLetterSet;
3313    delete fMidNumSet;
3314    delete fNumericSet;
3315    delete fFormatSet;
3316    delete fExtendSet;
3317    delete fExtendNumLetSet;
3318    delete fOtherSet;
3319}
3320
3321
3322
3323
3324//------------------------------------------------------------------------------------------
3325//
3326//   class RBBISentMonkey      Sentence Break specific implementation
3327//                             of RBBIMonkeyKind.
3328//
3329//------------------------------------------------------------------------------------------
3330class RBBISentMonkey: public RBBIMonkeyKind {
3331public:
3332    RBBISentMonkey();
3333    virtual          ~RBBISentMonkey();
3334    virtual  UVector *charClasses();
3335    virtual  void     setText(const UnicodeString &s);
3336    virtual int32_t   next(int32_t i);
3337private:
3338    int               moveBack(int posFrom);
3339    int               moveForward(int posFrom);
3340    UChar32           cAt(int pos);
3341
3342    UVector      *fSets;
3343
3344    UnicodeSet  *fSepSet;
3345    UnicodeSet  *fFormatSet;
3346    UnicodeSet  *fSpSet;
3347    UnicodeSet  *fLowerSet;
3348    UnicodeSet  *fUpperSet;
3349    UnicodeSet  *fOLetterSet;
3350    UnicodeSet  *fNumericSet;
3351    UnicodeSet  *fATermSet;
3352    UnicodeSet  *fSContinueSet;
3353    UnicodeSet  *fSTermSet;
3354    UnicodeSet  *fCloseSet;
3355    UnicodeSet  *fOtherSet;
3356    UnicodeSet  *fExtendSet;
3357
3358    const UnicodeString  *fText;
3359
3360};
3361
3362RBBISentMonkey::RBBISentMonkey()
3363{
3364    UErrorCode  status = U_ZERO_ERROR;
3365
3366    fSets            = new UVector(status);
3367
3368    //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
3369    //                       set and made into character classes of their own.  For the monkey impl,
3370    //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
3371    fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
3372    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
3373    fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
3374    fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
3375    fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
3376    fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
3377    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
3378    fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
3379    fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
3380    fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
3381    fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
3382    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
3383    fOtherSet        = new UnicodeSet();
3384
3385    if(U_FAILURE(status)) {
3386      deferredStatus = status;
3387      return;
3388    }
3389
3390    fOtherSet->complement();
3391    fOtherSet->removeAll(*fSepSet);
3392    fOtherSet->removeAll(*fFormatSet);
3393    fOtherSet->removeAll(*fSpSet);
3394    fOtherSet->removeAll(*fLowerSet);
3395    fOtherSet->removeAll(*fUpperSet);
3396    fOtherSet->removeAll(*fOLetterSet);
3397    fOtherSet->removeAll(*fNumericSet);
3398    fOtherSet->removeAll(*fATermSet);
3399    fOtherSet->removeAll(*fSContinueSet);
3400    fOtherSet->removeAll(*fSTermSet);
3401    fOtherSet->removeAll(*fCloseSet);
3402    fOtherSet->removeAll(*fExtendSet);
3403
3404    fSets->addElement(fSepSet,       status);
3405    fSets->addElement(fFormatSet,    status);
3406    fSets->addElement(fSpSet,        status);
3407    fSets->addElement(fLowerSet,     status);
3408    fSets->addElement(fUpperSet,     status);
3409    fSets->addElement(fOLetterSet,   status);
3410    fSets->addElement(fNumericSet,   status);
3411    fSets->addElement(fATermSet,     status);
3412    fSets->addElement(fSContinueSet, status);
3413    fSets->addElement(fSTermSet,     status);
3414    fSets->addElement(fCloseSet,     status);
3415    fSets->addElement(fOtherSet,     status);
3416    fSets->addElement(fExtendSet,    status);
3417
3418    if (U_FAILURE(status)) {
3419        deferredStatus = status;
3420    }
3421}
3422
3423
3424
3425void RBBISentMonkey::setText(const UnicodeString &s) {
3426    fText       = &s;
3427}
3428
3429UVector  *RBBISentMonkey::charClasses() {
3430    return fSets;
3431}
3432
3433
3434//  moveBack()   Find the "significant" code point preceding the index i.
3435//               Skips over ($Extend | $Format)* .
3436//
3437int RBBISentMonkey::moveBack(int i) {
3438    if (i <= 0) {
3439        return -1;
3440    }
3441    UChar32   c;
3442    int32_t   j = i;
3443    do {
3444        j = fText->moveIndex32(j, -1);
3445        c = fText->char32At(j);
3446    }
3447    while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
3448    return j;
3449
3450 }
3451
3452
3453int RBBISentMonkey::moveForward(int i) {
3454    if (i>=fText->length()) {
3455        return fText->length();
3456    }
3457    UChar32   c;
3458    int32_t   j = i;
3459    do {
3460        j = fText->moveIndex32(j, 1);
3461        c = cAt(j);
3462    }
3463    while (fFormatSet->contains(c) || fExtendSet->contains(c));
3464    return j;
3465}
3466
3467UChar32 RBBISentMonkey::cAt(int pos) {
3468    if (pos<0 || pos>=fText->length()) {
3469        return -1;
3470    } else {
3471        return fText->char32At(pos);
3472    }
3473}
3474
3475int32_t RBBISentMonkey::next(int32_t prevPos) {
3476    int    p0, p1, p2, p3;    // Indices of the significant code points around the
3477                              //   break position being tested.  The candidate break
3478                              //   location is before p2.
3479
3480    int     breakPos = -1;
3481
3482    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
3483    UChar32 c;
3484
3485    if (U_FAILURE(deferredStatus)) {
3486        return -1;
3487    }
3488
3489    // Prev break at end of string.  return DONE.
3490    if (prevPos >= fText->length()) {
3491        return -1;
3492    }
3493    p0 = p1 = p2 = p3 = prevPos;
3494    c3 =  fText->char32At(prevPos);
3495    c0 = c1 = c2 = 0;
3496
3497    // Loop runs once per "significant" character position in the input text.
3498    for (;;) {
3499        // Move all of the positions forward in the input string.
3500        p0 = p1;  c0 = c1;
3501        p1 = p2;  c1 = c2;
3502        p2 = p3;  c2 = c3;
3503
3504        // Advancd p3 by    X(Extend | Format)*   Rule 4
3505        p3 = moveForward(p3);
3506        c3 = cAt(p3);
3507
3508        // Rule (3)  CR x LF
3509        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
3510            continue;
3511        }
3512
3513        // Rule (4).   Sep  <break>
3514        if (fSepSet->contains(c1)) {
3515            p2 = p1+1;   // Separators don't combine with Extend or Format.
3516            break;
3517        }
3518
3519        if (p2 >= fText->length()) {
3520            // Reached end of string.  Always a break position.
3521            break;
3522        }
3523
3524        if (p2 == prevPos) {
3525            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
3526            continue;
3527        }
3528
3529        // Rule (6).   ATerm x Numeric
3530        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
3531            continue;
3532        }
3533
3534        // Rule (7).  Upper ATerm  x  Uppper
3535        if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
3536            continue;
3537        }
3538
3539        // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
3540        //           Note:  STerm | ATerm are added to the negated part of the expression by a
3541        //                  note to the Unicode 5.0 documents.
3542        int p8 = p1;
3543        while (fSpSet->contains(cAt(p8))) {
3544            p8 = moveBack(p8);
3545        }
3546        while (fCloseSet->contains(cAt(p8))) {
3547            p8 = moveBack(p8);
3548        }
3549        if (fATermSet->contains(cAt(p8))) {
3550            p8=p2;
3551            for (;;) {
3552                c = cAt(p8);
3553                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
3554                    fLowerSet->contains(c) || fSepSet->contains(c) ||
3555                    fATermSet->contains(c) || fSTermSet->contains(c))  {
3556                    break;
3557                }
3558                p8 = moveForward(p8);
3559            }
3560            if (fLowerSet->contains(cAt(p8))) {
3561                continue;
3562            }
3563        }
3564
3565        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
3566        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
3567            p8 = p1;
3568            while (fSpSet->contains(cAt(p8))) {
3569                p8 = moveBack(p8);
3570            }
3571            while (fCloseSet->contains(cAt(p8))) {
3572                p8 = moveBack(p8);
3573            }
3574            c = cAt(p8);
3575            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
3576                continue;
3577            }
3578        }
3579
3580        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
3581        int p9 = p1;
3582        while (fCloseSet->contains(cAt(p9))) {
3583            p9 = moveBack(p9);
3584        }
3585        c = cAt(p9);
3586        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
3587            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
3588                continue;
3589            }
3590        }
3591
3592        // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
3593        int p10 = p1;
3594        while (fSpSet->contains(cAt(p10))) {
3595            p10 = moveBack(p10);
3596        }
3597        while (fCloseSet->contains(cAt(p10))) {
3598            p10 = moveBack(p10);
3599        }
3600        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
3601            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
3602                continue;
3603            }
3604        }
3605
3606        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
3607        int p11 = p1;
3608        if (fSepSet->contains(cAt(p11))) {
3609            p11 = moveBack(p11);
3610        }
3611        while (fSpSet->contains(cAt(p11))) {
3612            p11 = moveBack(p11);
3613        }
3614        while (fCloseSet->contains(cAt(p11))) {
3615            p11 = moveBack(p11);
3616        }
3617        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
3618            break;
3619        }
3620
3621        //  Rule (12)  Any x Any
3622        continue;
3623    }
3624    breakPos = p2;
3625    return breakPos;
3626}
3627
3628RBBISentMonkey::~RBBISentMonkey() {
3629    delete fSets;
3630    delete fSepSet;
3631    delete fFormatSet;
3632    delete fSpSet;
3633    delete fLowerSet;
3634    delete fUpperSet;
3635    delete fOLetterSet;
3636    delete fNumericSet;
3637    delete fATermSet;
3638    delete fSContinueSet;
3639    delete fSTermSet;
3640    delete fCloseSet;
3641    delete fOtherSet;
3642    delete fExtendSet;
3643}
3644
3645
3646
3647//-------------------------------------------------------------------------------------------
3648//
3649//  RBBILineMonkey
3650//
3651//-------------------------------------------------------------------------------------------
3652
3653class RBBILineMonkey: public RBBIMonkeyKind {
3654public:
3655    RBBILineMonkey();
3656    virtual          ~RBBILineMonkey();
3657    virtual  UVector *charClasses();
3658    virtual  void     setText(const UnicodeString &s);
3659    virtual  int32_t  next(int32_t i);
3660    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
3661private:
3662    UVector      *fSets;
3663
3664    UnicodeSet  *fBK;
3665    UnicodeSet  *fCR;
3666    UnicodeSet  *fLF;
3667    UnicodeSet  *fCM;
3668    UnicodeSet  *fNL;
3669    UnicodeSet  *fSG;
3670    UnicodeSet  *fWJ;
3671    UnicodeSet  *fZW;
3672    UnicodeSet  *fGL;
3673    UnicodeSet  *fCB;
3674    UnicodeSet  *fSP;
3675    UnicodeSet  *fB2;
3676    UnicodeSet  *fBA;
3677    UnicodeSet  *fBB;
3678    UnicodeSet  *fHY;
3679    UnicodeSet  *fH2;
3680    UnicodeSet  *fH3;
3681    UnicodeSet  *fCL;
3682    UnicodeSet  *fCP;
3683    UnicodeSet  *fEX;
3684    UnicodeSet  *fIN;
3685    UnicodeSet  *fJL;
3686    UnicodeSet  *fJV;
3687    UnicodeSet  *fJT;
3688    UnicodeSet  *fNS;
3689    UnicodeSet  *fOP;
3690    UnicodeSet  *fQU;
3691    UnicodeSet  *fIS;
3692    UnicodeSet  *fNU;
3693    UnicodeSet  *fPO;
3694    UnicodeSet  *fPR;
3695    UnicodeSet  *fSY;
3696    UnicodeSet  *fAI;
3697    UnicodeSet  *fAL;
3698    UnicodeSet  *fID;
3699    UnicodeSet  *fSA;
3700    UnicodeSet  *fXX;
3701
3702    BreakIterator  *fCharBI;
3703
3704    const UnicodeString  *fText;
3705    int32_t              *fOrigPositions;
3706
3707    RegexMatcher         *fNumberMatcher;
3708    RegexMatcher         *fLB11Matcher;
3709};
3710
3711
3712RBBILineMonkey::RBBILineMonkey()
3713{
3714    UErrorCode  status = U_ZERO_ERROR;
3715
3716    fSets  = new UVector(status);
3717
3718    fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3719    fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3720    fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3721    fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3722    fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3723    fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3724    fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3725    fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3726    fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3727    fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3728    fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3729    fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3730    fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3731    fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3732    fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3733    fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3734    fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3735    fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
3736    fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3737    fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3738    fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3739    fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3740    fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3741    fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3742    fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3743    fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3744    fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3745    fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3746    fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3747    fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3748    fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3749    fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3750    fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3751    fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3752    fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
3753    fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3754    fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3755
3756    if (U_FAILURE(status)) {
3757        deferredStatus = status;
3758        fCharBI = NULL;
3759        fNumberMatcher = NULL;
3760        return;
3761    }
3762
3763    fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
3764    fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
3765    fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
3766    fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
3767
3768    fSets->addElement(fBK, status);
3769    fSets->addElement(fCR, status);
3770    fSets->addElement(fLF, status);
3771    fSets->addElement(fCM, status);
3772    fSets->addElement(fNL, status);
3773    fSets->addElement(fWJ, status);
3774    fSets->addElement(fZW, status);
3775    fSets->addElement(fGL, status);
3776    fSets->addElement(fCB, status);
3777    fSets->addElement(fSP, status);
3778    fSets->addElement(fB2, status);
3779    fSets->addElement(fBA, status);
3780    fSets->addElement(fBB, status);
3781    fSets->addElement(fHY, status);
3782    fSets->addElement(fH2, status);
3783    fSets->addElement(fH3, status);
3784    fSets->addElement(fCL, status);
3785    fSets->addElement(fCP, status);
3786    fSets->addElement(fEX, status);
3787    fSets->addElement(fIN, status);
3788    fSets->addElement(fJL, status);
3789    fSets->addElement(fJT, status);
3790    fSets->addElement(fJV, status);
3791    fSets->addElement(fNS, status);
3792    fSets->addElement(fOP, status);
3793    fSets->addElement(fQU, status);
3794    fSets->addElement(fIS, status);
3795    fSets->addElement(fNU, status);
3796    fSets->addElement(fPO, status);
3797    fSets->addElement(fPR, status);
3798    fSets->addElement(fSY, status);
3799    fSets->addElement(fAI, status);
3800    fSets->addElement(fAL, status);
3801    fSets->addElement(fID, status);
3802    fSets->addElement(fWJ, status);
3803    fSets->addElement(fSA, status);
3804    fSets->addElement(fSG, status);
3805
3806    const char *rules =
3807            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3808            "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3809            "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3810            "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3811            "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3812            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3813
3814    fNumberMatcher = new RegexMatcher(
3815        UnicodeString(rules, -1, US_INV), 0, status);
3816
3817    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3818
3819    if (U_FAILURE(status)) {
3820        deferredStatus = status;
3821    }
3822}
3823
3824
3825void RBBILineMonkey::setText(const UnicodeString &s) {
3826    fText       = &s;
3827    fCharBI->setText(s);
3828    fNumberMatcher->reset(s);
3829}
3830
3831//
3832//  rule9Adjust
3833//     Line Break TR rules 9 and 10 implementation.
//     This deals with combining marks and other sequences
//     that must be treated as if they were something other than what they actually are.
3836//
3837//     This is factored out into a separate function because it must be applied twice for
3838//     each potential break, once to the chars before the position being checked, then
3839//     again to the text following the possible break.
3840//
3841void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3842    if (pos == -1) {
3843        // Invalid initial position.  Happens during the warmup iteration of the
3844        //   main loop in next().
3845        return;
3846    }
3847
3848    int32_t  nPos = *nextPos;
3849
3850    // LB 9  Keep combining sequences together.
3851    //  advance over any CM class chars.  Note that Line Break CM is different
3852    //  from the normal Grapheme Extend property.
3853    if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3854          *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3855        for (;;) {
3856            *nextChar = fText->char32At(nPos);
3857            if (!fCM->contains(*nextChar)) {
3858                break;
3859            }
3860            nPos = fText->moveIndex32(nPos, 1);
3861        }
3862    }
3863
3864
3865    // LB 9 Treat X CM* as if it were x.
3866    //       No explicit action required.
3867
3868    // LB 10  Treat any remaining combining mark as AL
3869    if (fCM->contains(*posChar)) {
3870        *posChar = 0x41;   // thisChar = 'A';
3871    }
3872
3873    // Push the updated nextPos and nextChar back to our caller.
3874    // This only makes a difference if posChar got bigger by consuming a
3875    // combining sequence.
3876    *nextPos  = nPos;
3877    *nextChar = fText->char32At(nPos);
3878}
3879
3880
3881
3882int32_t RBBILineMonkey::next(int32_t startPos) {
3883    UErrorCode status = U_ZERO_ERROR;
3884    int32_t    pos;       //  Index of the char following a potential break position
3885    UChar32    thisChar;  //  Character at above position "pos"
3886
3887    int32_t    prevPos;   //  Index of the char preceding a potential break position
3888    UChar32    prevChar;  //  Character at above position.  Note that prevChar
3889                          //   and thisChar may not be adjacent because combining
3890                          //   characters between them will be ignored.
3891
3892    int32_t    nextPos;   //  Index of the next character following pos.
3893                          //     Usually skips over combining marks.
3894    int32_t    nextCPPos; //  Index of the code point following "pos."
3895                          //     May point to a combining mark.
3896    int32_t    tPos;      //  temp value.
3897    UChar32    c;
3898
3899    if (U_FAILURE(deferredStatus)) {
3900        return -1;
3901    }
3902
3903    if (startPos >= fText->length()) {
3904        return -1;
3905    }
3906
3907
3908    // Initial values for loop.  Loop will run the first time without finding breaks,
3909    //                           while the invalid values shift out and the "this" and
3910    //                           "prev" positions are filled in with good values.
3911    pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
3912    thisChar = prevChar  = 0;
3913    nextPos  = nextCPPos = startPos;
3914
3915
3916    // Loop runs once per position in the test text, until a break position
3917    //  is found.
3918    for (;;) {
3919        prevPos   = pos;
3920        prevChar  = thisChar;
3921
3922        pos       = nextPos;
3923        thisChar  = fText->char32At(pos);
3924
3925        nextCPPos = fText->moveIndex32(pos, 1);
3926        nextPos   = nextCPPos;
3927
3928        // Rule LB2 - Break at end of text.
3929        if (pos >= fText->length()) {
3930            break;
3931        }
3932
3933        // Rule LB 9 - adjust for combining sequences.
3934        //             We do this one out-of-order because the adjustment does not change anything
3935        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3936        //             be applied.
3937        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3938        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3939        c = fText->char32At(nextPos);
3940        rule9Adjust(pos,     &thisChar, &nextPos, &c);
3941
3942        // If the loop is still warming up - if we haven't shifted the initial
3943        //   -1 positions out of prevPos yet - loop back to advance the
3944        //    position in the input without any further looking for breaks.
3945        if (prevPos == -1) {
3946            continue;
3947        }
3948
3949        // LB 4  Always break after hard line breaks,
3950        if (fBK->contains(prevChar)) {
3951            break;
3952        }
3953
3954        // LB 5  Break after CR, LF, NL, but not inside CR LF
3955        if (prevChar == 0x0d && thisChar == 0x0a) {
3956            continue;
3957        }
3958        if (prevChar == 0x0d ||
3959            prevChar == 0x0a ||
3960            prevChar == 0x85)  {
3961            break;
3962        }
3963
3964        // LB 6  Don't break before hard line breaks
3965        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3966            fBK->contains(thisChar)) {
3967                continue;
3968        }
3969
3970
3971        // LB 7  Don't break before spaces or zero-width space.
3972        if (fSP->contains(thisChar)) {
3973            continue;
3974        }
3975
3976        if (fZW->contains(thisChar)) {
3977            continue;
3978        }
3979
3980        // LB 8  Break after zero width space
3981        if (fZW->contains(prevChar)) {
3982            break;
3983        }
3984
3985        // LB 9, 10  Already done, at top of loop.
3986        //
3987
3988
3989        // LB 11  Do not break before or after WORD JOINER and related characters.
3990        //    x  WJ
3991        //    WJ  x
3992        //
3993        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3994            continue;
3995        }
3996
3997        // LB 12
3998        //    GL  x
3999        if (fGL->contains(prevChar)) {
4000            continue;
4001        }
4002
4003        // LB 12a
4004        //    [^SP BA HY] x GL
4005        if (!(fSP->contains(prevChar) ||
4006              fBA->contains(prevChar) ||
4007              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
4008            continue;
4009        }
4010
4011
4012
4013        // LB 13  Don't break before closings.
4014        //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
4015        //        fall into LB 17 and the more general number regular expression.
4016        //
4017        if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
4018            (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
4019                                         fEX->contains(thisChar)  ||
4020            (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
4021            (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
4022            continue;
4023        }
4024
4025        // LB 14 Don't break after OP SP*
4026        //       Scan backwards, checking for this sequence.
4027        //       The OP char could include combining marks, so we actually check for
4028        //           OP CM* SP*
4029        //       Another Twist: The Rule 67 fixes may have changed a SP CM
4030        //       sequence into a ID char, so before scanning back through spaces,
4031        //       verify that prevChar is indeed a space.  The prevChar variable
4032        //       may differ from fText[prevPos]
4033        tPos = prevPos;
4034        if (fSP->contains(prevChar)) {
4035            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
4036                tPos=fText->moveIndex32(tPos, -1);
4037            }
4038        }
4039        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
4040            tPos=fText->moveIndex32(tPos, -1);
4041        }
4042        if (fOP->contains(fText->char32At(tPos))) {
4043            continue;
4044        }
4045
4046
4047        // LB 15    QU SP* x OP
4048        if (fOP->contains(thisChar)) {
4049            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
4050            int tPos = prevPos;
4051            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
4052                tPos = fText->moveIndex32(tPos, -1);
4053            }
4054            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
4055                tPos = fText->moveIndex32(tPos, -1);
4056            }
4057            if (fQU->contains(fText->char32At(tPos))) {
4058                continue;
4059            }
4060        }
4061
4062
4063
4064        // LB 16   (CL | CP) SP* x NS
4065        //    Scan backwards for SP* CM* (CL | CP)
4066        if (fNS->contains(thisChar)) {
4067            int tPos = prevPos;
4068            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
4069                tPos = fText->moveIndex32(tPos, -1);
4070            }
4071            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
4072                tPos = fText->moveIndex32(tPos, -1);
4073            }
4074            if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
4075                continue;
4076            }
4077        }
4078
4079
4080        // LB 17        B2 SP* x B2
4081        if (fB2->contains(thisChar)) {
4082            //  Scan backwards, checking for the B2 CM* SP* sequence.
4083            tPos = prevPos;
4084            if (fSP->contains(prevChar)) {
4085                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
4086                    tPos=fText->moveIndex32(tPos, -1);
4087                }
4088            }
4089            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
4090                tPos=fText->moveIndex32(tPos, -1);
4091            }
4092            if (fB2->contains(fText->char32At(tPos))) {
4093                continue;
4094            }
4095        }
4096
4097
4098        // LB 18    break after space
4099        if (fSP->contains(prevChar)) {
4100            break;
4101        }
4102
4103        // LB 19
4104        //    x   QU
4105        //    QU  x
4106        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
4107            continue;
4108        }
4109
4110        // LB 20  Break around a CB
4111        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
4112            break;
4113        }
4114
4115        // LB 21
4116        if (fBA->contains(thisChar) ||
4117            fHY->contains(thisChar) ||
4118            fNS->contains(thisChar) ||
4119            fBB->contains(prevChar) )   {
4120            continue;
4121        }
4122
4123        // LB 22
4124        if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
4125            (fID->contains(prevChar) && fIN->contains(thisChar)) ||
4126            (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
4127            (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
4128            continue;
4129        }
4130
4131
4132        // LB 23    ID x PO
4133        //          AL x NU
4134        //          NU x AL
4135        if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
4136            (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
4137            (fNU->contains(prevChar) && fAL->contains(thisChar)) )   {
4138            continue;
4139        }
4140
4141        // LB 24  Do not break between prefix and letters or ideographs.
4142        //        PR x ID
4143        //        PR x AL
4144        //        PO x AL
4145        if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
4146            (fPR->contains(prevChar) && fAL->contains(thisChar)) ||
4147            (fPO->contains(prevChar) && fAL->contains(thisChar)) )   {
4148            continue;
4149        }
4150
4151
4152
4153        // LB 25    Numbers
4154        if (fNumberMatcher->lookingAt(prevPos, status)) {
4155            if (U_FAILURE(status)) {
4156                break;
4157            }
4158            // Matched a number.  But could have been just a single digit, which would
4159            //    not represent a "no break here" between prevChar and thisChar
4160            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
4161            if (numEndIdx > pos) {
4162                // Number match includes at least our two chars being checked
4163                if (numEndIdx > nextPos) {
4164                    // Number match includes additional chars.  Update pos and nextPos
4165                    //   so that next loop iteration will continue at the end of the number,
4166                    //   checking for breaks between last char in number & whatever follows.
4167                    pos = nextPos = numEndIdx;
4168                    do {
4169                        pos = fText->moveIndex32(pos, -1);
4170                        thisChar = fText->char32At(pos);
4171                    } while (fCM->contains(thisChar));
4172                }
4173                continue;
4174            }
4175        }
4176
4177
4178        // LB 26 Do not break a Korean syllable.
4179        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
4180                                        fJV->contains(thisChar) ||
4181                                        fH2->contains(thisChar) ||
4182                                        fH3->contains(thisChar))) {
4183                                            continue;
4184                                        }
4185
4186        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
4187            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
4188                continue;
4189        }
4190
4191        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
4192            fJT->contains(thisChar)) {
4193                continue;
4194        }
4195
4196        // LB 27 Treat a Korean Syllable Block the same as ID.
4197        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
4198            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
4199            fIN->contains(thisChar)) {
4200                continue;
4201            }
4202        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
4203            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
4204            fPO->contains(thisChar)) {
4205                continue;
4206            }
4207        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
4208            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
4209                continue;
4210            }
4211
4212
4213
4214        // LB 28  Do not break between alphabetics ("at").
4215        if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
4216            continue;
4217        }
4218
4219        // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
4220        if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
4221            continue;
4222        }
4223
4224        // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
4225        //          (AL | NU) x OP
4226        //          CP x (AL | NU)
4227        if ((fAL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
4228            continue;
4229        }
4230        if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fNU->contains(thisChar))) {
4231            continue;
4232        }
4233
4234        // LB 31    Break everywhere else
4235        break;
4236
4237    }
4238
4239    return pos;
4240}
4241
4242
4243UVector  *RBBILineMonkey::charClasses() {
4244    return fSets;
4245}
4246
4247
4248RBBILineMonkey::~RBBILineMonkey() {
4249    delete fSets;
4250
4251    delete fBK;
4252    delete fCR;
4253    delete fLF;
4254    delete fCM;
4255    delete fNL;
4256    delete fWJ;
4257    delete fZW;
4258    delete fGL;
4259    delete fCB;
4260    delete fSP;
4261    delete fB2;
4262    delete fBA;
4263    delete fBB;
4264    delete fHY;
4265    delete fH2;
4266    delete fH3;
4267    delete fCL;
4268    delete fCP;
4269    delete fEX;
4270    delete fIN;
4271    delete fJL;
4272    delete fJV;
4273    delete fJT;
4274    delete fNS;
4275    delete fOP;
4276    delete fQU;
4277    delete fIS;
4278    delete fNU;
4279    delete fPO;
4280    delete fPR;
4281    delete fSY;
4282    delete fAI;
4283    delete fAL;
4284    delete fID;
4285    delete fSA;
4286    delete fSG;
4287    delete fXX;
4288
4289    delete fCharBI;
4290    delete fNumberMatcher;
4291}
4292
4293
4294//-------------------------------------------------------------------------------------------
4295//
4296//   TestMonkey
4297//
4298//     params
4299//       seed=nnnnn        Random number starting seed.
4300//                         Setting the seed allows errors to be reproduced.
4301//       loop=nnn          Looping count.  Controls running time.
4302//                         -1:  run forever.
4303//                          0 or greater:  run length.
4304//
4305//       type = char | word | line | sent | title
4306//
4307//-------------------------------------------------------------------------------------------
4308
4309static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
4310    int32_t val = defaultVal;
4311    name.append(" *= *(-?\\d+)");
4312    UErrorCode status = U_ZERO_ERROR;
4313    RegexMatcher m(name, params, 0, status);
4314    if (m.find()) {
4315        // The param exists.  Convert the string to an int.
4316        char valString[100];
4317        int32_t paramLength = m.end(1, status) - m.start(1, status);
4318        if (paramLength >= (int32_t)(sizeof(valString)-1)) {
4319            paramLength = (int32_t)(sizeof(valString)-2);
4320        }
4321        params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
4322        val = strtol(valString,  NULL, 10);
4323
4324        // Delete this parameter from the params string.
4325        m.reset();
4326        params = m.replaceFirst("", status);
4327    }
4328    U_ASSERT(U_SUCCESS(status));
4329    return val;
4330}
4331#endif
4332
4333static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
4334                                    BreakIterator *bi,
4335                                    int expected[],
4336                                    int expectedcount)
4337{
4338    int count = 0;
4339    int i = 0;
4340    int forward[50];
4341    bi->setText(ustr);
4342    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
4343        forward[count] = i;
4344        if (count < expectedcount && expected[count] != i) {
4345            test->errln("break forward test failed: expected %d but got %d",
4346                        expected[count], i);
4347            break;
4348        }
4349        count ++;
4350    }
4351    if (count != expectedcount) {
4352        printStringBreaks(ustr, expected, expectedcount);
4353        test->errln("break forward test failed: missed %d match",
4354                    expectedcount - count);
4355        return;
4356    }
4357    // testing boundaries
4358    for (i = 1; i < expectedcount; i ++) {
4359        int j = expected[i - 1];
4360        if (!bi->isBoundary(j)) {
4361            printStringBreaks(ustr, expected, expectedcount);
4362            test->errln("isBoundary() failed.  Expected boundary at position %d", j);
4363            return;
4364        }
4365        for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
4366            if (bi->isBoundary(j)) {
4367                printStringBreaks(ustr, expected, expectedcount);
4368                test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
4369                return;
4370            }
4371        }
4372    }
4373
4374    for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
4375        count --;
4376        if (forward[count] != i) {
4377            printStringBreaks(ustr, expected, expectedcount);
4378            test->errln("happy break test previous() failed: expected %d but got %d",
4379                        forward[count], i);
4380            break;
4381        }
4382    }
4383    if (count != 0) {
4384        printStringBreaks(ustr, expected, expectedcount);
4385        test->errln("break test previous() failed: missed a match");
4386        return;
4387    }
4388
4389    // testing preceding
4390    for (i = 0; i < expectedcount - 1; i ++) {
4391        // int j = expected[i] + 1;
4392        int j = ustr.moveIndex32(expected[i], 1);
4393        for (; j <= expected[i + 1]; j ++) {
4394            if (bi->preceding(j) != expected[i]) {
4395                printStringBreaks(ustr, expected, expectedcount);
4396                test->errln("preceding(): Not expecting boundary at position %d", j);
4397                return;
4398            }
4399        }
4400    }
4401}
4402
4403void RBBITest::TestWordBreaks(void)
4404{
4405#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4406
4407    Locale        locale("en");
4408    UErrorCode    status = U_ZERO_ERROR;
4409    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4410    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4411    // Replaced any C+J characters in a row with a random sequence of characters
4412    // of the same length to make our C+J segmentation not get in the way.
4413    static const char *strlist[] =
4414    {
4415    "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
4416    "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
4417    "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
4418    "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
4419    "\\uac00\\u3588\\u009c\\u0953\\u194b",
4420    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
4421    "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
4422    "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
4423    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
4424    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
4425    "\\u2027\\U000e0067\\u0a47\\u00b7",
4426    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
4427    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
4428    "\\u0589\\U000e006e\\u0a42\\U000104a5",
4429    "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
4430    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
4431    "\\u0027\\u11af\\U000e0057\\u0602",
4432    "\\U0001d7f2\\U000e007\\u0004\\u0589",
4433    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
4434    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
4435    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
4436    "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
4437    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
4438    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
4439    "\\u0233\\U000e0020\\u0a69\\u0d6a",
4440    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
4441    "\\u18f4\\U000e0049\\u20e7\\u2027",
4442    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
4443    "\\ua183\\u102d\\u0bec\\u003a",
4444    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
4445    "\\u003a\\u0e57\\u0fad\\u002e",
4446    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
4447    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
4448    "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
4449    "\\u003a\\u0664\\u00b7\\u1fba",
4450    "\\u003b\\u0027\\u00b7\\u47a3",
4451    "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
4452    "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
4453    "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
4454    };
4455    int loop;
4456    if (U_FAILURE(status)) {
4457        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4458        return;
4459    }
4460    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4461        // printf("looping %d\n", loop);
4462        UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
4463        // RBBICharMonkey monkey;
4464        RBBIWordMonkey monkey;
4465
4466        int expected[50];
4467        int expectedcount = 0;
4468
4469        monkey.setText(ustr);
4470        int i;
4471        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4472            expected[expectedcount ++] = i;
4473        }
4474
4475        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4476    }
4477    delete bi;
4478#endif
4479}
4480
4481void RBBITest::TestWordBoundary(void)
4482{
4483    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
4484    Locale        locale("en");
4485    UErrorCode    status = U_ZERO_ERROR;
4486    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4487    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4488    UChar         str[50];
4489    static const char *strlist[] =
4490    {
4491    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
4492    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
4493    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
4494    "\\u2027\\U000e0067\\u0a47\\u00b7",
4495    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
4496    "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
4497    "\\u0589\\U000e006e\\u0a42\\U000104a5",
4498    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
4499    "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
4500    "\\u0027\\u11af\\U000e0057\\u0602",
4501    "\\U0001d7f2\\U000e007\\u0004\\u0589",
4502    "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
4503    "\\U0001d7f2\\U000e007d\\u0004\\u0589",
4504    "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
4505    "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
4506    "\\U000e0065\\u302c\\u09ee\\U000e0068",
4507    "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
4508    "\\u0233\\U000e0020\\u0a69\\u0d6a",
4509    "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
4510    "\\u58f4\\U000e0049\\u20e7\\u2027",
4511    "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
4512    "\\ua183\\u102d\\u0bec\\u003a",
4513    "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
4514    "\\u003a\\u0e57\\u0fad\\u002e",
4515    "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
4516    "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
4517    "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
4518    "\\u003a\\u0664\\u00b7\\u1fba",
4519    "\\u003b\\u0027\\u00b7\\u47a3",
4520    };
4521    int loop;
4522    if (U_FAILURE(status)) {
4523        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4524        return;
4525    }
4526    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4527        // printf("looping %d\n", loop);
4528        u_unescape(strlist[loop], str, 20);
4529        UnicodeString ustr(str);
4530        int forward[50];
4531        int count = 0;
4532
4533        bi->setText(ustr);
4534        int prev = 0;
4535        int i;
4536        for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
4537            forward[count ++] = i;
4538            if (i > prev) {
4539                int j;
4540                for (j = prev + 1; j < i; j ++) {
4541                    if (bi->isBoundary(j)) {
4542                        printStringBreaks(ustr, forward, count);
4543                        errln("happy boundary test failed: expected %d not a boundary",
4544                               j);
4545                        return;
4546                    }
4547                }
4548            }
4549            if (!bi->isBoundary(i)) {
4550                printStringBreaks(ustr, forward, count);
4551                errln("happy boundary test failed: expected %d a boundary",
4552                       i);
4553                return;
4554            }
4555            prev = i;
4556        }
4557    }
4558    delete bi;
4559}
4560
4561void RBBITest::TestLineBreaks(void)
4562{
4563#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4564    Locale        locale("en");
4565    UErrorCode    status = U_ZERO_ERROR;
4566    BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4567    const int32_t  STRSIZE = 50;
4568    UChar         str[STRSIZE];
4569    static const char *strlist[] =
4570    {
4571     "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
4572     "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
4573             "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
4574     "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
4575             "u2014\\U000e0105\\u118c\\u000a\\u07f8",
4576     "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
4577     "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4578     "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4579     "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4580     "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
4581     "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
4582     "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4583     "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4584     "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4585     "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4586     "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4587     "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4588     "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4589     "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4590     "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4591     "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4592     "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4593     "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4594     "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4595     "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4596     "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4597     "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
4598     "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4599     "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4600     "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4601     "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4602     "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4603     "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
4604     "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4605     "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4606     "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
4607     "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4608     "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4609     "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4610     "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4611     "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4612     "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4613         "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
4614         "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
4615         "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
4616     "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4617         "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4618    };
4619    int loop;
4620    TEST_ASSERT_SUCCESS(status);
4621    if (U_FAILURE(status)) {
4622        return;
4623    }
4624    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4625        // printf("looping %d\n", loop);
4626        int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4627        if (t >= STRSIZE) {
4628            TEST_ASSERT(FALSE);
4629            continue;
4630        }
4631
4632
4633        UnicodeString ustr(str);
4634        RBBILineMonkey monkey;
4635        if (U_FAILURE(monkey.deferredStatus)) {
4636            continue;
4637        }
4638
4639        const int EXPECTEDSIZE = 50;
4640        int expected[EXPECTEDSIZE];
4641        int expectedcount = 0;
4642
4643        monkey.setText(ustr);
4644        int i;
4645        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4646            if (expectedcount >= EXPECTEDSIZE) {
4647                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4648                return;
4649            }
4650            expected[expectedcount ++] = i;
4651        }
4652
4653        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4654    }
4655    delete bi;
4656#endif
4657}
4658
4659void RBBITest::TestSentBreaks(void)
4660{
4661#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4662    Locale        locale("en");
4663    UErrorCode    status = U_ZERO_ERROR;
4664    BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4665    UChar         str[200];
4666    static const char *strlist[] =
4667    {
4668     "Now\ris\nthe\r\ntime\n\rfor\r\r",
4669     "This\n",
4670     "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4671     "\"Sentence ending with a quote.\" Bye.",
4672     "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
4673     "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4674     "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4675     "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4676     "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4677     "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4678     "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4679             "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4680             "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4681             "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4682     "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4683             "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4684             "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4685             "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4686             "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4687             "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4688    };
4689    int loop;
4690    if (U_FAILURE(status)) {
4691        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4692        return;
4693    }
4694    for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4695        u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
4696        UnicodeString ustr(str);
4697
4698        RBBISentMonkey monkey;
4699        if (U_FAILURE(monkey.deferredStatus)) {
4700            continue;
4701        }
4702
4703        const int EXPECTEDSIZE = 50;
4704        int expected[EXPECTEDSIZE];
4705        int expectedcount = 0;
4706
4707        monkey.setText(ustr);
4708        int i;
4709        for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4710            if (expectedcount >= EXPECTEDSIZE) {
4711                TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4712                return;
4713            }
4714            expected[expectedcount ++] = i;
4715        }
4716
4717        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4718    }
4719    delete bi;
4720#endif
4721}
4722
4723void RBBITest::TestMonkey(char *params) {
4724#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4725
4726    UErrorCode     status    = U_ZERO_ERROR;
4727    int32_t        loopCount = 500;
4728    int32_t        seed      = 1;
4729    UnicodeString  breakType = "all";
4730    Locale         locale("en");
4731    UBool          useUText  = FALSE;
4732
4733    if (quick == FALSE) {
4734        loopCount = 10000;
4735    }
4736
4737    if (params) {
4738        UnicodeString p(params);
4739        loopCount = getIntParam("loop", p, loopCount);
4740        seed      = getIntParam("seed", p, seed);
4741
4742        RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4743        if (m.find()) {
4744            breakType = m.group(1, status);
4745            m.reset();
4746            p = m.replaceFirst("", status);
4747        }
4748
4749        RegexMatcher u(" *utext", p, 0, status);
4750        if (u.find()) {
4751            useUText = TRUE;
4752            u.reset();
4753            p = u.replaceFirst("", status);
4754        }
4755
4756
4757        // m.reset(p);
4758        if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4759            // Each option is stripped out of the option string as it is processed.
4760            // All options have been checked.  The option string should have been completely emptied..
4761            char buf[100];
4762            p.extract(buf, sizeof(buf), NULL, status);
4763            buf[sizeof(buf)-1] = 0;
4764            errln("Unrecognized or extra parameter:  %s\n", buf);
4765            return;
4766        }
4767
4768    }
4769
4770    if (breakType == "char" || breakType == "all") {
4771        RBBICharMonkey  m;
4772        BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4773        if (U_SUCCESS(status)) {
4774            RunMonkey(bi, m, "char", seed, loopCount, useUText);
4775            if (breakType == "all" && useUText==FALSE) {
4776                // Also run a quick test with UText when "all" is specified
4777                RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4778            }
4779        }
4780        else {
4781            errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4782        }
4783        delete bi;
4784    }
4785
4786    if (breakType == "word" || breakType == "all") {
4787        logln("Word Break Monkey Test");
4788        RBBIWordMonkey  m;
4789        BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4790        if (U_SUCCESS(status)) {
4791            RunMonkey(bi, m, "word", seed, loopCount, useUText);
4792        }
4793        else {
4794            errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4795        }
4796        delete bi;
4797    }
4798
4799    if (breakType == "line" || breakType == "all") {
4800        logln("Line Break Monkey Test");
4801        RBBILineMonkey  m;
4802        BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4803        if (loopCount >= 10) {
4804            loopCount = loopCount / 5;   // Line break runs slower than the others.
4805        }
4806        if (U_SUCCESS(status)) {
4807            RunMonkey(bi, m, "line", seed, loopCount, useUText);
4808        }
4809        else {
4810            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4811        }
4812        delete bi;
4813    }
4814
4815    if (breakType == "sent" || breakType == "all"  ) {
4816        logln("Sentence Break Monkey Test");
4817        RBBISentMonkey  m;
4818        BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4819        if (loopCount >= 10) {
4820            loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4821        }
4822        if (U_SUCCESS(status)) {
4823            RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4824        }
4825        else {
4826            errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4827        }
4828        delete bi;
4829    }
4830
4831#endif
4832}
4833
4834//
4835//  Run a RBBI monkey test.  Common routine, for all break iterator types.
4836//    Parameters:
4837//       bi      - the break iterator to use
4838//       mk      - MonkeyKind, abstraction for obtaining expected results
4839//       name    - Name of test (char, word, etc.) for use in error messages
4840//       seed    - Seed for starting random number generator (parameter from user)
4841//       numIterations
4842//
4843void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4844                         int32_t numIterations, UBool useUText) {
4845
4846#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4847
4848    const int32_t    TESTSTRINGLEN = 500;
4849    UnicodeString    testText;
4850    int32_t          numCharClasses;
4851    UVector          *chClasses;
4852    int              expected[TESTSTRINGLEN*2 + 1];
4853    int              expectedCount = 0;
4854    char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4855    char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4856    char             reverseBreaks[TESTSTRINGLEN*2+1];
4857    char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4858    char             followingBreaks[TESTSTRINGLEN*2+1];
4859    char             precedingBreaks[TESTSTRINGLEN*2+1];
4860    int              i;
4861    int              loopCount = 0;
4862
4863    m_seed = seed;
4864
4865    numCharClasses = mk.charClasses()->size();
4866    chClasses      = mk.charClasses();
4867
4868    // Check for errors that occured during the construction of the MonkeyKind object.
4869    //  Can't report them where they occured because errln() is a method coming from intlTest,
4870    //  and is not visible outside of RBBITest :-(
4871    if (U_FAILURE(mk.deferredStatus)) {
4872        errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4873        return;
4874    }
4875
4876    // Verify that the character classes all have at least one member.
4877    for (i=0; i<numCharClasses; i++) {
4878        UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4879        if (s == NULL || s->size() == 0) {
4880            errln("Character Class #%d is null or of zero size.", i);
4881            return;
4882        }
4883    }
4884
4885    while (loopCount < numIterations || numIterations == -1) {
4886        if (numIterations == -1 && loopCount % 10 == 0) {
4887            // If test is running in an infinite loop, display a periodic tic so
4888            //   we can tell that it is making progress.
4889            fprintf(stderr, ".");
4890        }
4891        // Save current random number seed, so that we can recreate the random numbers
4892        //   for this loop iteration in event of an error.
4893        seed = m_seed;
4894
4895        // Populate a test string with data.
4896        testText.truncate(0);
4897        for (i=0; i<TESTSTRINGLEN; i++) {
4898            int32_t  aClassNum = m_rand() % numCharClasses;
4899            UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4900            int32_t   charIdx = m_rand() % classSet->size();
4901            UChar32   c = classSet->charAt(charIdx);
4902            if (c < 0) {   // TODO:  deal with sets containing strings.
4903                errln("c < 0");
4904                break;
4905            }
4906            testText.append(c);
4907        }
4908
4909        // Calculate the expected results for this test string.
4910        mk.setText(testText);
4911        memset(expectedBreaks, 0, sizeof(expectedBreaks));
4912        expectedBreaks[0] = 1;
4913        int32_t breakPos = 0;
4914        expectedCount = 0;
4915        for (;;) {
4916            breakPos = mk.next(breakPos);
4917            if (breakPos == -1) {
4918                break;
4919            }
4920            if (breakPos > testText.length()) {
4921                errln("breakPos > testText.length()");
4922            }
4923            expectedBreaks[breakPos] = 1;
4924            U_ASSERT(expectedCount<testText.length());
4925            expected[expectedCount ++] = breakPos;
4926        }
4927
4928        // Find the break positions using forward iteration
4929        memset(forwardBreaks, 0, sizeof(forwardBreaks));
4930        if (useUText) {
4931            UErrorCode status = U_ZERO_ERROR;
4932            UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4933            // testUText = utext_openUnicodeString(testUText, &testText, &status);
4934            bi->setText(testUText, status);
4935            TEST_ASSERT_SUCCESS(status);
4936            utext_close(testUText);   // The break iterator does a shallow clone of the UText
4937                                      //  This UText can be closed immediately, so long as the
4938                                      //  testText string continues to exist.
4939        } else {
4940            bi->setText(testText);
4941        }
4942
4943        for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4944            if (i < 0 || i > testText.length()) {
4945                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4946                break;
4947            }
4948            forwardBreaks[i] = 1;
4949        }
4950
4951        // Find the break positions using reverse iteration
4952        memset(reverseBreaks, 0, sizeof(reverseBreaks));
4953        for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4954            if (i < 0 || i > testText.length()) {
4955                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4956                break;
4957            }
4958            reverseBreaks[i] = 1;
4959        }
4960
4961        // Find the break positions using isBoundary() tests.
4962        memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4963        U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4964        for (i=0; i<=testText.length(); i++) {
4965            isBoundaryBreaks[i] = bi->isBoundary(i);
4966        }
4967
4968
4969        // Find the break positions using the following() function.
4970        // printf(".");
4971        memset(followingBreaks, 0, sizeof(followingBreaks));
4972        int32_t   lastBreakPos = 0;
4973        followingBreaks[0] = 1;
4974        for (i=0; i<testText.length(); i++) {
4975            breakPos = bi->following(i);
4976            if (breakPos <= i ||
4977                breakPos < lastBreakPos ||
4978                breakPos > testText.length() ||
4979                (breakPos > lastBreakPos && lastBreakPos > i)) {
4980                errln("%s break monkey test: "
4981                    "Out of range value returned by BreakIterator::following().\n"
4982                        "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4983                         name, seed, i, breakPos, lastBreakPos);
4984                break;
4985            }
4986            followingBreaks[breakPos] = 1;
4987            lastBreakPos = breakPos;
4988        }
4989
4990        // Find the break positions using the preceding() function.
4991        memset(precedingBreaks, 0, sizeof(precedingBreaks));
4992        lastBreakPos = testText.length();
4993        precedingBreaks[testText.length()] = 1;
4994        for (i=testText.length(); i>0; i--) {
4995            breakPos = bi->preceding(i);
4996            if (breakPos >= i ||
4997                breakPos > lastBreakPos ||
4998                (breakPos < 0 && testText.getChar32Start(i)>0) ||
4999                (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
5000                errln("%s break monkey test: "
5001                    "Out of range value returned by BreakIterator::preceding().\n"
5002                    "index=%d;  prev returned %d; lastBreak=%d" ,
5003                    name,  i, breakPos, lastBreakPos);
5004                if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
5005                    precedingBreaks[i] = 2;   // Forces an error.
5006                }
5007            } else {
5008                if (breakPos >= 0) {
5009                    precedingBreaks[breakPos] = 1;
5010                }
5011                lastBreakPos = breakPos;
5012            }
5013        }
5014
5015        // Compare the expected and actual results.
5016        for (i=0; i<=testText.length(); i++) {
5017            const char *errorType = NULL;
5018            if  (forwardBreaks[i] != expectedBreaks[i]) {
5019                errorType = "next()";
5020            } else if (reverseBreaks[i] != forwardBreaks[i]) {
5021                errorType = "previous()";
5022            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
5023                errorType = "isBoundary()";
5024            } else if (followingBreaks[i] != expectedBreaks[i]) {
5025                errorType = "following()";
5026            } else if (precedingBreaks[i] != expectedBreaks[i]) {
5027                errorType = "preceding()";
5028            }
5029
5030
5031            if (errorType != NULL) {
5032                // Format a range of the test text that includes the failure as
5033                //  a data item that can be included in the rbbi test data file.
5034
5035                // Start of the range is the last point where expected and actual results
5036                //   both agreed that there was a break position.
5037                int startContext = i;
5038                int32_t count = 0;
5039                for (;;) {
5040                    if (startContext==0) { break; }
5041                    startContext --;
5042                    if (expectedBreaks[startContext] != 0) {
5043                        if (count == 2) break;
5044                        count ++;
5045                    }
5046                }
5047
5048                // End of range is two expected breaks past the start position.
5049                int endContext = i + 1;
5050                int ci;
5051                for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
5052                    for (;;) {
5053                        if (endContext >= testText.length()) {break;}
5054                        if (expectedBreaks[endContext-1] != 0) {
5055                            if (count == 0) break;
5056                            count --;
5057                        }
5058                        endContext ++;
5059                    }
5060                }
5061
5062                // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
5063                UnicodeString errorText = "<data>";
5064                /***if (strcmp(errorType, "next()") == 0) {
5065                    startContext = 0;
5066                    endContext = testText.length();
5067
5068                    printStringBreaks(testText, expected, expectedCount);
5069                }***/
5070
5071                for (ci=startContext; ci<endContext;) {
5072                    UnicodeString hexChars("0123456789abcdef");
5073                    UChar32  c;
5074                    int      bn;
5075                    c = testText.char32At(ci);
5076                    if (ci == i) {
5077                        // This is the location of the error.
5078                        errorText.append("<?>");
5079                    } else if (expectedBreaks[ci] != 0) {
5080                        // This a non-error expected break position.
5081                        errorText.append("\\");
5082                    }
5083                    if (c < 0x10000) {
5084                        errorText.append("\\u");
5085                        for (bn=12; bn>=0; bn-=4) {
5086                            errorText.append(hexChars.charAt((c>>bn)&0xf));
5087                        }
5088                    } else {
5089                        errorText.append("\\U");
5090                        for (bn=28; bn>=0; bn-=4) {
5091                            errorText.append(hexChars.charAt((c>>bn)&0xf));
5092                        }
5093                    }
5094                    ci = testText.moveIndex32(ci, 1);
5095                }
5096                errorText.append("\\");
5097                errorText.append("</data>\n");
5098
5099                // Output the error
5100                char  charErrorTxt[500];
5101                UErrorCode status = U_ZERO_ERROR;
5102                errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
5103                charErrorTxt[sizeof(charErrorTxt)-1] = 0;
5104                errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
5105                    name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
5106                    errorType, seed, i, charErrorTxt);
5107                break;
5108            }
5109        }
5110
5111        loopCount++;
5112    }
5113#endif
5114}
5115
5116
5117//  Bug 5532.  UTF-8 based UText fails in dictionary code.
5118//             This test checks the initial patch,
5119//             which is to just keep it from crashing.  Correct word boundaries
5120//             await a proper fix to the dictionary code.
5121//
5122void RBBITest::TestBug5532(void)  {
5123   // Text includes a mixture of Thai and Latin.
5124   const unsigned char utf8Data[] = {
5125           0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
5126           0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
5127           0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
5128           0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
5129           0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
5130           0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
5131           0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
5132           0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
5133           0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
5134           0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
5135           0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
5136
5137    UErrorCode status = U_ZERO_ERROR;
5138    UText utext=UTEXT_INITIALIZER;
5139    utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
5140    TEST_ASSERT_SUCCESS(status);
5141
5142    BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
5143    TEST_ASSERT_SUCCESS(status);
5144    if (U_SUCCESS(status)) {
5145        bi->setText(&utext, status);
5146        TEST_ASSERT_SUCCESS(status);
5147
5148        int32_t breakCount = 0;
5149        int32_t previousBreak = -1;
5150        for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
5151            // For now, just make sure that the break iterator doesn't hang.
5152            TEST_ASSERT(previousBreak < bi->current());
5153            previousBreak = bi->current();
5154        }
5155        TEST_ASSERT(breakCount > 0);
5156    }
5157    delete bi;
5158    utext_close(&utext);
5159}
5160
5161
5162//
5163//  TestDebug    -  A place-holder test for debugging purposes.
5164//                  For putting in fragments of other tests that can be invoked
5165//                  for tracing  without a lot of unwanted extra stuff happening.
5166//
5167void RBBITest::TestDebug(void) {
5168#if 0
5169    UErrorCode   status = U_ZERO_ERROR;
5170    int pos = 0;
5171    int ruleStatus = 0;
5172
5173    RuleBasedBreakIterator* bi =
5174       // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
5175       // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
5176       (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
5177    UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
5178    // UnicodeString s("Aaa.  Bcd");
5179    s = s.unescape();
5180    bi->setText(s);
5181    UBool r = bi->isBoundary(8);
5182    printf("%s", r?"true":"false");
5183    return;
5184    pos = bi->last();
5185    do {
5186        // ruleStatus = bi->getRuleStatus();
5187        printf("%d\t%d\n", pos, ruleStatus);
5188        pos = bi->previous();
5189    } while (pos != BreakIterator::DONE);
5190#endif
5191}
5192
5193#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
5194