1/******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1999-2009, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6/************************************************************************ 7* Date Name Description 8* 12/15/99 Madhu Creation. 9* 01/12/2000 Madhu Updated for changed API and added new tests 10************************************************************************/ 11 12#include "unicode/utypes.h" 13 14#if !UCONFIG_NO_BREAK_ITERATION 15 16#include "unicode/utypes.h" 17#include "unicode/brkiter.h" 18#include "unicode/rbbi.h" 19#include "unicode/uchar.h" 20#include "unicode/utf16.h" 21#include "unicode/ucnv.h" 22#include "unicode/schriter.h" 23#include "unicode/uniset.h" 24#include "unicode/regex.h" // TODO: make conditional on regexp being built. 25#include "unicode/ustring.h" 26#include "unicode/utext.h" 27#include "intltest.h" 28#include "rbbitst.h" 29#include <string.h> 30#include "uvector.h" 31#include "uvectr32.h" 32#include "triedict.h" 33#include <string.h> 34#include <stdio.h> 35#include <stdlib.h> 36 37#define TEST_ASSERT(x) {if (!(x)) { \ 38 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 39 40#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 41 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 42 43 44//--------------------------------------------- 45// runIndexedTest 46//--------------------------------------------- 47 48void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 49{ 50 if (exec) logln("TestSuite RuleBasedBreakIterator: "); 51 52 switch (index) { 53 case 0: name = "TestBug4153072"; 54 if(exec) TestBug4153072(); break; 55 case 1: name = "TestJapaneseLineBreak"; 56 if(exec) TestJapaneseLineBreak(); break; 57 case 2: name = "TestStatusReturn"; 58 
if(exec) TestStatusReturn(); break; 59 case 3: name = "TestUnicodeFiles"; 60 if(exec) TestUnicodeFiles(); break; 61 case 4: name = "TestEmptyString"; 62 if(exec) TestEmptyString(); break; 63 64 case 5: name = "TestGetAvailableLocales"; 65 if(exec) TestGetAvailableLocales(); break; 66 67 case 6: name = "TestGetDisplayName"; 68 if(exec) TestGetDisplayName(); break; 69 70 case 7: name = "TestEndBehaviour"; 71 if(exec) TestEndBehaviour(); break; 72 case 8: name = "TestMixedThaiLineBreak"; 73 // BEGIN android-removed 74 // Disable all Thai breakiterator tests. 75 /* if(exec) TestMixedThaiLineBreak(); */ break; 76 // END android-removed 77 case 9: name = "TestThaiLineBreak"; 78 // BEGIN android-removed 79 // Disable all Thai breakiterator tests. 80 /* if(exec) TestThaiLineBreak(); */ break; 81 // END android-removed 82 case 10: name = "TestMaiyamok"; 83 // BEGIN android-removed 84 // Disable all Thai breakiterator tests. 85 /* if(exec) TestMaiyamok(); */ break; 86 // END android-removed 87 case 11: name = "TestWordBreaks"; 88 if(exec) TestWordBreaks(); break; 89 case 12: name = "TestWordBoundary"; 90 if(exec) TestWordBoundary(); break; 91 case 13: name = "TestLineBreaks"; 92 if(exec) TestLineBreaks(); break; 93 case 14: name = "TestSentBreaks"; 94 if(exec) TestSentBreaks(); break; 95 case 15: name = "TestExtended"; 96 if(exec) TestExtended(); break; 97 case 16: name = "TestMonkey"; 98 if(exec) { 99 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 100 TestMonkey(params); 101 #else 102 logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)"); 103 #endif 104 } 105 break; 106 case 17: name = "TestBug3818"; 107 // BEGIN android-removed 108 // Disable all Thai breakiterator tests. 
109 /* if(exec) TestBug3818(); */ break; 110 // END android-removed 111 case 18: name = "TestJapaneseWordBreak"; 112 if(exec) TestJapaneseWordBreak(); break; 113 case 19: name = "TestDebug"; 114 if(exec) TestDebug(); break; 115 case 20: name = "TestTrieDict"; 116 if(exec) TestTrieDict(); break; 117 case 21: name = "TestBug5775"; 118 if (exec) TestBug5775(); break; 119 case 22: name = "TestThaiBreaks"; 120 // BEGIN android-removed 121 // Disable all Thai breakiterator tests. 122 /* if (exec) TestThaiBreaks(); */ break; 123 // END android-removed 124 case 23: name = "TestTailoredBreaks"; 125 if (exec) TestTailoredBreaks(); break; 126 127 default: name = ""; break; //needed to end loop 128 } 129} 130 131 132//--------------------------------------------------------------------------- 133// 134// class BITestData Holds a set of Break iterator test data and results 135// Includes 136// - the string data to be broken 137// - a vector of the expected break positions. 138// - a vector of source line numbers for the data, 139// (to help see where errors occured.) 140// - The expected break tag values. 141// - Vectors of actual break positions and tag values. 142// - Functions for comparing actual with expected and 143// reporting errors. 144// 145//---------------------------------------------------------------------------- 146class BITestData { 147public: 148 UnicodeString fDataToBreak; 149 UVector fExpectedBreakPositions; 150 UVector fExpectedTags; 151 UVector fLineNum; 152 UVector fActualBreakPositions; // Test Results. 153 UVector fActualTags; 154 155 BITestData(UErrorCode &status); 156 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status); 157 void checkResults(const char *heading, RBBITest *test); 158 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx); 159 void clearResults(); 160}; 161 162// 163// Constructor. 
164// 165BITestData::BITestData(UErrorCode &status) 166: fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), 167 fActualTags(status) 168{ 169} 170 171// 172// addDataChunk. Add a section (non-breaking) piece if data to the test data. 173// The macro form collects the line number, which is helpful 174// when tracking down failures. 175// 176// A null data item is inserted at the start of each test's data 177// to put the starting zero into the data list. The position saved for 178// each non-null item is its ending position. 179// 180#define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); 181void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { 182 if (U_FAILURE(status)) {return;} 183 if (data != NULL) { 184 fDataToBreak.append(CharsToUnicodeString(data)); 185 } 186 fExpectedBreakPositions.addElement(fDataToBreak.length(), status); 187 fExpectedTags.addElement(tag, status); 188 fLineNum.addElement(lineNum, status); 189} 190 191 192// 193// checkResults. Compare the actual and expected break positions, report any differences. 194// 195void BITestData::checkResults(const char *heading, RBBITest *test) { 196 int32_t expectedIndex = 0; 197 int32_t actualIndex = 0; 198 199 for (;;) { 200 // If we've run through both the expected and actual results vectors, we're done. 201 // break out of the loop. 
202 if (expectedIndex >= fExpectedBreakPositions.size() && 203 actualIndex >= fActualBreakPositions.size()) { 204 break; 205 } 206 207 208 if (expectedIndex >= fExpectedBreakPositions.size()) { 209 err(heading, test, expectedIndex-1, actualIndex); 210 actualIndex++; 211 continue; 212 } 213 214 if (actualIndex >= fActualBreakPositions.size()) { 215 err(heading, test, expectedIndex, actualIndex-1); 216 expectedIndex++; 217 continue; 218 } 219 220 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { 221 err(heading, test, expectedIndex, actualIndex); 222 // Try to resync the positions of the indices, to avoid a rash of spurious erros. 223 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { 224 actualIndex++; 225 } else { 226 expectedIndex++; 227 } 228 continue; 229 } 230 231 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { 232 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d", 233 heading, fLineNum.elementAt(expectedIndex), 234 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); 235 } 236 237 actualIndex++; 238 expectedIndex++; 239 } 240} 241 242// 243// err - An error was found. Report it, along with information about where the 244// incorrectly broken test data appeared in the source file. 245// 246void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) 247{ 248 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); 249 int32_t actual = fActualBreakPositions.elementAti(actualIdx); 250 int32_t o = 0; 251 int32_t line = fLineNum.elementAti(expectedIdx); 252 if (expectedIdx > 0) { 253 // The line numbers are off by one because a premature break occurs somewhere 254 // within the previous item, rather than at the start of the current (expected) item. 
255 // We want to report the offset of the unexpected break from the start of 256 // this previous item. 257 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); 258 } 259 if (actual < expected) { 260 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); 261 } else { 262 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected); 263 } 264} 265 266 267void BITestData::clearResults() { 268 fActualBreakPositions.removeAllElements(); 269 fActualTags.removeAllElements(); 270} 271 272 273//----------------------------------------------------------------------------------- 274// 275// Cannned Test Characters 276// 277//----------------------------------------------------------------------------------- 278 279static const UChar cannedTestArray[] = { 280 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031, 281 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b, 282 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2, 283 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3, 284 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303, 285 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000, 286 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f, 287 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000 288}; 289 290static UnicodeString* cannedTestChars = 0; 291 292#define halfNA "\\u0928\\u094d\\u200d" 293#define halfSA 
"\\u0938\\u094d\\u200d" 294#define halfCHA "\\u091a\\u094d\\u200d" 295#define halfKA "\\u0915\\u094d\\u200d" 296#define deadTA "\\u0924\\u094d" 297 298//-------------------------------------------------------------------------------------- 299// 300// RBBITest constructor and destructor 301// 302//-------------------------------------------------------------------------------------- 303 304RBBITest::RBBITest() { 305 UnicodeString temp(cannedTestArray); 306 cannedTestChars = new UnicodeString(); 307 *cannedTestChars += (UChar)0x0000; 308 *cannedTestChars += temp; 309} 310 311 312RBBITest::~RBBITest() { 313 delete cannedTestChars; 314} 315 316 317static const int T_NUMBER = 100; 318static const int T_LETTER = 200; 319static const int T_H_OR_K = 300; 320static const int T_IDEO = 400; 321 322 323 324 325 326 327//-------------------------------------------------------------------- 328//Testing the BreakIterator for devanagari script 329//-------------------------------------------------------------------- 330 331#define deadRA "\\u0930\\u094d" /*deadform RA = devanagari RA + virama*/ 332#define deadPHA "\\u092b\\u094d" /*deadform PHA = devanagari PHA + virama*/ 333#define deadTTHA "\\u0920\\u094d" 334#define deadPA "\\u092a\\u094d" 335#define deadSA "\\u0938\\u094d" 336#define visarga "\\u0903" /*devanagari visarga looks like a english colon*/ 337 338 339 340 341 342 343//----------------------------------------------------------------------------------- 344// 345// Test for status {tag} return value from break rules. 346// TODO: a more thorough test. 
347// 348//----------------------------------------------------------------------------------- 349void RBBITest::TestStatusReturn() { 350 UnicodeString rulesString1("$Letters = [:L:];\n" 351 "$Numbers = [:N:];\n" 352 "$Letters+{1};\n" 353 "$Numbers+{2};\n" 354 "Help\\ {4}/me\\!;\n" 355 "[^$Letters $Numbers];\n" 356 "!.*;\n", -1, US_INV); 357 UnicodeString testString1 = "abc123..abc Help me Help me!"; 358 // 01234567890123456789012345678 359 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; 360 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; 361 362 UErrorCode status=U_ZERO_ERROR; 363 UParseError parseError; 364 365 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 366 if(U_FAILURE(status)) { 367 dataerrln("FAIL : in construction - %s", u_errorName(status)); 368 } else { 369 int32_t pos; 370 int32_t i = 0; 371 bi->setText(testString1); 372 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { 373 if (pos != bounds1[i]) { 374 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos); 375 break; 376 } 377 378 int tag = bi->getRuleStatus(); 379 if (tag != brkStatus[i]) { 380 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag); 381 break; 382 } 383 i++; 384 } 385 } 386 delete bi; 387} 388 389 390static void printStringBreaks(UnicodeString ustr, int expected[], 391 int expectedcount) 392{ 393 UErrorCode status = U_ZERO_ERROR; 394 char name[100]; 395 printf("code alpha extend alphanum type word sent line name\n"); 396 int j; 397 for (j = 0; j < ustr.length(); j ++) { 398 if (expectedcount > 0) { 399 int k; 400 for (k = 0; k < expectedcount; k ++) { 401 if (j == expected[k]) { 402 printf("------------------------------------------------ %d\n", 403 j); 404 } 405 } 406 } 407 UChar32 c = ustr.char32At(j); 408 if (c > 0xffff) { 409 j ++; 410 } 411 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 412 printf("%7x %5d %6d %8d %4s %4s %4s 
%4s %s\n", (int)c, 413 u_isUAlphabetic(c), 414 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 415 u_isalnum(c), 416 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 417 u_charType(c), 418 U_SHORT_PROPERTY_NAME), 419 u_getPropertyValueName(UCHAR_WORD_BREAK, 420 u_getIntPropertyValue(c, 421 UCHAR_WORD_BREAK), 422 U_SHORT_PROPERTY_NAME), 423 u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 424 u_getIntPropertyValue(c, 425 UCHAR_SENTENCE_BREAK), 426 U_SHORT_PROPERTY_NAME), 427 u_getPropertyValueName(UCHAR_LINE_BREAK, 428 u_getIntPropertyValue(c, 429 UCHAR_LINE_BREAK), 430 U_SHORT_PROPERTY_NAME), 431 name); 432 } 433} 434 435void RBBITest::TestThaiLineBreak() { 436 UErrorCode status = U_ZERO_ERROR; 437 BITestData thaiLineSelection(status); 438 439 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that 440 // represents elided letters at the end of a long word. It should be bound to 441 // the end of the word and not treated as an independent punctuation mark. 442 443 444 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 445 ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status); 446 ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status); 447 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status); 448 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status); 449// ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status); 450// ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status); 451 ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status); 452 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us 453 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status); 454 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status); 455 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, 
status); 456 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status); 457 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status); 458 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status); 459 460 // the one time where the paiyannoi occurs somewhere other than at the end 461 // of a word is in the Thai abbrevation for "etc.", which both begins and 462 // ends with a paiyannoi 463 ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status); 464 ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status); 465 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status); 466 467 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance( 468 Locale("th"), status); 469 if (U_FAILURE(status)) 470 { 471 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status)); 472 return; 473 } 474 475 generalIteratorTest(*e, thaiLineSelection); 476 delete e; 477} 478 479 480 481void RBBITest::TestMixedThaiLineBreak() 482{ 483 UErrorCode status = U_ZERO_ERROR; 484 BITestData thaiLineSelection(status); 485 486 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 487 488 489 // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters 490 // start 491 492 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status); 493 ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status); 494 ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status); 495 ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status); 496 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status); 497 ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status); 498 ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status); 499 ADD_DATACHUNK(thaiLineSelection, 
"\\u0E23\\u0E2D\\u0E1A ", 0, status); 500 ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status); 501 ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status); 502 ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status); 503 ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status); 504 ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status); 505 ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status); 506 ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status); 507 ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status); 508 509 // @suwit - end of changes 510 511 512 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status); 513 if (U_FAILURE(status)) 514 { 515 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status)); 516 return; 517 } 518 519 520 generalIteratorTest(*e, thaiLineSelection); 521 delete e; 522} 523 524 525void RBBITest::TestMaiyamok() 526{ 527 UErrorCode status = U_ZERO_ERROR; 528 BITestData thaiLineSelection(status); 529 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 530 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous 531 // word". 
Instead of appearing as a word unto itself, however, it's kept together 532 // with the word before it 533 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status); 534 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status); 535 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status); 536 ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status); 537 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status); 538 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status); 539 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status); 540 ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status); 541 ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status); 542 543 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance( 544 Locale("th"), status); 545 546 if (U_FAILURE(status)) 547 { 548 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status)); 549 return; 550 } 551 generalIteratorTest(*e, thaiLineSelection); 552 delete e; 553} 554 555 556 557void RBBITest::TestBug3818() { 558 UErrorCode status = U_ZERO_ERROR; 559 560 // Four Thai words... 
561 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 562 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 563 UnicodeString thaiStr(thaiWordData); 564 565 RuleBasedBreakIterator* bi = 566 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status); 567 if (U_FAILURE(status) || bi == NULL) { 568 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 569 return; 570 } 571 bi->setText(thaiStr); 572 573 int32_t startOfSecondWord = bi->following(1); 574 if (startOfSecondWord != 4) { 575 errln("Fail at file %s, line %d expected start of word at 4, got %d", 576 __FILE__, __LINE__, startOfSecondWord); 577 } 578 startOfSecondWord = bi->following(0); 579 if (startOfSecondWord != 4) { 580 errln("Fail at file %s, line %d expected start of word at 4, got %d", 581 __FILE__, __LINE__, startOfSecondWord); 582 } 583 delete bi; 584} 585 586 587void RBBITest::TestJapaneseWordBreak() { 588 UErrorCode status = U_ZERO_ERROR; 589 BITestData japaneseWordSelection(status); 590 591 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data 592 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2 593 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5 594 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7 595 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10 596 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11 597 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12 598 599 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance( 600 Locale("ja"), status); 601 if (U_FAILURE(status)) 602 { 603 errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n"); 604 return; 605 } 606 607 generalIteratorTest(*e, japaneseWordSelection); 608 
delete e; 609} 610 611void RBBITest::TestTrieDict() { 612 UErrorCode status = U_ZERO_ERROR; 613 614 // 615 // Open and read the test data file. 616 // 617 const char *testDataDirectory = IntlTest::getSourceTestData(status); 618 char testFileName[1000]; 619 if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) { 620 errln("Can't open test data. Path too long."); 621 return; 622 } 623 strcpy(testFileName, testDataDirectory); 624 strcat(testFileName, "riwords.txt"); 625 626 // Items needing deleting at the end 627 MutableTrieDictionary *mutableDict = NULL; 628 CompactTrieDictionary *compactDict = NULL; 629 UnicodeSet *breaks = NULL; 630 UChar *testFile = NULL; 631 StringEnumeration *enumer1 = NULL; 632 StringEnumeration *enumer2 = NULL; 633 MutableTrieDictionary *mutable2 = NULL; 634 StringEnumeration *cloneEnum = NULL; 635 CompactTrieDictionary *compact2 = NULL; 636 637 638 const UnicodeString *originalWord = NULL; 639 const UnicodeString *cloneWord = NULL; 640 UChar *current; 641 UChar *word; 642 UChar uc; 643 int32_t wordLen; 644 int32_t wordCount; 645 int32_t testCount; 646 647 int len; 648 testFile = ReadAndConvertFile(testFileName, len, NULL, status); 649 if (U_FAILURE(status)) { 650 goto cleanup; /* something went wrong, error already output */ 651 } 652 653 mutableDict = new MutableTrieDictionary(0x0E1C, status); 654 if (U_FAILURE(status)) { 655 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status)); 656 goto cleanup; 657 } 658 659 breaks = new UnicodeSet; 660 breaks->add(0x000A); // Line Feed 661 breaks->add(0x000D); // Carriage Return 662 breaks->add(0x2028); // Line Separator 663 breaks->add(0x2029); // Paragraph Separator 664 665 // Now add each non-comment line of the file as a word. 
666 current = testFile; 667 word = current; 668 uc = *current++; 669 wordLen = 0; 670 wordCount = 0; 671 672 while (uc) { 673 if (uc == 0x0023) { // #comment line, skip 674 while (uc && !breaks->contains(uc)) { 675 uc = *current++; 676 } 677 } 678 else while (uc && !breaks->contains(uc)) { 679 ++wordLen; 680 uc = *current++; 681 } 682 if (wordLen > 0) { 683 mutableDict->addWord(word, wordLen, status); 684 if (U_FAILURE(status)) { 685 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status)); 686 goto cleanup; 687 } 688 wordCount += 1; 689 } 690 691 // Find beginning of next line 692 while (uc && breaks->contains(uc)) { 693 uc = *current++; 694 } 695 word = current-1; 696 wordLen = 0; 697 } 698 699 if (wordCount < 50) { 700 errln("Word count (%d) unreasonably small\n", wordCount); 701 goto cleanup; 702 } 703 704 enumer1 = mutableDict->openWords(status); 705 if (U_FAILURE(status)) { 706 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status)); 707 goto cleanup; 708 } 709 710 testCount = 0; 711 if (wordCount != (testCount = enumer1->count(status))) { 712 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 713 testCount, wordCount, u_errorName(status)); 714 goto cleanup; 715 } 716 717 // Now compact it 718 compactDict = new CompactTrieDictionary(*mutableDict, status); 719 if (U_FAILURE(status)) { 720 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status)); 721 goto cleanup; 722 } 723 724 enumer2 = compactDict->openWords(status); 725 if (U_FAILURE(status)) { 726 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status)); 727 goto cleanup; 728 } 729 730 if (wordCount != (testCount = enumer2->count(status))) { 731 errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 732 testCount, wordCount, u_errorName(status)); 733 goto cleanup; 734 } 735 736 if (enumer1->getDynamicClassID() == 
enumer2->getDynamicClassID()) { 737 errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same"); 738 } 739 delete enumer1; 740 enumer1 = NULL; 741 delete enumer2; 742 enumer2 = NULL; 743 744 // Now un-compact it 745 mutable2 = compactDict->cloneMutable(status); 746 if (U_FAILURE(status)) { 747 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status)); 748 goto cleanup; 749 } 750 751 cloneEnum = mutable2->openWords(status); 752 if (U_FAILURE(status)) { 753 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status)); 754 goto cleanup; 755 } 756 757 if (wordCount != (testCount = cloneEnum->count(status))) { 758 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 759 testCount, wordCount, u_errorName(status)); 760 goto cleanup; 761 } 762 763 // Compact original dictionary to clone. Note that we can only compare the same kind of 764 // dictionary as the order of the enumerators is not guaranteed to be the same between 765 // different kinds 766 enumer1 = mutableDict->openWords(status); 767 if (U_FAILURE(status)) { 768 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status)); 769 goto cleanup; 770 } 771 772 originalWord = enumer1->snext(status); 773 cloneWord = cloneEnum->snext(status); 774 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { 775 if (*originalWord != *cloneWord) { 776 errln("Original and cloned MutableTrieDictionary word mismatch\n"); 777 goto cleanup; 778 } 779 originalWord = enumer1->snext(status); 780 cloneWord = cloneEnum->snext(status); 781 } 782 783 if (U_FAILURE(status)) { 784 errln("Enumeration failed: %s\n", u_errorName(status)); 785 goto cleanup; 786 } 787 788 if (originalWord != cloneWord) { 789 errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n"); 790 goto cleanup; 791 } 792 793 // Test the data copying constructor for 
CompactTrieDict, and the data access APIs. 794 compact2 = new CompactTrieDictionary(compactDict->data(), status); 795 if (U_FAILURE(status)) { 796 errln("CompactTrieDictionary(const void *,...) failed\n"); 797 goto cleanup; 798 } 799 800 if (compact2->dataSize() == 0) { 801 errln("CompactTrieDictionary->dataSize() == 0\n"); 802 goto cleanup; 803 } 804 805 // Now count the words via the second dictionary 806 delete enumer1; 807 enumer1 = compact2->openWords(status); 808 if (U_FAILURE(status)) { 809 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status)); 810 goto cleanup; 811 } 812 813 if (wordCount != (testCount = enumer1->count(status))) { 814 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n", 815 testCount, wordCount, u_errorName(status)); 816 goto cleanup; 817 } 818 819cleanup: 820 delete compactDict; 821 delete mutableDict; 822 delete breaks; 823 delete[] testFile; 824 delete enumer1; 825 delete mutable2; 826 delete cloneEnum; 827 delete compact2; 828} 829 830 831//---------------------------------------------------------------------------- 832// 833// generalIteratorTest Given a break iterator and a set of test data, 834// Run the tests and report the results. 835// 836//---------------------------------------------------------------------------- 837void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 838{ 839 840 bi.setText(td.fDataToBreak); 841 842 testFirstAndNext(bi, td); 843 844 testLastAndPrevious(bi, td); 845 846 testFollowing(bi, td); 847 testPreceding(bi, td); 848 testIsBoundary(bi, td); 849 doMultipleSelectionTest(bi, td); 850} 851 852 853// 854// testFirstAndNext. Run the iterator forwards in the obvious first(), next() 855// kind of loop. 
856// 857void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) 858{ 859 UErrorCode status = U_ZERO_ERROR; 860 int32_t p; 861 int32_t lastP = -1; 862 int32_t tag; 863 864 logln("Test first and next"); 865 bi.setText(td.fDataToBreak); 866 td.clearResults(); 867 868 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { 869 td.fActualBreakPositions.addElement(p, status); // Save result. 870 tag = bi.getRuleStatus(); 871 td.fActualTags.addElement(tag, status); 872 if (p <= lastP) { 873 // If the iterator is not making forward progress, stop. 874 // No need to raise an error here, it'll be detected in the normal check of results. 875 break; 876 } 877 lastP = p; 878 } 879 td.checkResults("testFirstAndNext", this); 880} 881 882 883// 884// TestLastAndPrevious. Run the iterator backwards, starting with last(). 885// 886void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) 887{ 888 UErrorCode status = U_ZERO_ERROR; 889 int32_t p; 890 int32_t lastP = 0x7ffffffe; 891 int32_t tag; 892 893 logln("Test last and previous"); 894 bi.setText(td.fDataToBreak); 895 td.clearResults(); 896 897 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { 898 // Save break position. Insert it at start of vector of results, shoving 899 // already-saved results further towards the end. 900 td.fActualBreakPositions.insertElementAt(p, 0, status); 901 // bi.previous(); // TODO: Why does this fix things up???? 902 // bi.next(); 903 tag = bi.getRuleStatus(); 904 td.fActualTags.insertElementAt(tag, 0, status); 905 if (p >= lastP) { 906 // If the iterator is not making progress, stop. 907 // No need to raise an error here, it'll be detected in the normal check of results. 
908 break; 909 } 910 lastP = p; 911 } 912 td.checkResults("testLastAndPrevious", this); 913} 914 915 916void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) 917{ 918 UErrorCode status = U_ZERO_ERROR; 919 int32_t p; 920 int32_t tag; 921 int32_t lastP = -2; // A value that will never be returned as a break position. 922 // cannot be -1; that is returned for DONE. 923 int i; 924 925 logln("testFollowing():"); 926 bi.setText(td.fDataToBreak); 927 td.clearResults(); 928 929 // Save the starting point, since we won't get that out of following. 930 p = bi.first(); 931 td.fActualBreakPositions.addElement(p, status); // Save result. 932 tag = bi.getRuleStatus(); 933 td.fActualTags.addElement(tag, status); 934 935 for (i = 0; i <= td.fDataToBreak.length()+1; i++) { 936 p = bi.following(i); 937 if (p != lastP) { 938 if (p == RuleBasedBreakIterator::DONE) { 939 break; 940 } 941 // We've reached a new break position. Save it. 942 td.fActualBreakPositions.addElement(p, status); // Save result. 943 tag = bi.getRuleStatus(); 944 td.fActualTags.addElement(tag, status); 945 lastP = p; 946 } 947 } 948 // The loop normally exits by means of the break in the middle. 949 // Make sure that the index was at the correct position for the break iterator to have 950 // returned DONE. 951 if (i != td.fDataToBreak.length()) { 952 errln("testFollowing(): iterator returned DONE prematurely."); 953 } 954 955 // Full check of all results. 
956 td.checkResults("testFollowing", this); 957} 958 959 960 961void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { 962 UErrorCode status = U_ZERO_ERROR; 963 int32_t p; 964 int32_t tag; 965 int32_t lastP = 0x7ffffffe; 966 int i; 967 968 logln("testPreceding():"); 969 bi.setText(td.fDataToBreak); 970 td.clearResults(); 971 972 p = bi.last(); 973 td.fActualBreakPositions.addElement(p, status); 974 tag = bi.getRuleStatus(); 975 td.fActualTags.addElement(tag, status); 976 977 for (i = td.fDataToBreak.length(); i>=-1; i--) { 978 p = bi.preceding(i); 979 if (p != lastP) { 980 if (p == RuleBasedBreakIterator::DONE) { 981 break; 982 } 983 // We've reached a new break position. Save it. 984 td.fActualBreakPositions.insertElementAt(p, 0, status); 985 lastP = p; 986 tag = bi.getRuleStatus(); 987 td.fActualTags.insertElementAt(tag, 0, status); 988 } 989 } 990 // The loop normally exits by means of the break in the middle. 991 // Make sure that the index was at the correct position for the break iterator to have 992 // returned DONE. 993 if (i != 0) { 994 errln("testPreceding(): iterator returned DONE prematurely."); 995 } 996 997 // Full check of all results. 998 td.checkResults("testPreceding", this); 999} 1000 1001 1002 1003void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { 1004 UErrorCode status = U_ZERO_ERROR; 1005 int i; 1006 int32_t tag; 1007 1008 logln("testIsBoundary():"); 1009 bi.setText(td.fDataToBreak); 1010 td.clearResults(); 1011 1012 for (i = 0; i <= td.fDataToBreak.length(); i++) { 1013 if (bi.isBoundary(i)) { 1014 td.fActualBreakPositions.addElement(i, status); // Save result. 
1015 tag = bi.getRuleStatus(); 1016 td.fActualTags.addElement(tag, status); 1017 } 1018 } 1019 td.checkResults("testIsBoundary: ", this); 1020} 1021 1022 1023 1024void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) 1025{ 1026 iterator.setText(td.fDataToBreak); 1027 1028 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); 1029 int32_t offset = iterator.first(); 1030 int32_t testOffset; 1031 int32_t count = 0; 1032 1033 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); 1034 1035 if (*testIterator != iterator) 1036 errln("clone() or operator!= failed: two clones compared unequal"); 1037 1038 do { 1039 testOffset = testIterator->first(); 1040 testOffset = testIterator->next(count); 1041 if (offset != testOffset) 1042 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 1043 1044 if (offset != RuleBasedBreakIterator::DONE) { 1045 count++; 1046 offset = iterator.next(); 1047 1048 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { 1049 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); 1050 if (count > 10000 || offset == -1) { 1051 errln("operator== failed too many times. Stopping test."); 1052 if (offset == -1) { 1053 errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 1054 } 1055 return; 1056 } 1057 } 1058 } 1059 } while (offset != RuleBasedBreakIterator::DONE); 1060 1061 // now do it backwards... 
1062 offset = iterator.last(); 1063 count = 0; 1064 1065 do { 1066 testOffset = testIterator->last(); 1067 testOffset = testIterator->next(count); // next() with a negative arg is same as previous 1068 if (offset != testOffset) 1069 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 1070 1071 if (offset != RuleBasedBreakIterator::DONE) { 1072 count--; 1073 offset = iterator.previous(); 1074 } 1075 } while (offset != RuleBasedBreakIterator::DONE); 1076 1077 delete testIterator; 1078} 1079 1080 1081//--------------------------------------------- 1082// 1083// other tests 1084// 1085//--------------------------------------------- 1086void RBBITest::TestEmptyString() 1087{ 1088 UnicodeString text = ""; 1089 UErrorCode status = U_ZERO_ERROR; 1090 1091 BITestData x(status); 1092 ADD_DATACHUNK(x, "", 0, status); // Break at start of data 1093 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 1094 if (U_FAILURE(status)) 1095 { 1096 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); 1097 return; 1098 } 1099 generalIteratorTest(*bi, x); 1100 delete bi; 1101} 1102 1103void RBBITest::TestGetAvailableLocales() 1104{ 1105 int32_t locCount = 0; 1106 const Locale* locList = BreakIterator::getAvailableLocales(locCount); 1107 1108 if (locCount == 0) 1109 dataerrln("getAvailableLocales() returned an empty list!"); 1110 // Just make sure that it's returning good memory. 
1111 int32_t i; 1112 for (i = 0; i < locCount; ++i) { 1113 logln(locList[i].getName()); 1114 } 1115} 1116 1117//Testing the BreakIterator::getDisplayName() function 1118void RBBITest::TestGetDisplayName() 1119{ 1120 UnicodeString result; 1121 1122 BreakIterator::getDisplayName(Locale::getUS(), result); 1123 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 1124 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 1125 + result); 1126 1127 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 1128 if (result != "French (France)") 1129 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 1130 + result); 1131} 1132/** 1133 * Test End Behaviour 1134 * @bug 4068137 1135 */ 1136void RBBITest::TestEndBehaviour() 1137{ 1138 UErrorCode status = U_ZERO_ERROR; 1139 UnicodeString testString("boo."); 1140 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 1141 if (U_FAILURE(status)) 1142 { 1143 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. 
- %s", u_errorName(status)); 1144 return; 1145 } 1146 wb->setText(testString); 1147 1148 if (wb->first() != 0) 1149 errln("Didn't get break at beginning of string."); 1150 if (wb->next() != 3) 1151 errln("Didn't get break before period in \"boo.\""); 1152 if (wb->current() != 4 && wb->next() != 4) 1153 errln("Didn't get break at end of string."); 1154 delete wb; 1155} 1156/* 1157 * @bug 4153072 1158 */ 1159void RBBITest::TestBug4153072() { 1160 UErrorCode status = U_ZERO_ERROR; 1161 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 1162 if (U_FAILURE(status)) 1163 { 1164 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 1165 return; 1166 } 1167 UnicodeString str("...Hello, World!..."); 1168 int32_t begin = 3; 1169 int32_t end = str.length() - 3; 1170 UBool onBoundary; 1171 1172 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 1173 iter->adoptText(textIterator); 1174 int index; 1175 // Note: with the switch to UText, there is no way to restrict the 1176 // iteration range to begin at an index other than zero. 1177 // String character iterators created with a non-zero bound are 1178 // treated by RBBI as being empty. 1179 for (index = -1; index < begin + 1; ++index) { 1180 onBoundary = iter->isBoundary(index); 1181 if (index == 0? !onBoundary : onBoundary) { 1182 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 1183 " and begin index = " + begin); 1184 } 1185 } 1186 delete iter; 1187} 1188 1189 1190// 1191// Test for problem reported by Ashok Matoria on 9 July 2007 1192// One.<kSoftHyphen><kSpace>Two. 1193// 1194// Sentence break at start (0) and then on calling next() it breaks at 1195// 'T' of "Two". Now, at this point if I do next() and 1196// then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 
1197// 1198void RBBITest::TestBug5775() { 1199 UErrorCode status = U_ZERO_ERROR; 1200 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1201 TEST_ASSERT_SUCCESS(status); 1202 if (U_FAILURE(status)) { 1203 return; 1204 } 1205// Check for status first for better handling of no data errors. 1206 TEST_ASSERT(bi != NULL); 1207 if (bi == NULL) { 1208 return; 1209 } 1210 1211 UnicodeString s("One.\\u00ad Two.", -1, US_INV); 1212 // 01234 56789 1213 s = s.unescape(); 1214 bi->setText(s); 1215 int pos = bi->next(); 1216 TEST_ASSERT(pos == 6); 1217 pos = bi->next(); 1218 TEST_ASSERT(pos == 10); 1219 pos = bi->previous(); 1220 TEST_ASSERT(pos == 6); 1221 delete bi; 1222} 1223 1224 1225 1226/** 1227 * Test Japanese Line Break 1228 * @bug 4095322 1229 */ 1230void RBBITest::TestJapaneseLineBreak() 1231{ 1232#if 0 1233 // Test needs updating some more... Dump it for now. 1234 1235 1236 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count 1237 // as opening and closing punctuation for line breaking. 1238 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars 1239 // from these tests. 
6-13-2002 1240 // 1241 UErrorCode status = U_ZERO_ERROR; 1242 UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c"); 1243 UnicodeString precedingChars = CharsToUnicodeString( 1244 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f"); 1245 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e"); 1246 UnicodeString followingChars = CharsToUnicodeString( 1247 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc" 1248 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7" 1249 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034" 1250 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034" 1251 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302"); 1252 BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status); 1253 1254 int32_t i; 1255 if (U_FAILURE(status)) 1256 { 1257 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n"); 1258 return; 1259 } 1260 1261 for (i = 0; i < precedingChars.length(); i++) { 1262 testString.setCharAt(1, precedingChars[i]); 1263 iter->setText(testString); 1264 int32_t j = iter->first(); 1265 if (j != 0) 1266 errln("ja line break failure: failed to start at 0"); 1267 j = iter->next(); 1268 if (j != 1) 1269 errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i]) 1270 + "' (" + ((int)(precedingChars[i])) + ")"); 1271 j = iter->next(); 1272 if (j != 3) 1273 errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i]) 1274 + "' (" + ((int)(precedingChars[i])) + ")"); 1275 } 1276 1277 for (i = 0; i < followingChars.length(); i++) { 1278 testString.setCharAt(1, followingChars[i]); 1279 iter->setText(testString); 1280 int j = iter->first(); 1281 if (j != 0) 1282 errln("ja line break failure: failed to start at 0"); 1283 j = iter->next(); 1284 
if (j != 2) 1285 errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i]) 1286 + "' (" + ((int)(followingChars[i])) + ")"); 1287 j = iter->next(); 1288 if (j != 3) 1289 errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i]) 1290 + "' (" + ((int)(followingChars[i])) + ")"); 1291 } 1292 delete iter; 1293#endif 1294} 1295 1296 1297//------------------------------------------------------------------------------ 1298// 1299// RBBITest::Extended Run RBBI Tests from an external test data file 1300// 1301//------------------------------------------------------------------------------ 1302 1303struct TestParams { 1304 BreakIterator *bi; 1305 UnicodeString dataToBreak; 1306 UVector32 *expectedBreaks; 1307 UVector32 *srcLine; 1308 UVector32 *srcCol; 1309}; 1310 1311void RBBITest::executeTest(TestParams *t) { 1312 int32_t bp; 1313 int32_t prevBP; 1314 int32_t i; 1315 1316 if (t->bi == NULL) { 1317 return; 1318 } 1319 1320 t->bi->setText(t->dataToBreak); 1321 // 1322 // Run the iterator forward 1323 // 1324 prevBP = -1; 1325 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 1326 if (prevBP == bp) { 1327 // Fail for lack of forward progress. 1328 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 1329 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1330 break; 1331 } 1332 1333 // Check that there were we didn't miss an expected break between the last one 1334 // and this one. 1335 for (i=prevBP+1; i<bp; i++) { 1336 if (t->expectedBreaks->elementAti(i) != 0) { 1337 int expected[] = {0, i}; 1338 printStringBreaks(t->dataToBreak, expected, 2); 1339 errln("Forward Iteration, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 1340 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1341 } 1342 } 1343 1344 // Check that the break we did find was expected 1345 if (t->expectedBreaks->elementAti(bp) == 0) { 1346 int expected[] = {0, bp}; 1347 printStringBreaks(t->dataToBreak, expected, 2); 1348 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1349 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1350 } else { 1351 // The break was expected. 1352 // Check that the {nnn} tag value is correct. 1353 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 1354 if (expectedTagVal == -1) { 1355 expectedTagVal = 0; 1356 } 1357 int32_t line = t->srcLine->elementAti(bp); 1358 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 1359 if (rs != expectedTagVal) { 1360 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 1361 " Actual, Expected status = %4d, %4d", 1362 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 1363 } 1364 } 1365 1366 1367 prevBP = bp; 1368 } 1369 1370 // Verify that there were no missed expected breaks after the last one found 1371 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) { 1372 if (t->expectedBreaks->elementAti(i) != 0) { 1373 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1374 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1375 } 1376 } 1377 1378 // 1379 // Run the iterator backwards, verify that the same breaks are found. 1380 // 1381 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen. 1382 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { 1383 if (prevBP == bp) { 1384 // Fail for lack of progress. 1385 errln("Reverse Iteration, no progress. 
Break Pos=%4d File line,col=%4d,%4d", 1386 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1387 break; 1388 } 1389 1390 // Check that there were we didn't miss an expected break between the last one 1391 // and this one. (UVector returns zeros for index out of bounds.) 1392 for (i=prevBP-1; i>bp; i--) { 1393 if (t->expectedBreaks->elementAti(i) != 0) { 1394 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1395 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1396 } 1397 } 1398 1399 // Check that the break we did find was expected 1400 if (t->expectedBreaks->elementAti(bp) == 0) { 1401 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1402 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1403 } else { 1404 // The break was expected. 1405 // Check that the {nnn} tag value is correct. 1406 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 1407 if (expectedTagVal == -1) { 1408 expectedTagVal = 0; 1409 } 1410 int line = t->srcLine->elementAti(bp); 1411 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 1412 if (rs != expectedTagVal) { 1413 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 1414 " Actual, Expected status = %4d, %4d", 1415 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 1416 } 1417 } 1418 1419 prevBP = bp; 1420 } 1421 1422 // Verify that there were no missed breaks prior to the last one found 1423 for (i=prevBP-1; i>=0; i--) { 1424 if (t->expectedBreaks->elementAti(i) != 0) { 1425 errln("Forward Itertion, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 1426 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1427 } 1428 } 1429} 1430 1431 1432void RBBITest::TestExtended() { 1433#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1434 UErrorCode status = U_ZERO_ERROR; 1435 Locale locale(""); 1436 1437 UnicodeString rules; 1438 TestParams tp; 1439 tp.bi = NULL; 1440 tp.expectedBreaks = new UVector32(status); 1441 tp.srcLine = new UVector32(status); 1442 tp.srcCol = new UVector32(status); 1443 1444 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status); 1445 if (U_FAILURE(status)) { 1446 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 1447 } 1448 1449 1450 // 1451 // Open and read the test data file. 1452 // 1453 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1454 char testFileName[1000]; 1455 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1456 errln("Can't open test data. 
Path too long."); 1457 return; 1458 } 1459 strcpy(testFileName, testDataDirectory); 1460 strcat(testFileName, "rbbitst.txt"); 1461 1462 int len; 1463 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1464 if (U_FAILURE(status)) { 1465 return; /* something went wrong, error already output */ 1466 } 1467 1468 1469 1470 1471 // 1472 // Put the test data into a UnicodeString 1473 // 1474 UnicodeString testString(FALSE, testFile, len); 1475 1476 enum EParseState{ 1477 PARSE_COMMENT, 1478 PARSE_TAG, 1479 PARSE_DATA, 1480 PARSE_NUM 1481 } 1482 parseState = PARSE_TAG; 1483 1484 EParseState savedState = PARSE_TAG; 1485 1486 static const UChar CH_LF = 0x0a; 1487 static const UChar CH_CR = 0x0d; 1488 static const UChar CH_HASH = 0x23; 1489 /*static const UChar CH_PERIOD = 0x2e;*/ 1490 static const UChar CH_LT = 0x3c; 1491 static const UChar CH_GT = 0x3e; 1492 static const UChar CH_BACKSLASH = 0x5c; 1493 static const UChar CH_BULLET = 0x2022; 1494 1495 int32_t lineNum = 1; 1496 int32_t colStart = 0; 1497 int32_t column = 0; 1498 int32_t charIdx = 0; 1499 1500 int32_t tagValue = 0; // The numeric value of a <nnn> tag. 
1501 1502 for (charIdx = 0; charIdx < len; ) { 1503 status = U_ZERO_ERROR; 1504 UChar c = testString.charAt(charIdx); 1505 charIdx++; 1506 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) { 1507 // treat CRLF as a unit 1508 c = CH_LF; 1509 charIdx++; 1510 } 1511 if (c == CH_LF || c == CH_CR) { 1512 lineNum++; 1513 colStart = charIdx; 1514 } 1515 column = charIdx - colStart + 1; 1516 1517 switch (parseState) { 1518 case PARSE_COMMENT: 1519 if (c == 0x0a || c == 0x0d) { 1520 parseState = savedState; 1521 } 1522 break; 1523 1524 case PARSE_TAG: 1525 { 1526 if (c == CH_HASH) { 1527 parseState = PARSE_COMMENT; 1528 savedState = PARSE_TAG; 1529 break; 1530 } 1531 if (u_isUWhiteSpace(c)) { 1532 break; 1533 } 1534 if (testString.compare(charIdx-1, 6, "<word>") == 0) { 1535 delete tp.bi; 1536 tp.bi = BreakIterator::createWordInstance(locale, status); 1537 charIdx += 5; 1538 break; 1539 } 1540 if (testString.compare(charIdx-1, 6, "<char>") == 0) { 1541 delete tp.bi; 1542 tp.bi = BreakIterator::createCharacterInstance(locale, status); 1543 charIdx += 5; 1544 break; 1545 } 1546 if (testString.compare(charIdx-1, 6, "<line>") == 0) { 1547 delete tp.bi; 1548 tp.bi = BreakIterator::createLineInstance(locale, status); 1549 charIdx += 5; 1550 break; 1551 } 1552 if (testString.compare(charIdx-1, 6, "<sent>") == 0) { 1553 delete tp.bi; 1554 tp.bi = NULL; 1555 tp.bi = BreakIterator::createSentenceInstance(locale, status); 1556 charIdx += 5; 1557 break; 1558 } 1559 if (testString.compare(charIdx-1, 7, "<title>") == 0) { 1560 delete tp.bi; 1561 tp.bi = BreakIterator::createTitleInstance(locale, status); 1562 charIdx += 6; 1563 break; 1564 } 1565 1566 // <locale loc_name> 1567 localeMatcher.reset(testString); 1568 if (localeMatcher.lookingAt(charIdx-1, status)) { 1569 UnicodeString localeName = localeMatcher.group(1, status); 1570 char localeName8[100]; 1571 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 1572 locale = 
Locale::createFromName(localeName8); 1573 charIdx += localeMatcher.group(0, status).length(); 1574 TEST_ASSERT_SUCCESS(status); 1575 break; 1576 } 1577 if (testString.compare(charIdx-1, 6, "<data>") == 0) { 1578 parseState = PARSE_DATA; 1579 charIdx += 5; 1580 tp.dataToBreak = ""; 1581 tp.expectedBreaks->removeAllElements(); 1582 tp.srcCol ->removeAllElements(); 1583 tp.srcLine->removeAllElements(); 1584 break; 1585 } 1586 1587 errln("line %d: Tag expected in test file.", lineNum); 1588 parseState = PARSE_COMMENT; 1589 savedState = PARSE_DATA; 1590 goto end_test; // Stop the test. 1591 } 1592 break; 1593 1594 case PARSE_DATA: 1595 if (c == CH_BULLET) { 1596 int32_t breakIdx = tp.dataToBreak.length(); 1597 tp.expectedBreaks->setSize(breakIdx+1); 1598 tp.expectedBreaks->setElementAt(-1, breakIdx); 1599 tp.srcLine->setSize(breakIdx+1); 1600 tp.srcLine->setElementAt(lineNum, breakIdx); 1601 tp.srcCol ->setSize(breakIdx+1); 1602 tp.srcCol ->setElementAt(column, breakIdx); 1603 break; 1604 } 1605 1606 if (testString.compare(charIdx-1, 7, "</data>") == 0) { 1607 // Add final entry to mappings from break location to source file position. 1608 // Need one extra because last break position returned is after the 1609 // last char in the data, not at the last char. 1610 tp.srcLine->addElement(lineNum, status); 1611 tp.srcCol ->addElement(column, status); 1612 1613 parseState = PARSE_TAG; 1614 charIdx += 6; 1615 1616 // RUN THE TEST! 1617 executeTest(&tp); 1618 break; 1619 } 1620 1621 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { 1622 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 1623 // Get the code point from the name and insert it into the test data. 1624 // (Damn, no API takes names in Unicode !!! 
1625 // we've got to take it back to char *) 1626 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx); 1627 int32_t nameLength = nameEndIdx - (charIdx+2); 1628 char charNameBuf[200]; 1629 UChar32 theChar = -1; 1630 if (nameEndIdx != -1) { 1631 UErrorCode status = U_ZERO_ERROR; 1632 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 1633 charNameBuf[sizeof(charNameBuf)-1] = 0; 1634 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 1635 if (U_FAILURE(status)) { 1636 theChar = -1; 1637 } 1638 } 1639 if (theChar == -1) { 1640 errln("Error in named character in test file at line %d, col %d", 1641 lineNum, column); 1642 } else { 1643 // Named code point was recognized. Insert it 1644 // into the test data. 1645 tp.dataToBreak.append(theChar); 1646 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1647 tp.srcLine->addElement(lineNum, status); 1648 tp.srcCol ->addElement(column, status); 1649 } 1650 } 1651 if (nameEndIdx > charIdx) { 1652 charIdx = nameEndIdx+1; 1653 1654 } 1655 break; 1656 } 1657 1658 1659 1660 1661 if (testString.compare(charIdx-1, 2, "<>") == 0) { 1662 charIdx++; 1663 int32_t breakIdx = tp.dataToBreak.length(); 1664 tp.expectedBreaks->setSize(breakIdx+1); 1665 tp.expectedBreaks->setElementAt(-1, breakIdx); 1666 tp.srcLine->setSize(breakIdx+1); 1667 tp.srcLine->setElementAt(lineNum, breakIdx); 1668 tp.srcCol ->setSize(breakIdx+1); 1669 tp.srcCol ->setElementAt(column, breakIdx); 1670 break; 1671 } 1672 1673 if (c == CH_LT) { 1674 tagValue = 0; 1675 parseState = PARSE_NUM; 1676 break; 1677 } 1678 1679 if (c == CH_HASH && column==3) { // TODO: why is column off so far? 1680 parseState = PARSE_COMMENT; 1681 savedState = PARSE_DATA; 1682 break; 1683 } 1684 1685 if (c == CH_BACKSLASH) { 1686 // Check for \ at end of line, a line continuation. 
1687 // Advance over (discard) the newline 1688 UChar32 cp = testString.char32At(charIdx); 1689 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) { 1690 // We have a CR LF 1691 // Need an extra increment of the input ptr to move over both of them 1692 charIdx++; 1693 } 1694 if (cp == CH_LF || cp == CH_CR) { 1695 lineNum++; 1696 colStart = charIdx; 1697 charIdx++; 1698 break; 1699 } 1700 1701 // Let unescape handle the back slash. 1702 cp = testString.unescapeAt(charIdx); 1703 if (cp != -1) { 1704 // Escape sequence was recognized. Insert the char 1705 // into the test data. 1706 tp.dataToBreak.append(cp); 1707 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1708 tp.srcLine->addElement(lineNum, status); 1709 tp.srcCol ->addElement(column, status); 1710 } 1711 break; 1712 } 1713 1714 1715 // Not a recognized backslash escape sequence. 1716 // Take the next char as a literal. 1717 // TODO: Should this be an error? 1718 c = testString.charAt(charIdx); 1719 charIdx = testString.moveIndex32(charIdx, 1); 1720 } 1721 1722 // Normal, non-escaped data char. 1723 tp.dataToBreak.append(c); 1724 1725 // Save the mapping from offset in the data to line/column numbers in 1726 // the original input file. Will be used for better error messages only. 1727 // If there's an expected break before this char, the slot in the mapping 1728 // vector will already be set for this char; don't overwrite it. 1729 if (tp.dataToBreak.length() > tp.srcLine->size()) { 1730 tp.srcLine->addElement(lineNum, status); 1731 tp.srcCol ->addElement(column, status); 1732 } 1733 break; 1734 1735 1736 case PARSE_NUM: 1737 // We are parsing an expected numeric tag value, like <1234>, 1738 // within a chunk of data. 1739 if (u_isUWhiteSpace(c)) { 1740 break; 1741 } 1742 1743 if (c == CH_GT) { 1744 // Finished the number. Add the info to the expected break data, 1745 // and switch parse state back to doing plain data. 
1746 parseState = PARSE_DATA; 1747 if (tagValue == 0) { 1748 tagValue = -1; 1749 } 1750 int32_t breakIdx = tp.dataToBreak.length(); 1751 tp.expectedBreaks->setSize(breakIdx+1); 1752 tp.expectedBreaks->setElementAt(tagValue, breakIdx); 1753 tp.srcLine->setSize(breakIdx+1); 1754 tp.srcLine->setElementAt(lineNum, breakIdx); 1755 tp.srcCol ->setSize(breakIdx+1); 1756 tp.srcCol ->setElementAt(column, breakIdx); 1757 break; 1758 } 1759 1760 if (u_isdigit(c)) { 1761 tagValue = tagValue*10 + u_charDigitValue(c); 1762 break; 1763 } 1764 1765 errln("Syntax Error in test file at line %d, col %d", 1766 lineNum, column); 1767 parseState = PARSE_COMMENT; 1768 goto end_test; // Stop the test 1769 break; 1770 } 1771 1772 1773 if (U_FAILURE(status)) { 1774 errln("ICU Error %s while parsing test file at line %d.", 1775 u_errorName(status), lineNum); 1776 status = U_ZERO_ERROR; 1777 goto end_test; // Stop the test 1778 } 1779 1780 } 1781 1782end_test: 1783 delete tp.bi; 1784 delete tp.expectedBreaks; 1785 delete tp.srcLine; 1786 delete tp.srcCol; 1787 delete [] testFile; 1788#endif 1789} 1790 1791void RBBITest::TestThaiBreaks() { 1792 UErrorCode status=U_ZERO_ERROR; 1793 BreakIterator* b; 1794 Locale locale = Locale("th"); 1795 int32_t p, index; 1796 UChar c[]= { 1797 0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, 1798 0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, 1799 0x0E16, 0x0E49, 0x0E33 1800 }; 1801 int32_t expectedWordResult[] = { 1802 2, 3, 6, 10, 11, 15, 17, 20, 22 1803 }; 1804 int32_t expectedLineResult[] = { 1805 3, 6, 11, 15, 17, 20, 22 1806 }; 1807 int32_t size = sizeof(c)/sizeof(UChar); 1808 UnicodeString text=UnicodeString(c); 1809 1810 b = BreakIterator::createWordInstance(locale, status); 1811 if (U_FAILURE(status)) { 1812 errcheckln(status, "Unable to create thai word break iterator. 
- %s", u_errorName(status)); 1813 return; 1814 } 1815 b->setText(text); 1816 p = index = 0; 1817 while ((p=b->next())!=BreakIterator::DONE && p < size) { 1818 if (p != expectedWordResult[index++]) { 1819 errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p); 1820 } 1821 } 1822 delete b; 1823 1824 b = BreakIterator::createLineInstance(locale, status); 1825 if (U_FAILURE(status)) { 1826 printf("Unable to create thai line break iterator.\n"); 1827 return; 1828 } 1829 b->setText(text); 1830 p = index = 0; 1831 while ((p=b->next())!=BreakIterator::DONE && p < size) { 1832 if (p != expectedLineResult[index++]) { 1833 errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p); 1834 } 1835 } 1836 1837 delete b; 1838} 1839 1840// UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX" 1841// Words don't include colon or period (cldrbug #1969). 1842static const char posxWordText[] = "Can't have breaks in xx:yy or struct.field for CS-types."; 1843static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 }; 1844static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 }; 1845 1846// UBreakIteratorType UBRK_WORD, Locale "ja" 1847// Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009). 
// The text is stored with \uXXXX escapes and is unescaped (CharsToUnicodeString)
// by TBTest before use.  The "T" offsets are the breaks expected from the
// tailored (locale-specific) iterator, the "R" offsets those expected from the
// root iterator; see how tbItems pairs them up.
static const char    jaWordText[]     = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
                                        "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 };
static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };

// UBreakIteratorType UBRK_SENTENCE, Locale "el"
// Add break after Greek question mark (cldrbug #2069).
// (\u037E in the text below is the Greek question mark; the tailored offsets
//  contain two breaks, 8 and 14, that the root offsets do not.)
static const char    elSentText[]     = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
                                        "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 };
static const int32_t elSentROffsets[] = {        20, 27, 35, 36 };

// UBreakIteratorType UBRK_CHARACTER, Locale "th"
// Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
1862static const char thCharText[] = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 " 1863 "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) " 1864 "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 "; 1865static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 1866 12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28, 1867 29, 30, 32, 33, 35, 37, 38, 39, 40, 41 }; 1868static const int32_t thCharROffsets[] = { 1, 3, 5, 6, 7, 8, 9, 11, 1869 12, 13, 15, 17, 19, 20, 22, 24, 26, 27, 28, 1870 29, 32, 33, 35, 37, 38, 40, 41 }; 1871 1872typedef struct { 1873 UBreakIteratorType type; 1874 const char * locale; 1875 const char * escapedText; 1876 const int32_t * tailoredOffsets; 1877 int32_t tailoredOffsetsCount; 1878 const int32_t * rootOffsets; 1879 int32_t rootOffsetsCount; 1880} TailoredBreakItem; 1881 1882#define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0])) 1883 1884static const TailoredBreakItem tbItems[] = { 1885 { UBRK_WORD, "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) }, 1886 { UBRK_WORD, "ja", jaWordText, ARRAY_PTR_LEN(jaWordTOffsets), ARRAY_PTR_LEN(jaWordROffsets) }, 1887 { UBRK_SENTENCE, "el", elSentText, ARRAY_PTR_LEN(elSentTOffsets), ARRAY_PTR_LEN(elSentROffsets) }, 1888 { UBRK_CHARACTER, "th", thCharText, ARRAY_PTR_LEN(thCharTOffsets), ARRAY_PTR_LEN(thCharROffsets) }, 1889 { UBRK_CHARACTER, NULL, NULL, NULL,0, NULL,0 } // terminator 1890}; 1891 1892static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) { 1893 while (count-- > 0) { 1894 int writeCount; 1895 sprintf(buffer, /* buflen, */ " %d%n", *offsets++, &writeCount); /* wants to be snprintf */ 1896 buffer += writeCount; 1897 buflen -= writeCount; 1898 } 1899} 1900 1901enum { kMaxOffsetCount = 128 }; 1902 1903void RBBITest::TBTest(BreakIterator* brkitr, int type, const char 
*locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) { 1904 brkitr->setText( CharsToUnicodeString(escapedText) ); 1905 int32_t foundOffsets[kMaxOffsetCount]; 1906 int32_t offset, foundOffsetsCount = 0; 1907 // do forwards iteration test 1908 while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) { 1909 foundOffsets[foundOffsetsCount++] = offset; 1910 } 1911 if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) { 1912 // log error for forwards test 1913 char formatExpect[512], formatFound[512]; 1914 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets); 1915 formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets); 1916 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n", 1917 type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound); 1918 } else { 1919 // do backwards iteration test 1920 --foundOffsetsCount; // back off one from the end offset 1921 while ( foundOffsetsCount > 0 ) { 1922 offset = brkitr->previous(); 1923 if ( offset != foundOffsets[--foundOffsetsCount] ) { 1924 // log error for backwards test 1925 char formatExpect[512]; 1926 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets); 1927 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n", 1928 type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]); 1929 break; 1930 } 1931 } 1932 } 1933} 1934 1935void RBBITest::TestTailoredBreaks() { 1936 const TailoredBreakItem * tbItemPtr; 1937 Locale rootLocale = Locale("root"); 1938 for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) { 1939 Locale testLocale = Locale(tbItemPtr->locale); 1940 BreakIterator * tailoredBrkiter; 1941 
BreakIterator * rootBrkiter; 1942 UErrorCode status = U_ZERO_ERROR; 1943 switch (tbItemPtr->type) { 1944 case UBRK_CHARACTER: 1945 tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status); 1946 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status); 1947 break; 1948 case UBRK_WORD: 1949 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status); 1950 rootBrkiter = BreakIterator::createWordInstance(rootLocale, status); 1951 break; 1952 case UBRK_LINE: 1953 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status); 1954 rootBrkiter = BreakIterator::createLineInstance(rootLocale, status); 1955 break; 1956 case UBRK_SENTENCE: 1957 tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status); 1958 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status); 1959 break; 1960 default: 1961 status = U_UNSUPPORTED_ERROR; 1962 break; 1963 } 1964 if (U_FAILURE(status)) { 1965 errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status)); 1966 continue; 1967 } 1968 TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount); 1969 TBTest(rootBrkiter, (int)(tbItemPtr->type), "root", tbItemPtr->escapedText, tbItemPtr->rootOffsets, tbItemPtr->rootOffsetsCount); 1970 1971 delete rootBrkiter; 1972 delete tailoredBrkiter; 1973 } 1974} 1975 1976 1977//------------------------------------------------------------------------------- 1978// 1979// ReadAndConvertFile Read a text data file, convert it to UChars, and 1980// return the datain one big UChar * buffer, which the caller must delete. 1981// 1982// parameters: 1983// fileName: the name of the file, with no directory part. The test data directory 1984// is assumed. 1985// ulen an out parameter, receives the actual length (in UChars) of the file data. 
1986// encoding The file encoding. If the file contains a BOM, that will override the encoding 1987// specified here. The BOM, if it exists, will be stripped from the returned data. 1988// Pass NULL for the system default encoding. 1989// status 1990// returns: 1991// The file data, converted to UChar. 1992// The caller must delete this when done with 1993// delete [] theBuffer; 1994// 1995// TODO: This is a clone of RegexTest::ReadAndConvertFile. 1996// Move this function to some common place. 1997// 1998//-------------------------------------------------------------------------------- 1999UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 2000 UChar *retPtr = NULL; 2001 char *fileBuf = NULL; 2002 UConverter* conv = NULL; 2003 FILE *f = NULL; 2004 2005 ulen = 0; 2006 if (U_FAILURE(status)) { 2007 return retPtr; 2008 } 2009 2010 // 2011 // Open the file. 2012 // 2013 f = fopen(fileName, "rb"); 2014 if (f == 0) { 2015 dataerrln("Error opening test data file %s\n", fileName); 2016 status = U_FILE_ACCESS_ERROR; 2017 return NULL; 2018 } 2019 // 2020 // Read it in 2021 // 2022 int fileSize; 2023 int amt_read; 2024 2025 fseek( f, 0, SEEK_END); 2026 fileSize = ftell(f); 2027 fileBuf = new char[fileSize]; 2028 fseek(f, 0, SEEK_SET); 2029 amt_read = fread(fileBuf, 1, fileSize, f); 2030 if (amt_read != fileSize || fileSize <= 0) { 2031 errln("Error reading test data file."); 2032 goto cleanUpAndReturn; 2033 } 2034 2035 // 2036 // Look for a Unicode Signature (BOM) on the data just read 2037 // 2038 int32_t signatureLength; 2039 const char * fileBufC; 2040 const char* bomEncoding; 2041 2042 fileBufC = fileBuf; 2043 bomEncoding = ucnv_detectUnicodeSignature( 2044 fileBuf, fileSize, &signatureLength, &status); 2045 if(bomEncoding!=NULL ){ 2046 fileBufC += signatureLength; 2047 fileSize -= signatureLength; 2048 encoding = bomEncoding; 2049 } 2050 2051 // 2052 // Open a converter to take the rule file to UTF-16 2053 // 
2054 conv = ucnv_open(encoding, &status); 2055 if (U_FAILURE(status)) { 2056 goto cleanUpAndReturn; 2057 } 2058 2059 // 2060 // Convert the rules to UChar. 2061 // Preflight first to determine required buffer size. 2062 // 2063 ulen = ucnv_toUChars(conv, 2064 NULL, // dest, 2065 0, // destCapacity, 2066 fileBufC, 2067 fileSize, 2068 &status); 2069 if (status == U_BUFFER_OVERFLOW_ERROR) { 2070 // Buffer Overflow is expected from the preflight operation. 2071 status = U_ZERO_ERROR; 2072 2073 retPtr = new UChar[ulen+1]; 2074 ucnv_toUChars(conv, 2075 retPtr, // dest, 2076 ulen+1, 2077 fileBufC, 2078 fileSize, 2079 &status); 2080 } 2081 2082cleanUpAndReturn: 2083 fclose(f); 2084 delete []fileBuf; 2085 ucnv_close(conv); 2086 if (U_FAILURE(status)) { 2087 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 2088 delete retPtr; 2089 retPtr = 0; 2090 ulen = 0; 2091 }; 2092 return retPtr; 2093} 2094 2095 2096 2097//-------------------------------------------------------------------------------------------- 2098// 2099// Run tests from each of the boundary test data files distributed by the Unicode Consortium 2100// 2101//------------------------------------------------------------------------------------------- 2102void RBBITest::TestUnicodeFiles() { 2103 RuleBasedBreakIterator *bi; 2104 UErrorCode status = U_ZERO_ERROR; 2105 2106 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getDefault(), status); 2107 TEST_ASSERT_SUCCESS(status); 2108 if (U_SUCCESS(status)) { 2109 runUnicodeTestData("GraphemeBreakTest.txt", bi); 2110 } 2111 delete bi; 2112 2113 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status); 2114 TEST_ASSERT_SUCCESS(status); 2115 if (U_SUCCESS(status)) { 2116 runUnicodeTestData("WordBreakTest.txt", bi); 2117 } 2118 delete bi; 2119 2120 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 2121 TEST_ASSERT_SUCCESS(status); 2122 if 
(U_SUCCESS(status)) { 2123 runUnicodeTestData("SentenceBreakTest.txt", bi); 2124 } 2125 delete bi; 2126 2127 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 2128 TEST_ASSERT_SUCCESS(status); 2129 if (U_SUCCESS(status)) { 2130 runUnicodeTestData("LineBreakTest.txt", bi); 2131 } 2132 delete bi; 2133} 2134 2135 2136//-------------------------------------------------------------------------------------------- 2137// 2138// Run tests from one of the boundary test data files distributed by the Unicode Consortium 2139// 2140//------------------------------------------------------------------------------------------- 2141void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { 2142#if !UCONFIG_NO_REGULAR_EXPRESSIONS 2143 UErrorCode status = U_ZERO_ERROR; 2144 2145 // 2146 // Open and read the test data file, put it into a UnicodeString. 2147 // 2148 const char *testDataDirectory = IntlTest::getSourceTestData(status); 2149 char testFileName[1000]; 2150 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 2151 dataerrln("Can't open test data. Path too long."); 2152 return; 2153 } 2154 strcpy(testFileName, testDataDirectory); 2155 strcat(testFileName, fileName); 2156 2157 logln("Opening data file %s\n", fileName); 2158 2159 int len; 2160 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 2161 if (status != U_FILE_ACCESS_ERROR) { 2162 TEST_ASSERT_SUCCESS(status); 2163 TEST_ASSERT(testFile != NULL); 2164 } 2165 if (U_FAILURE(status) || testFile == NULL) { 2166 return; /* something went wrong, error already output */ 2167 } 2168 UnicodeString testFileAsString(TRUE, testFile, len); 2169 2170 // 2171 // Parse the test data file using a regular expression. 2172 // Each kind of token is recognized in its own capture group; what type of item was scanned 2173 // is identified by which group had a match. 
2174 // 2175 // Caputure Group # 1 2 3 4 5 2176 // Parses this item: divide x hex digits comment \n unrecognized \n 2177 // 2178 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 2179 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 2180 UnicodeString testString; 2181 UVector32 breakPositions(status); 2182 int lineNumber = 1; 2183 TEST_ASSERT_SUCCESS(status); 2184 if (U_FAILURE(status)) { 2185 return; 2186 } 2187 2188 // 2189 // Scan through each test case, building up the string to be broken in testString, 2190 // and the positions that should be boundaries in the breakPositions vector. 2191 // 2192 while (tokenMatcher.find()) { 2193 if (tokenMatcher.start(1, status) >= 0) { 2194 // Scanned a divide sign, indicating a break position in the test data. 2195 if (testString.length()>0) { 2196 breakPositions.addElement(testString.length(), status); 2197 } 2198 } 2199 else if (tokenMatcher.start(2, status) >= 0) { 2200 // Scanned an 'x', meaning no break at this position in the test data 2201 // Nothing to be done here. 2202 } 2203 else if (tokenMatcher.start(3, status) >= 0) { 2204 // Scanned Hex digits. Convert them to binary, append to the character data string. 2205 const UnicodeString &hexNumber = tokenMatcher.group(3, status); 2206 int length = hexNumber.length(); 2207 if (length<=8) { 2208 char buf[10]; 2209 hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 2210 UChar32 c = (UChar32)strtol(buf, NULL, 16); 2211 if (c<=0x10ffff) { 2212 testString.append(c); 2213 } else { 2214 errln("Error: Unicode Character value out of range. 
\'%s\', line %d.\n", 2215 fileName, lineNumber); 2216 } 2217 } else { 2218 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 2219 fileName, lineNumber); 2220 } 2221 } 2222 else if (tokenMatcher.start(4, status) >= 0) { 2223 // Scanned to end of a line, possibly skipping over a comment in the process. 2224 // If the line from the file contained test data, run the test now. 2225 // 2226 if (testString.length() > 0) { 2227 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 2228 } 2229 2230 // Clear out this test case. 2231 // The string and breakPositions vector will be refilled as the next 2232 // test case is parsed. 2233 testString.remove(); 2234 breakPositions.removeAllElements(); 2235 lineNumber++; 2236 } else { 2237 // Scanner catchall. Something unrecognized appeared on the line. 2238 char token[16]; 2239 UnicodeString uToken = tokenMatcher.group(0, status); 2240 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 2241 token[sizeof(token)-1] = 0; 2242 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token); 2243 2244 // Clean up, in preparation for continuing with the next line. 2245 testString.remove(); 2246 breakPositions.removeAllElements(); 2247 lineNumber++; 2248 } 2249 TEST_ASSERT_SUCCESS(status); 2250 if (U_FAILURE(status)) { 2251 break; 2252 } 2253 } 2254 2255 delete [] testFile; 2256 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 2257} 2258 2259//-------------------------------------------------------------------------------------------- 2260// 2261// checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 2262// test data files. Do only a simple, forward-only check - 2263// this test is mostly to check that ICU and the Unicode 2264// data agree with each other. 
2265// 2266//-------------------------------------------------------------------------------------------- 2267void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 2268 const UnicodeString &testString, // Text data to be broken 2269 UVector32 *breakPositions, // Positions where breaks should be found. 2270 RuleBasedBreakIterator *bi) { 2271 int32_t pos; // Break Position in the test string 2272 int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 2273 int32_t expectedPos; // Expected break position (index into test string) 2274 2275 bi->setText(testString); 2276 pos = bi->first(); 2277 pos = bi->next(); 2278 2279 while (pos != BreakIterator::DONE) { 2280 if (expectedI >= breakPositions->size()) { 2281 errln("Test file \"%s\", line %d, unexpected break found at position %d", 2282 testFileName, lineNumber, pos); 2283 break; 2284 } 2285 expectedPos = breakPositions->elementAti(expectedI); 2286 if (pos < expectedPos) { 2287 errln("Test file \"%s\", line %d, unexpected break found at position %d", 2288 testFileName, lineNumber, pos); 2289 break; 2290 } 2291 if (pos > expectedPos) { 2292 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 2293 testFileName, lineNumber, expectedPos); 2294 break; 2295 } 2296 pos = bi->next(); 2297 expectedI++; 2298 } 2299 2300 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) { 2301 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 2302 testFileName, lineNumber, breakPositions->elementAti(expectedI)); 2303 } 2304} 2305 2306 2307 2308#if !UCONFIG_NO_REGULAR_EXPRESSIONS 2309//--------------------------------------------------------------------------------------- 2310// 2311// classs RBBIMonkeyKind 2312// 2313// Monkey Test for Break Iteration 2314// Abstract interface class. Concrete derived classes independently 2315// implement the break rules for different iterator types. 
2316// 2317// The Monkey Test itself uses doesn't know which type of break iterator it is 2318// testing, but works purely in terms of the interface defined here. 2319// 2320//--------------------------------------------------------------------------------------- 2321class RBBIMonkeyKind { 2322public: 2323 // Return a UVector of UnicodeSets, representing the character classes used 2324 // for this type of iterator. 2325 virtual UVector *charClasses() = 0; 2326 2327 // Set the test text on which subsequent calls to next() will operate 2328 virtual void setText(const UnicodeString &s) = 0; 2329 2330 // Find the next break postion, starting from the prev break position, or from zero. 2331 // Return -1 after reaching end of string. 2332 virtual int32_t next(int32_t i) = 0; 2333 2334 virtual ~RBBIMonkeyKind(); 2335 UErrorCode deferredStatus; 2336 2337 2338protected: 2339 RBBIMonkeyKind(); 2340 2341private: 2342}; 2343 2344RBBIMonkeyKind::RBBIMonkeyKind() { 2345 deferredStatus = U_ZERO_ERROR; 2346} 2347 2348RBBIMonkeyKind::~RBBIMonkeyKind() { 2349} 2350 2351 2352//---------------------------------------------------------------------------------------- 2353// 2354// Random Numbers. Similar to standard lib rand() and srand() 2355// Not using library to 2356// 1. Get same results on all platforms. 2357// 2. Get access to current seed, to more easily reproduce failures. 2358// 2359//--------------------------------------------------------------------------------------- 2360static uint32_t m_seed = 1; 2361 2362static uint32_t m_rand() 2363{ 2364 m_seed = m_seed * 1103515245 + 12345; 2365 return (uint32_t)(m_seed/65536) % 32768; 2366} 2367 2368 2369//------------------------------------------------------------------------------------------ 2370// 2371// class RBBICharMonkey Character (Grapheme Cluster) specific implementation 2372// of RBBIMonkeyKind. 
2373// 2374//------------------------------------------------------------------------------------------ 2375class RBBICharMonkey: public RBBIMonkeyKind { 2376public: 2377 RBBICharMonkey(); 2378 virtual ~RBBICharMonkey(); 2379 virtual UVector *charClasses(); 2380 virtual void setText(const UnicodeString &s); 2381 virtual int32_t next(int32_t i); 2382private: 2383 UVector *fSets; 2384 2385 UnicodeSet *fCRLFSet; 2386 UnicodeSet *fControlSet; 2387 UnicodeSet *fExtendSet; 2388 UnicodeSet *fPrependSet; 2389 UnicodeSet *fSpacingSet; 2390 UnicodeSet *fLSet; 2391 UnicodeSet *fVSet; 2392 UnicodeSet *fTSet; 2393 UnicodeSet *fLVSet; 2394 UnicodeSet *fLVTSet; 2395 UnicodeSet *fHangulSet; 2396 UnicodeSet *fAnySet; 2397 2398 const UnicodeString *fText; 2399}; 2400 2401 2402RBBICharMonkey::RBBICharMonkey() { 2403 UErrorCode status = U_ZERO_ERROR; 2404 2405 fText = NULL; 2406 2407 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status); 2408 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status); 2409 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status); 2410 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status); 2411 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status); 2412 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status); 2413 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status); 2414 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status); 2415 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status); 2416 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status); 2417 fHangulSet = new UnicodeSet(); 2418 fHangulSet->addAll(*fLSet); 2419 fHangulSet->addAll(*fVSet); 2420 
fHangulSet->addAll(*fTSet); 2421 fHangulSet->addAll(*fLVSet); 2422 fHangulSet->addAll(*fLVTSet); 2423 fAnySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status); 2424 2425 fSets = new UVector(status); 2426 fSets->addElement(fCRLFSet, status); 2427 fSets->addElement(fControlSet, status); 2428 fSets->addElement(fExtendSet, status); 2429 fSets->addElement(fPrependSet, status); 2430 fSets->addElement(fSpacingSet, status); 2431 fSets->addElement(fHangulSet, status); 2432 fSets->addElement(fAnySet, status); 2433 if (U_FAILURE(status)) { 2434 deferredStatus = status; 2435 } 2436} 2437 2438 2439void RBBICharMonkey::setText(const UnicodeString &s) { 2440 fText = &s; 2441} 2442 2443 2444 2445int32_t RBBICharMonkey::next(int32_t prevPos) { 2446 int p0, p1, p2, p3; // Indices of the significant code points around the 2447 // break position being tested. The candidate break 2448 // location is before p2. 2449 2450 int breakPos = -1; 2451 2452 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2453 2454 if (U_FAILURE(deferredStatus)) { 2455 return -1; 2456 } 2457 2458 // Previous break at end of string. return DONE. 2459 if (prevPos >= fText->length()) { 2460 return -1; 2461 } 2462 p0 = p1 = p2 = p3 = prevPos; 2463 c3 = fText->char32At(prevPos); 2464 c0 = c1 = c2 = 0; 2465 2466 // Loop runs once per "significant" character position in the input text. 2467 for (;;) { 2468 // Move all of the positions forward in the input string. 2469 p0 = p1; c0 = c1; 2470 p1 = p2; c1 = c2; 2471 p2 = p3; c2 = c3; 2472 2473 // Advancd p3 by one codepoint 2474 p3 = fText->moveIndex32(p3, 1); 2475 c3 = fText->char32At(p3); 2476 2477 if (p1 == p2) { 2478 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2479 continue; 2480 } 2481 if (p2 == fText->length()) { 2482 // Reached end of string. Always a break position. 
2483 break; 2484 } 2485 2486 // Rule GB3 CR x LF 2487 // No Extend or Format characters may appear between the CR and LF, 2488 // which requires the additional check for p2 immediately following p1. 2489 // 2490 if (c1==0x0D && c2==0x0A && p1==(p2-1)) { 2491 continue; 2492 } 2493 2494 // Rule (GB4). ( Control | CR | LF ) <break> 2495 if (fControlSet->contains(c1) || 2496 c1 == 0x0D || 2497 c1 == 0x0A) { 2498 break; 2499 } 2500 2501 // Rule (GB5) <break> ( Control | CR | LF ) 2502 // 2503 if (fControlSet->contains(c2) || 2504 c2 == 0x0D || 2505 c2 == 0x0A) { 2506 break; 2507 } 2508 2509 2510 // Rule (GB6) L x ( L | V | LV | LVT ) 2511 if (fLSet->contains(c1) && 2512 (fLSet->contains(c2) || 2513 fVSet->contains(c2) || 2514 fLVSet->contains(c2) || 2515 fLVTSet->contains(c2))) { 2516 continue; 2517 } 2518 2519 // Rule (GB7) ( LV | V ) x ( V | T ) 2520 if ((fLVSet->contains(c1) || fVSet->contains(c1)) && 2521 (fVSet->contains(c2) || fTSet->contains(c2))) { 2522 continue; 2523 } 2524 2525 // Rule (GB8) ( LVT | T) x T 2526 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) && 2527 fTSet->contains(c2)) { 2528 continue; 2529 } 2530 2531 // Rule (GB9) Numeric x ALetter 2532 if (fExtendSet->contains(c2)) { 2533 continue; 2534 } 2535 2536 // Rule (GB9a) x SpacingMark 2537 if (fSpacingSet->contains(c2)) { 2538 continue; 2539 } 2540 2541 // Rule (GB9b) Prepend x 2542 if (fPrependSet->contains(c1)) { 2543 continue; 2544 } 2545 2546 // Rule (GB10) Any <break> Any 2547 break; 2548 } 2549 2550 breakPos = p2; 2551 return breakPos; 2552} 2553 2554 2555 2556UVector *RBBICharMonkey::charClasses() { 2557 return fSets; 2558} 2559 2560 2561RBBICharMonkey::~RBBICharMonkey() { 2562 delete fSets; 2563 delete fCRLFSet; 2564 delete fControlSet; 2565 delete fExtendSet; 2566 delete fPrependSet; 2567 delete fSpacingSet; 2568 delete fLSet; 2569 delete fVSet; 2570 delete fTSet; 2571 delete fLVSet; 2572 delete fLVTSet; 2573 delete fHangulSet; 2574 delete fAnySet; 2575} 2576 
2577//------------------------------------------------------------------------------------------ 2578// 2579// class RBBIWordMonkey Word Break specific implementation 2580// of RBBIMonkeyKind. 2581// 2582//------------------------------------------------------------------------------------------ 2583class RBBIWordMonkey: public RBBIMonkeyKind { 2584public: 2585 RBBIWordMonkey(); 2586 virtual ~RBBIWordMonkey(); 2587 virtual UVector *charClasses(); 2588 virtual void setText(const UnicodeString &s); 2589 virtual int32_t next(int32_t i); 2590private: 2591 UVector *fSets; 2592 2593 UnicodeSet *fCRSet; 2594 UnicodeSet *fLFSet; 2595 UnicodeSet *fNewlineSet; 2596 UnicodeSet *fKatakanaSet; 2597 UnicodeSet *fALetterSet; 2598 UnicodeSet *fMidNumLetSet; 2599 UnicodeSet *fMidLetterSet; 2600 UnicodeSet *fMidNumSet; 2601 UnicodeSet *fNumericSet; 2602 UnicodeSet *fFormatSet; 2603 UnicodeSet *fOtherSet; 2604 UnicodeSet *fExtendSet; 2605 UnicodeSet *fExtendNumLetSet; 2606 2607 RegexMatcher *fMatcher; 2608 2609 const UnicodeString *fText; 2610}; 2611 2612 2613RBBIWordMonkey::RBBIWordMonkey() 2614{ 2615 UErrorCode status = U_ZERO_ERROR; 2616 2617 fSets = new UVector(status); 2618 2619 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); 2620 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); 2621 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); 2622 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 2623 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); 2624 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); 2625 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); 2626 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); 2627 fNumericSet = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); 2628 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); 2629 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); 2630 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); 2631 2632 fOtherSet = new UnicodeSet(); 2633 if(U_FAILURE(status)) { 2634 deferredStatus = status; 2635 return; 2636 } 2637 2638 fOtherSet->complement(); 2639 fOtherSet->removeAll(*fCRSet); 2640 fOtherSet->removeAll(*fLFSet); 2641 fOtherSet->removeAll(*fNewlineSet); 2642 fOtherSet->removeAll(*fKatakanaSet); 2643 fOtherSet->removeAll(*fALetterSet); 2644 fOtherSet->removeAll(*fMidLetterSet); 2645 fOtherSet->removeAll(*fMidNumSet); 2646 fOtherSet->removeAll(*fNumericSet); 2647 fOtherSet->removeAll(*fExtendNumLetSet); 2648 fOtherSet->removeAll(*fFormatSet); 2649 fOtherSet->removeAll(*fExtendSet); 2650 // Inhibit dictionary characters from being tested at all. 
2651 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); 2652 2653 fSets->addElement(fCRSet, status); 2654 fSets->addElement(fLFSet, status); 2655 fSets->addElement(fNewlineSet, status); 2656 fSets->addElement(fALetterSet, status); 2657 fSets->addElement(fKatakanaSet, status); 2658 fSets->addElement(fMidLetterSet, status); 2659 fSets->addElement(fMidNumLetSet, status); 2660 fSets->addElement(fMidNumSet, status); 2661 fSets->addElement(fNumericSet, status); 2662 fSets->addElement(fFormatSet, status); 2663 fSets->addElement(fExtendSet, status); 2664 fSets->addElement(fOtherSet, status); 2665 fSets->addElement(fExtendNumLetSet, status); 2666 2667 if (U_FAILURE(status)) { 2668 deferredStatus = status; 2669 } 2670} 2671 2672void RBBIWordMonkey::setText(const UnicodeString &s) { 2673 fText = &s; 2674} 2675 2676 2677int32_t RBBIWordMonkey::next(int32_t prevPos) { 2678 int p0, p1, p2, p3; // Indices of the significant code points around the 2679 // break position being tested. The candidate break 2680 // location is before p2. 2681 2682 int breakPos = -1; 2683 2684 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2685 2686 if (U_FAILURE(deferredStatus)) { 2687 return -1; 2688 } 2689 2690 // Prev break at end of string. return DONE. 2691 if (prevPos >= fText->length()) { 2692 return -1; 2693 } 2694 p0 = p1 = p2 = p3 = prevPos; 2695 c3 = fText->char32At(prevPos); 2696 c0 = c1 = c2 = 0; 2697 2698 // Loop runs once per "significant" character position in the input text. 2699 for (;;) { 2700 // Move all of the positions forward in the input string. 2701 p0 = p1; c0 = c1; 2702 p1 = p2; c1 = c2; 2703 p2 = p3; c2 = c3; 2704 2705 // Advancd p3 by X(Extend | Format)* Rule 4 2706 // But do not advance over Extend & Format following a new line. 
(Unicode 5.1 change) 2707 do { 2708 p3 = fText->moveIndex32(p3, 1); 2709 c3 = fText->char32At(p3); 2710 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2711 break; 2712 }; 2713 } 2714 while (fFormatSet->contains(c3) || fExtendSet->contains(c3)); 2715 2716 2717 if (p1 == p2) { 2718 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2719 continue; 2720 } 2721 if (p2 == fText->length()) { 2722 // Reached end of string. Always a break position. 2723 break; 2724 } 2725 2726 // Rule (3) CR x LF 2727 // No Extend or Format characters may appear between the CR and LF, 2728 // which requires the additional check for p2 immediately following p1. 2729 // 2730 if (c1==0x0D && c2==0x0A) { 2731 continue; 2732 } 2733 2734 // Rule (3a) Break before and after newlines (including CR and LF) 2735 // 2736 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { 2737 break; 2738 }; 2739 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2740 break; 2741 }; 2742 2743 // Rule (5). 
ALetter x ALetter 2744 if (fALetterSet->contains(c1) && 2745 fALetterSet->contains(c2)) { 2746 continue; 2747 } 2748 2749 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter 2750 // 2751 if ( fALetterSet->contains(c1) && 2752 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) && 2753 fALetterSet->contains(c3)) { 2754 continue; 2755 } 2756 2757 2758 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter 2759 if (fALetterSet->contains(c0) && 2760 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1)) && 2761 fALetterSet->contains(c2)) { 2762 continue; 2763 } 2764 2765 // Rule (8) Numeric x Numeric 2766 if (fNumericSet->contains(c1) && 2767 fNumericSet->contains(c2)) { 2768 continue; 2769 } 2770 2771 // Rule (9) ALetter x Numeric 2772 if (fALetterSet->contains(c1) && 2773 fNumericSet->contains(c2)) { 2774 continue; 2775 } 2776 2777 // Rule (10) Numeric x ALetter 2778 if (fNumericSet->contains(c1) && 2779 fALetterSet->contains(c2)) { 2780 continue; 2781 } 2782 2783 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric 2784 if (fNumericSet->contains(c0) && 2785 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1)) && 2786 fNumericSet->contains(c2)) { 2787 continue; 2788 } 2789 2790 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric 2791 if (fNumericSet->contains(c1) && 2792 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2)) && 2793 fNumericSet->contains(c3)) { 2794 continue; 2795 } 2796 2797 // Rule (13) Katakana x Katakana 2798 if (fKatakanaSet->contains(c1) && 2799 fKatakanaSet->contains(c2)) { 2800 continue; 2801 } 2802 2803 // Rule 13a 2804 if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) || 2805 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && 2806 fExtendNumLetSet->contains(c2)) { 2807 continue; 2808 } 2809 2810 // Rule 13b 2811 if (fExtendNumLetSet->contains(c1) && 2812 (fALetterSet->contains(c2) || fNumericSet->contains(c2) || 2813 fKatakanaSet->contains(c2))) { 2814 continue; 2815 } 2816 2817 // Rule 14. 
Break found here. 2818 break; 2819 } 2820 2821 breakPos = p2; 2822 return breakPos; 2823} 2824 2825 2826UVector *RBBIWordMonkey::charClasses() { 2827 return fSets; 2828} 2829 2830 2831RBBIWordMonkey::~RBBIWordMonkey() { 2832 delete fSets; 2833 delete fCRSet; 2834 delete fLFSet; 2835 delete fNewlineSet; 2836 delete fKatakanaSet; 2837 delete fALetterSet; 2838 delete fMidNumLetSet; 2839 delete fMidLetterSet; 2840 delete fMidNumSet; 2841 delete fNumericSet; 2842 delete fFormatSet; 2843 delete fExtendSet; 2844 delete fExtendNumLetSet; 2845 delete fOtherSet; 2846} 2847 2848 2849 2850 2851//------------------------------------------------------------------------------------------ 2852// 2853// class RBBISentMonkey Sentence Break specific implementation 2854// of RBBIMonkeyKind. 2855// 2856//------------------------------------------------------------------------------------------ 2857class RBBISentMonkey: public RBBIMonkeyKind { 2858public: 2859 RBBISentMonkey(); 2860 virtual ~RBBISentMonkey(); 2861 virtual UVector *charClasses(); 2862 virtual void setText(const UnicodeString &s); 2863 virtual int32_t next(int32_t i); 2864private: 2865 int moveBack(int posFrom); 2866 int moveForward(int posFrom); 2867 UChar32 cAt(int pos); 2868 2869 UVector *fSets; 2870 2871 UnicodeSet *fSepSet; 2872 UnicodeSet *fFormatSet; 2873 UnicodeSet *fSpSet; 2874 UnicodeSet *fLowerSet; 2875 UnicodeSet *fUpperSet; 2876 UnicodeSet *fOLetterSet; 2877 UnicodeSet *fNumericSet; 2878 UnicodeSet *fATermSet; 2879 UnicodeSet *fSContinueSet; 2880 UnicodeSet *fSTermSet; 2881 UnicodeSet *fCloseSet; 2882 UnicodeSet *fOtherSet; 2883 UnicodeSet *fExtendSet; 2884 2885 const UnicodeString *fText; 2886 2887}; 2888 2889RBBISentMonkey::RBBISentMonkey() 2890{ 2891 UErrorCode status = U_ZERO_ERROR; 2892 2893 fSets = new UVector(status); 2894 2895 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator 2896 // set and made into character classes of their own. 
For the monkey impl, 2897 // they remain in SEP, since Sep always appears with CR and LF in the rules. 2898 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); 2899 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); 2900 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); 2901 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); 2902 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); 2903 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); 2904 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); 2905 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); 2906 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); 2907 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); 2908 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); 2909 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); 2910 fOtherSet = new UnicodeSet(); 2911 2912 if(U_FAILURE(status)) { 2913 deferredStatus = status; 2914 return; 2915 } 2916 2917 fOtherSet->complement(); 2918 fOtherSet->removeAll(*fSepSet); 2919 fOtherSet->removeAll(*fFormatSet); 2920 fOtherSet->removeAll(*fSpSet); 2921 fOtherSet->removeAll(*fLowerSet); 2922 fOtherSet->removeAll(*fUpperSet); 2923 fOtherSet->removeAll(*fOLetterSet); 2924 fOtherSet->removeAll(*fNumericSet); 2925 fOtherSet->removeAll(*fATermSet); 2926 fOtherSet->removeAll(*fSContinueSet); 2927 fOtherSet->removeAll(*fSTermSet); 2928 fOtherSet->removeAll(*fCloseSet); 2929 fOtherSet->removeAll(*fExtendSet); 2930 2931 fSets->addElement(fSepSet, status); 2932 
fSets->addElement(fFormatSet, status); 2933 fSets->addElement(fSpSet, status); 2934 fSets->addElement(fLowerSet, status); 2935 fSets->addElement(fUpperSet, status); 2936 fSets->addElement(fOLetterSet, status); 2937 fSets->addElement(fNumericSet, status); 2938 fSets->addElement(fATermSet, status); 2939 fSets->addElement(fSContinueSet, status); 2940 fSets->addElement(fSTermSet, status); 2941 fSets->addElement(fCloseSet, status); 2942 fSets->addElement(fOtherSet, status); 2943 fSets->addElement(fExtendSet, status); 2944 2945 if (U_FAILURE(status)) { 2946 deferredStatus = status; 2947 } 2948} 2949 2950 2951 2952void RBBISentMonkey::setText(const UnicodeString &s) { 2953 fText = &s; 2954} 2955 2956UVector *RBBISentMonkey::charClasses() { 2957 return fSets; 2958} 2959 2960 2961// moveBack() Find the "significant" code point preceding the index i. 2962// Skips over ($Extend | $Format)* . 2963// 2964int RBBISentMonkey::moveBack(int i) { 2965 if (i <= 0) { 2966 return -1; 2967 } 2968 UChar32 c; 2969 int32_t j = i; 2970 do { 2971 j = fText->moveIndex32(j, -1); 2972 c = fText->char32At(j); 2973 } 2974 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); 2975 return j; 2976 2977 } 2978 2979 2980int RBBISentMonkey::moveForward(int i) { 2981 if (i>=fText->length()) { 2982 return fText->length(); 2983 } 2984 UChar32 c; 2985 int32_t j = i; 2986 do { 2987 j = fText->moveIndex32(j, 1); 2988 c = cAt(j); 2989 } 2990 while (fFormatSet->contains(c) || fExtendSet->contains(c)); 2991 return j; 2992} 2993 2994UChar32 RBBISentMonkey::cAt(int pos) { 2995 if (pos<0 || pos>=fText->length()) { 2996 return -1; 2997 } else { 2998 return fText->char32At(pos); 2999 } 3000} 3001 3002int32_t RBBISentMonkey::next(int32_t prevPos) { 3003 int p0, p1, p2, p3; // Indices of the significant code points around the 3004 // break position being tested. The candidate break 3005 // location is before p2. 
3006 3007 int breakPos = -1; 3008 3009 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 3010 UChar32 c; 3011 3012 if (U_FAILURE(deferredStatus)) { 3013 return -1; 3014 } 3015 3016 // Prev break at end of string. return DONE. 3017 if (prevPos >= fText->length()) { 3018 return -1; 3019 } 3020 p0 = p1 = p2 = p3 = prevPos; 3021 c3 = fText->char32At(prevPos); 3022 c0 = c1 = c2 = 0; 3023 3024 // Loop runs once per "significant" character position in the input text. 3025 for (;;) { 3026 // Move all of the positions forward in the input string. 3027 p0 = p1; c0 = c1; 3028 p1 = p2; c1 = c2; 3029 p2 = p3; c2 = c3; 3030 3031 // Advancd p3 by X(Extend | Format)* Rule 4 3032 p3 = moveForward(p3); 3033 c3 = cAt(p3); 3034 3035 // Rule (3) CR x LF 3036 if (c1==0x0d && c2==0x0a && p2==(p1+1)) { 3037 continue; 3038 } 3039 3040 // Rule (4). Sep <break> 3041 if (fSepSet->contains(c1)) { 3042 p2 = p1+1; // Separators don't combine with Extend or Format. 3043 break; 3044 } 3045 3046 if (p2 >= fText->length()) { 3047 // Reached end of string. Always a break position. 3048 break; 3049 } 3050 3051 if (p2 == prevPos) { 3052 // Still warming up the loop. (won't work with zero length strings, but we don't care) 3053 continue; 3054 } 3055 3056 // Rule (6). ATerm x Numeric 3057 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { 3058 continue; 3059 } 3060 3061 // Rule (7). Upper ATerm x Uppper 3062 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) { 3063 continue; 3064 } 3065 3066 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower 3067 // Note: STerm | ATerm are added to the negated part of the expression by a 3068 // note to the Unicode 5.0 documents. 
3069 int p8 = p1; 3070 while (fSpSet->contains(cAt(p8))) { 3071 p8 = moveBack(p8); 3072 } 3073 while (fCloseSet->contains(cAt(p8))) { 3074 p8 = moveBack(p8); 3075 } 3076 if (fATermSet->contains(cAt(p8))) { 3077 p8=p2; 3078 for (;;) { 3079 c = cAt(p8); 3080 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || 3081 fLowerSet->contains(c) || fSepSet->contains(c) || 3082 fATermSet->contains(c) || fSTermSet->contains(c)) { 3083 break; 3084 } 3085 p8 = moveForward(p8); 3086 } 3087 if (fLowerSet->contains(cAt(p8))) { 3088 continue; 3089 } 3090 } 3091 3092 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); 3093 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { 3094 p8 = p1; 3095 while (fSpSet->contains(cAt(p8))) { 3096 p8 = moveBack(p8); 3097 } 3098 while (fCloseSet->contains(cAt(p8))) { 3099 p8 = moveBack(p8); 3100 } 3101 c = cAt(p8); 3102 if (fSTermSet->contains(c) || fATermSet->contains(c)) { 3103 continue; 3104 } 3105 } 3106 3107 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) 3108 int p9 = p1; 3109 while (fCloseSet->contains(cAt(p9))) { 3110 p9 = moveBack(p9); 3111 } 3112 c = cAt(p9); 3113 if ((fSTermSet->contains(c) || fATermSet->contains(c))) { 3114 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { 3115 continue; 3116 } 3117 } 3118 3119 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) 3120 int p10 = p1; 3121 while (fSpSet->contains(cAt(p10))) { 3122 p10 = moveBack(p10); 3123 } 3124 while (fCloseSet->contains(cAt(p10))) { 3125 p10 = moveBack(p10); 3126 } 3127 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { 3128 if (fSpSet->contains(c2) || fSepSet->contains(c2)) { 3129 continue; 3130 } 3131 } 3132 3133 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? 
<break> 3134 int p11 = p1; 3135 if (fSepSet->contains(cAt(p11))) { 3136 p11 = moveBack(p11); 3137 } 3138 while (fSpSet->contains(cAt(p11))) { 3139 p11 = moveBack(p11); 3140 } 3141 while (fCloseSet->contains(cAt(p11))) { 3142 p11 = moveBack(p11); 3143 } 3144 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { 3145 break; 3146 } 3147 3148 // Rule (12) Any x Any 3149 continue; 3150 } 3151 breakPos = p2; 3152 return breakPos; 3153} 3154 3155RBBISentMonkey::~RBBISentMonkey() { 3156 delete fSets; 3157 delete fSepSet; 3158 delete fFormatSet; 3159 delete fSpSet; 3160 delete fLowerSet; 3161 delete fUpperSet; 3162 delete fOLetterSet; 3163 delete fNumericSet; 3164 delete fATermSet; 3165 delete fSContinueSet; 3166 delete fSTermSet; 3167 delete fCloseSet; 3168 delete fOtherSet; 3169 delete fExtendSet; 3170} 3171 3172 3173 3174//------------------------------------------------------------------------------------------- 3175// 3176// RBBILineMonkey 3177// 3178//------------------------------------------------------------------------------------------- 3179 3180class RBBILineMonkey: public RBBIMonkeyKind { 3181public: 3182 RBBILineMonkey(); 3183 virtual ~RBBILineMonkey(); 3184 virtual UVector *charClasses(); 3185 virtual void setText(const UnicodeString &s); 3186 virtual int32_t next(int32_t i); 3187 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar); 3188private: 3189 UVector *fSets; 3190 3191 UnicodeSet *fBK; 3192 UnicodeSet *fCR; 3193 UnicodeSet *fLF; 3194 UnicodeSet *fCM; 3195 UnicodeSet *fNL; 3196 UnicodeSet *fSG; 3197 UnicodeSet *fWJ; 3198 UnicodeSet *fZW; 3199 UnicodeSet *fGL; 3200 UnicodeSet *fCB; 3201 UnicodeSet *fSP; 3202 UnicodeSet *fB2; 3203 UnicodeSet *fBA; 3204 UnicodeSet *fBB; 3205 UnicodeSet *fHY; 3206 UnicodeSet *fH2; 3207 UnicodeSet *fH3; 3208 UnicodeSet *fCL; 3209 UnicodeSet *fEX; 3210 UnicodeSet *fIN; 3211 UnicodeSet *fJL; 3212 UnicodeSet *fJV; 3213 UnicodeSet *fJT; 3214 UnicodeSet *fNS; 3215 
UnicodeSet *fOP; 3216 UnicodeSet *fQU; 3217 UnicodeSet *fIS; 3218 UnicodeSet *fNU; 3219 UnicodeSet *fPO; 3220 UnicodeSet *fPR; 3221 UnicodeSet *fSY; 3222 UnicodeSet *fAI; 3223 UnicodeSet *fAL; 3224 UnicodeSet *fID; 3225 UnicodeSet *fSA; 3226 UnicodeSet *fXX; 3227 3228 BreakIterator *fCharBI; 3229 3230 const UnicodeString *fText; 3231 int32_t *fOrigPositions; 3232 3233 RegexMatcher *fNumberMatcher; 3234 RegexMatcher *fLB11Matcher; 3235}; 3236 3237 3238RBBILineMonkey::RBBILineMonkey() 3239{ 3240 UErrorCode status = U_ZERO_ERROR; 3241 3242 fSets = new UVector(status); 3243 3244 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); 3245 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); 3246 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); 3247 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); 3248 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); 3249 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); 3250 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); 3251 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); 3252 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); 3253 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); 3254 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); 3255 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); 3256 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); 3257 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); 3258 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); 3259 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); 3260 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status); 3261 fEX = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 3262 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 3263 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 3264 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 3265 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 3266 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 3267 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 3268 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 3269 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 3270 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 3271 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 3272 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 3273 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 3274 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 3275 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 3276 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 3277 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status); 3278 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 3279 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 3280 3281 if (U_FAILURE(status)) { 3282 deferredStatus = status; 3283 fCharBI = NULL; 3284 fNumberMatcher = NULL; 3285 return; 3286 } 3287 3288 fAL->addAll(*fXX); // Default behavior for XX is identical to AL 3289 fAL->addAll(*fAI); // Default behavior for AI is identical to AL 3290 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL 3291 fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 
3292 3293 fSets->addElement(fBK, status); 3294 fSets->addElement(fCR, status); 3295 fSets->addElement(fLF, status); 3296 fSets->addElement(fCM, status); 3297 fSets->addElement(fNL, status); 3298 fSets->addElement(fWJ, status); 3299 fSets->addElement(fZW, status); 3300 fSets->addElement(fGL, status); 3301 fSets->addElement(fCB, status); 3302 fSets->addElement(fSP, status); 3303 fSets->addElement(fB2, status); 3304 fSets->addElement(fBA, status); 3305 fSets->addElement(fBB, status); 3306 fSets->addElement(fHY, status); 3307 fSets->addElement(fH2, status); 3308 fSets->addElement(fH3, status); 3309 fSets->addElement(fCL, status); 3310 fSets->addElement(fEX, status); 3311 fSets->addElement(fIN, status); 3312 fSets->addElement(fJL, status); 3313 fSets->addElement(fJT, status); 3314 fSets->addElement(fJV, status); 3315 fSets->addElement(fNS, status); 3316 fSets->addElement(fOP, status); 3317 fSets->addElement(fQU, status); 3318 fSets->addElement(fIS, status); 3319 fSets->addElement(fNU, status); 3320 fSets->addElement(fPO, status); 3321 fSets->addElement(fPR, status); 3322 fSets->addElement(fSY, status); 3323 fSets->addElement(fAI, status); 3324 fSets->addElement(fAL, status); 3325 fSets->addElement(fID, status); 3326 fSets->addElement(fWJ, status); 3327 fSets->addElement(fSA, status); 3328 fSets->addElement(fSG, status); 3329 3330 const char *rules = 3331 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" 3332 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" 3333 "\\p{Line_Break=NU}\\p{Line_Break=CM}*" 3334 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*" 3335 "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?" 
3336 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"; 3337 3338 fNumberMatcher = new RegexMatcher( 3339 UnicodeString(rules, -1, US_INV), 0, status); 3340 3341 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 3342 3343 if (U_FAILURE(status)) { 3344 deferredStatus = status; 3345 } 3346} 3347 3348 3349void RBBILineMonkey::setText(const UnicodeString &s) { 3350 fText = &s; 3351 fCharBI->setText(s); 3352 fNumberMatcher->reset(s); 3353} 3354 3355// 3356// rule9Adjust 3357// Line Break TR rules 9 and 10 implementation. 3358// This deals with combining marks and other sequences that 3359// that must be treated as if they were something other than what they actually are. 3360// 3361// This is factored out into a separate function because it must be applied twice for 3362// each potential break, once to the chars before the position being checked, then 3363// again to the text following the possible break. 3364// 3365void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) { 3366 if (pos == -1) { 3367 // Invalid initial position. Happens during the warmup iteration of the 3368 // main loop in next(). 3369 return; 3370 } 3371 3372 int32_t nPos = *nextPos; 3373 3374 // LB 9 Keep combining sequences together. 3375 // advance over any CM class chars. Note that Line Break CM is different 3376 // from the normal Grapheme Extend property. 3377 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d || 3378 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) { 3379 for (;;) { 3380 *nextChar = fText->char32At(nPos); 3381 if (!fCM->contains(*nextChar)) { 3382 break; 3383 } 3384 nPos = fText->moveIndex32(nPos, 1); 3385 } 3386 } 3387 3388 3389 // LB 9 Treat X CM* as if it were x. 3390 // No explicit action required. 
3391 3392 // LB 10 Treat any remaining combining mark as AL 3393 if (fCM->contains(*posChar)) { 3394 *posChar = 0x41; // thisChar = 'A'; 3395 } 3396 3397 // Push the updated nextPos and nextChar back to our caller. 3398 // This only makes a difference if posChar got bigger by consuming a 3399 // combining sequence. 3400 *nextPos = nPos; 3401 *nextChar = fText->char32At(nPos); 3402} 3403 3404 3405 3406int32_t RBBILineMonkey::next(int32_t startPos) { 3407 UErrorCode status = U_ZERO_ERROR; 3408 int32_t pos; // Index of the char following a potential break position 3409 UChar32 thisChar; // Character at above position "pos" 3410 3411 int32_t prevPos; // Index of the char preceding a potential break position 3412 UChar32 prevChar; // Character at above position. Note that prevChar 3413 // and thisChar may not be adjacent because combining 3414 // characters between them will be ignored. 3415 3416 int32_t nextPos; // Index of the next character following pos. 3417 // Usually skips over combining marks. 3418 int32_t nextCPPos; // Index of the code point following "pos." 3419 // May point to a combining mark. 3420 int32_t tPos; // temp value. 3421 UChar32 c; 3422 3423 if (U_FAILURE(deferredStatus)) { 3424 return -1; 3425 } 3426 3427 if (startPos >= fText->length()) { 3428 return -1; 3429 } 3430 3431 3432 // Initial values for loop. Loop will run the first time without finding breaks, 3433 // while the invalid values shift out and the "this" and 3434 // "prev" positions are filled in with good values. 3435 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration. 3436 thisChar = prevChar = 0; 3437 nextPos = nextCPPos = startPos; 3438 3439 3440 // Loop runs once per position in the test text, until a break position 3441 // is found. 
3442 for (;;) { 3443 prevPos = pos; 3444 prevChar = thisChar; 3445 3446 pos = nextPos; 3447 thisChar = fText->char32At(pos); 3448 3449 nextCPPos = fText->moveIndex32(pos, 1); 3450 nextPos = nextCPPos; 3451 3452 // Rule LB2 - Break at end of text. 3453 if (pos >= fText->length()) { 3454 break; 3455 } 3456 3457 // Rule LB 9 - adjust for combining sequences. 3458 // We do this one out-of-order because the adjustment does not change anything 3459 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to 3460 // be applied. 3461 rule9Adjust(prevPos, &prevChar, &pos, &thisChar); 3462 nextCPPos = nextPos = fText->moveIndex32(pos, 1); 3463 c = fText->char32At(nextPos); 3464 rule9Adjust(pos, &thisChar, &nextPos, &c); 3465 3466 // If the loop is still warming up - if we haven't shifted the initial 3467 // -1 positions out of prevPos yet - loop back to advance the 3468 // position in the input without any further looking for breaks. 3469 if (prevPos == -1) { 3470 continue; 3471 } 3472 3473 // LB 4 Always break after hard line breaks, 3474 if (fBK->contains(prevChar)) { 3475 break; 3476 } 3477 3478 // LB 5 Break after CR, LF, NL, but not inside CR LF 3479 if (prevChar == 0x0d && thisChar == 0x0a) { 3480 continue; 3481 } 3482 if (prevChar == 0x0d || 3483 prevChar == 0x0a || 3484 prevChar == 0x85) { 3485 break; 3486 } 3487 3488 // LB 6 Don't break before hard line breaks 3489 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || 3490 fBK->contains(thisChar)) { 3491 continue; 3492 } 3493 3494 3495 // LB 7 Don't break before spaces or zero-width space. 3496 if (fSP->contains(thisChar)) { 3497 continue; 3498 } 3499 3500 if (fZW->contains(thisChar)) { 3501 continue; 3502 } 3503 3504 // LB 8 Break after zero width space 3505 if (fZW->contains(prevChar)) { 3506 break; 3507 } 3508 3509 // LB 9, 10 Already done, at top of loop. 3510 // 3511 3512 3513 // LB 11 Do not break before or after WORD JOINER and related characters. 
3514 // x WJ 3515 // WJ x 3516 // 3517 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) { 3518 continue; 3519 } 3520 3521 // LB 12 3522 // GL x 3523 if (fGL->contains(prevChar)) { 3524 continue; 3525 } 3526 3527 // LB 12a 3528 // [^SP BA HY] x GL 3529 if (!(fSP->contains(prevChar) || 3530 fBA->contains(prevChar) || 3531 fHY->contains(prevChar) ) && fGL->contains(thisChar)) { 3532 continue; 3533 } 3534 3535 3536 3537 // LB 13 Don't break before closings. 3538 // NU x CL and NU x IS are not matched here so that they will 3539 // fall into LB 17 and the more general number regular expression. 3540 // 3541 if (!fNU->contains(prevChar) && fCL->contains(thisChar) || 3542 fEX->contains(thisChar) || 3543 !fNU->contains(prevChar) && fIS->contains(thisChar) || 3544 !fNU->contains(prevChar) && fSY->contains(thisChar)) { 3545 continue; 3546 } 3547 3548 // LB 14 Don't break after OP SP* 3549 // Scan backwards, checking for this sequence. 3550 // The OP char could include combining marks, so we actually check for 3551 // OP CM* SP* 3552 // Another Twist: The Rule 67 fixes may have changed a SP CM 3553 // sequence into a ID char, so before scanning back through spaces, 3554 // verify that prevChar is indeed a space. 
The prevChar variable 3555 // may differ from fText[prevPos] 3556 tPos = prevPos; 3557 if (fSP->contains(prevChar)) { 3558 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3559 tPos=fText->moveIndex32(tPos, -1); 3560 } 3561 } 3562 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3563 tPos=fText->moveIndex32(tPos, -1); 3564 } 3565 if (fOP->contains(fText->char32At(tPos))) { 3566 continue; 3567 } 3568 3569 3570 // LB 15 QU SP* x OP 3571 if (fOP->contains(thisChar)) { 3572 // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 3573 int tPos = prevPos; 3574 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3575 tPos = fText->moveIndex32(tPos, -1); 3576 } 3577 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3578 tPos = fText->moveIndex32(tPos, -1); 3579 } 3580 if (fQU->contains(fText->char32At(tPos))) { 3581 continue; 3582 } 3583 } 3584 3585 3586 3587 // LB 16 CL SP* x NS 3588 // Scan backwards for SP* CM* CL 3589 if (fNS->contains(thisChar)) { 3590 int tPos = prevPos; 3591 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3592 tPos = fText->moveIndex32(tPos, -1); 3593 } 3594 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3595 tPos = fText->moveIndex32(tPos, -1); 3596 } 3597 if (fCL->contains(fText->char32At(tPos))) { 3598 continue; 3599 } 3600 } 3601 3602 3603 // LB 17 B2 SP* x B2 3604 if (fB2->contains(thisChar)) { 3605 // Scan backwards, checking for the B2 CM* SP* sequence. 
3606 tPos = prevPos; 3607 if (fSP->contains(prevChar)) { 3608 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3609 tPos=fText->moveIndex32(tPos, -1); 3610 } 3611 } 3612 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3613 tPos=fText->moveIndex32(tPos, -1); 3614 } 3615 if (fB2->contains(fText->char32At(tPos))) { 3616 continue; 3617 } 3618 } 3619 3620 3621 // LB 18 break after space 3622 if (fSP->contains(prevChar)) { 3623 break; 3624 } 3625 3626 // LB 19 3627 // x QU 3628 // QU x 3629 if (fQU->contains(thisChar) || fQU->contains(prevChar)) { 3630 continue; 3631 } 3632 3633 // LB 20 Break around a CB 3634 if (fCB->contains(thisChar) || fCB->contains(prevChar)) { 3635 break; 3636 } 3637 3638 // LB 21 3639 if (fBA->contains(thisChar) || 3640 fHY->contains(thisChar) || 3641 fNS->contains(thisChar) || 3642 fBB->contains(prevChar) ) { 3643 continue; 3644 } 3645 3646 // LB 22 3647 if (fAL->contains(prevChar) && fIN->contains(thisChar) || 3648 fID->contains(prevChar) && fIN->contains(thisChar) || 3649 fIN->contains(prevChar) && fIN->contains(thisChar) || 3650 fNU->contains(prevChar) && fIN->contains(thisChar) ) { 3651 continue; 3652 } 3653 3654 3655 // LB 23 ID x PO 3656 // AL x NU 3657 // NU x AL 3658 if (fID->contains(prevChar) && fPO->contains(thisChar) || 3659 fAL->contains(prevChar) && fNU->contains(thisChar) || 3660 fNU->contains(prevChar) && fAL->contains(thisChar) ) { 3661 continue; 3662 } 3663 3664 // LB 24 Do not break between prefix and letters or ideographs. 3665 // PR x ID 3666 // PR x AL 3667 // PO x AL 3668 if (fPR->contains(prevChar) && fID->contains(thisChar) || 3669 fPR->contains(prevChar) && fAL->contains(thisChar) || 3670 fPO->contains(prevChar) && fAL->contains(thisChar) ) { 3671 continue; 3672 } 3673 3674 3675 3676 // LB 25 Numbers 3677 if (fNumberMatcher->lookingAt(prevPos, status)) { 3678 if (U_FAILURE(status)) { 3679 break; 3680 } 3681 // Matched a number. 
But could have been just a single digit, which would 3682 // not represent a "no break here" between prevChar and thisChar 3683 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num 3684 if (numEndIdx > pos) { 3685 // Number match includes at least our two chars being checked 3686 if (numEndIdx > nextPos) { 3687 // Number match includes additional chars. Update pos and nextPos 3688 // so that next loop iteration will continue at the end of the number, 3689 // checking for breaks between last char in number & whatever follows. 3690 pos = nextPos = numEndIdx; 3691 do { 3692 pos = fText->moveIndex32(pos, -1); 3693 thisChar = fText->char32At(pos); 3694 } while (fCM->contains(thisChar)); 3695 } 3696 continue; 3697 } 3698 } 3699 3700 3701 // LB 26 Do not break a Korean syllable. 3702 if (fJL->contains(prevChar) && (fJL->contains(thisChar) || 3703 fJV->contains(thisChar) || 3704 fH2->contains(thisChar) || 3705 fH3->contains(thisChar))) { 3706 continue; 3707 } 3708 3709 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) && 3710 (fJV->contains(thisChar) || fJT->contains(thisChar))) { 3711 continue; 3712 } 3713 3714 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) && 3715 fJT->contains(thisChar)) { 3716 continue; 3717 } 3718 3719 // LB 27 Treat a Korean Syllable Block the same as ID. 
3720 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3721 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3722 fIN->contains(thisChar)) { 3723 continue; 3724 } 3725 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3726 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3727 fPO->contains(thisChar)) { 3728 continue; 3729 } 3730 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) || 3731 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) { 3732 continue; 3733 } 3734 3735 3736 3737 // LB 28 Do not break between alphabetics ("at"). 3738 if (fAL->contains(prevChar) && fAL->contains(thisChar)) { 3739 continue; 3740 } 3741 3742 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g."). 3743 if (fIS->contains(prevChar) && fAL->contains(thisChar)) { 3744 continue; 3745 } 3746 3747 // LB 31 Break everywhere else 3748 break; 3749 3750 } 3751 3752 return pos; 3753} 3754 3755 3756UVector *RBBILineMonkey::charClasses() { 3757 return fSets; 3758} 3759 3760 3761RBBILineMonkey::~RBBILineMonkey() { 3762 delete fSets; 3763 3764 delete fBK; 3765 delete fCR; 3766 delete fLF; 3767 delete fCM; 3768 delete fNL; 3769 delete fWJ; 3770 delete fZW; 3771 delete fGL; 3772 delete fCB; 3773 delete fSP; 3774 delete fB2; 3775 delete fBA; 3776 delete fBB; 3777 delete fHY; 3778 delete fH2; 3779 delete fH3; 3780 delete fCL; 3781 delete fEX; 3782 delete fIN; 3783 delete fJL; 3784 delete fJV; 3785 delete fJT; 3786 delete fNS; 3787 delete fOP; 3788 delete fQU; 3789 delete fIS; 3790 delete fNU; 3791 delete fPO; 3792 delete fPR; 3793 delete fSY; 3794 delete fAI; 3795 delete fAL; 3796 delete fID; 3797 delete fSA; 3798 delete fSG; 3799 delete fXX; 3800 3801 delete fCharBI; 3802 delete fNumberMatcher; 3803} 3804 3805 3806//------------------------------------------------------------------------------------------- 3807// 3808// TestMonkey 3809// 
3810// params 3811// seed=nnnnn Random number starting seed. 3812// Setting the seed allows errors to be reproduced. 3813// loop=nnn Looping count. Controls running time. 3814// -1: run forever. 3815// 0 or greater: run length. 3816// 3817// type = char | word | line | sent | title 3818// 3819//------------------------------------------------------------------------------------------- 3820 3821static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 3822 int32_t val = defaultVal; 3823 name.append(" *= *(-?\\d+)"); 3824 UErrorCode status = U_ZERO_ERROR; 3825 RegexMatcher m(name, params, 0, status); 3826 if (m.find()) { 3827 // The param exists. Convert the string to an int. 3828 char valString[100]; 3829 int32_t paramLength = m.end(1, status) - m.start(1, status); 3830 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 3831 paramLength = (int32_t)(sizeof(valString)-2); 3832 } 3833 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 3834 val = strtol(valString, NULL, 10); 3835 3836 // Delete this parameter from the params string. 
3837 m.reset(); 3838 params = m.replaceFirst("", status); 3839 } 3840 U_ASSERT(U_SUCCESS(status)); 3841 return val; 3842} 3843#endif 3844 3845static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 3846 BreakIterator *bi, 3847 int expected[], 3848 int expectedcount) 3849{ 3850 int count = 0; 3851 int i = 0; 3852 int forward[50]; 3853 bi->setText(ustr); 3854 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3855 forward[count] = i; 3856 if (count < expectedcount && expected[count] != i) { 3857 test->errln("break forward test failed: expected %d but got %d", 3858 expected[count], i); 3859 break; 3860 } 3861 count ++; 3862 } 3863 if (count != expectedcount) { 3864 printStringBreaks(ustr, expected, expectedcount); 3865 test->errln("break forward test failed: missed %d match", 3866 expectedcount - count); 3867 return; 3868 } 3869 // testing boundaries 3870 for (i = 1; i < expectedcount; i ++) { 3871 int j = expected[i - 1]; 3872 if (!bi->isBoundary(j)) { 3873 printStringBreaks(ustr, expected, expectedcount); 3874 test->errln("isBoundary() failed. Expected boundary at position %d", j); 3875 return; 3876 } 3877 for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 3878 if (bi->isBoundary(j)) { 3879 printStringBreaks(ustr, expected, expectedcount); 3880 test->errln("isBoundary() failed. 
Not expecting boundary at position %d", j); 3881 return; 3882 } 3883 } 3884 } 3885 3886 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3887 count --; 3888 if (forward[count] != i) { 3889 test->errln("happy break test previous() failed: expected %d but got %d", 3890 forward[count], i); 3891 break; 3892 } 3893 } 3894 if (count != 0) { 3895 printStringBreaks(ustr, expected, expectedcount); 3896 test->errln("break test previous() failed: missed a match"); 3897 return; 3898 } 3899 3900 // testing preceding 3901 for (i = 0; i < expectedcount - 1; i ++) { 3902 // int j = expected[i] + 1; 3903 int j = ustr.moveIndex32(expected[i], 1); 3904 for (; j <= expected[i + 1]; j ++) { 3905 if (bi->preceding(j) != expected[i]) { 3906 printStringBreaks(ustr, expected, expectedcount); 3907 test->errln("preceding(): Not expecting boundary at position %d", j); 3908 return; 3909 } 3910 } 3911 } 3912} 3913 3914void RBBITest::TestWordBreaks(void) 3915{ 3916#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3917 3918 Locale locale("en"); 3919 UErrorCode status = U_ZERO_ERROR; 3920 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3921 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3922 static const char *strlist[] = 3923 { 3924 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 3925 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b", 3926 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 3927 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 3928 "\\u90ca\\u3588\\u009c\\u0953\\u194b", 3929 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3930 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 3931 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e", 3932 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3933 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3934 
"\\u2027\\U000e0067\\u0a47\\u00b7", 3935 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3936 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3937 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3938 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 3939 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3940 "\\u0027\\u11af\\U000e0057\\u0602", 3941 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3942 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3943 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3944 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3945 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3946 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 3947 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3948 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3949 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3950 "\\u58f4\\U000e0049\\u20e7\\u2027", 3951 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3952 "\\ua183\\u102d\\u0bec\\u003a", 3953 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3954 "\\u003a\\u0e57\\u0fad\\u002e", 3955 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3956 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3957 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 3958 "\\u003a\\u0664\\u00b7\\u1fba", 3959 "\\u003b\\u0027\\u00b7\\u47a3", 3960 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b", 3961 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 3962 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 3963 }; 3964 int loop; 3965 if (U_FAILURE(status)) { 3966 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3967 return; 3968 } 3969 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3970 // printf("looping %d\n", loop); 3971 UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 3972 // RBBICharMonkey monkey; 3973 RBBIWordMonkey monkey; 3974 3975 int expected[50]; 3976 int 
expectedcount = 0; 3977 3978 monkey.setText(ustr); 3979 int i; 3980 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3981 expected[expectedcount ++] = i; 3982 } 3983 3984 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3985 } 3986 delete bi; 3987#endif 3988} 3989 3990void RBBITest::TestWordBoundary(void) 3991{ 3992 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 3993 Locale locale("en"); 3994 UErrorCode status = U_ZERO_ERROR; 3995 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3996 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3997 UChar str[50]; 3998 static const char *strlist[] = 3999 { 4000 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 4001 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 4002 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 4003 "\\u2027\\U000e0067\\u0a47\\u00b7", 4004 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 4005 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 4006 "\\u0589\\U000e006e\\u0a42\\U000104a5", 4007 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 4008 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 4009 "\\u0027\\u11af\\U000e0057\\u0602", 4010 "\\U0001d7f2\\U000e007\\u0004\\u0589", 4011 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 4012 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 4013 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 4014 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 4015 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 4016 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 4017 "\\u0233\\U000e0020\\u0a69\\u0d6a", 4018 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 4019 "\\u58f4\\U000e0049\\u20e7\\u2027", 4020 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 4021 "\\ua183\\u102d\\u0bec\\u003a", 4022 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 4023 "\\u003a\\u0e57\\u0fad\\u002e", 4024 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 4025 
"\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 4026 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 4027 "\\u003a\\u0664\\u00b7\\u1fba", 4028 "\\u003b\\u0027\\u00b7\\u47a3", 4029 }; 4030 int loop; 4031 if (U_FAILURE(status)) { 4032 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4033 return; 4034 } 4035 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4036 // printf("looping %d\n", loop); 4037 u_unescape(strlist[loop], str, 20); 4038 UnicodeString ustr(str); 4039 int forward[50]; 4040 int count = 0; 4041 4042 bi->setText(ustr); 4043 int prev = 0; 4044 int i; 4045 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 4046 forward[count ++] = i; 4047 if (i > prev) { 4048 int j; 4049 for (j = prev + 1; j < i; j ++) { 4050 if (bi->isBoundary(j)) { 4051 printStringBreaks(ustr, forward, count); 4052 errln("happy boundary test failed: expected %d not a boundary", 4053 j); 4054 return; 4055 } 4056 } 4057 } 4058 if (!bi->isBoundary(i)) { 4059 printStringBreaks(ustr, forward, count); 4060 errln("happy boundary test failed: expected %d a boundary", 4061 i); 4062 return; 4063 } 4064 prev = i; 4065 } 4066 } 4067 delete bi; 4068} 4069 4070void RBBITest::TestLineBreaks(void) 4071{ 4072#if !UCONFIG_NO_REGULAR_EXPRESSIONS 4073 Locale locale("en"); 4074 UErrorCode status = U_ZERO_ERROR; 4075 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4076 const int32_t STRSIZE = 50; 4077 UChar str[STRSIZE]; 4078 static const char *strlist[] = 4079 { 4080 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 4081 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 4082 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 4083 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 4084 "u2014\\U000e0105\\u118c\\u000a\\u07f8", 4085 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 
4086 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 4087 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 4088 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 4089 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 4090 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", 4091 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 4092 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 4093 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 4094 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 4095 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 4096 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 4097 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 4098 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 4099 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 4100 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 4101 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 4102 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 4103 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 4104 
"\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 4105 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 4106 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", 4107 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 4108 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 4109 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 4110 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 4111 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 4112 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", 4113 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 4114 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 4115 "\\u2014\\u0020\\u000a\\u17c5\\u24fc", 4116 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 4117 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 4118 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 4119 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 4120 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 4121 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 4122 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" 4123 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" 4124 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", 4125 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 4126 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 4127 }; 4128 int loop; 4129 TEST_ASSERT_SUCCESS(status); 4130 if (U_FAILURE(status)) { 4131 return; 4132 } 4133 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4134 // printf("looping %d\n", loop); 4135 int32_t t = u_unescape(strlist[loop], str, STRSIZE); 4136 if (t >= STRSIZE) { 4137 TEST_ASSERT(FALSE); 
4138 continue; 4139 } 4140 4141 4142 UnicodeString ustr(str); 4143 RBBILineMonkey monkey; 4144 if (U_FAILURE(monkey.deferredStatus)) { 4145 continue; 4146 } 4147 4148 const int EXPECTEDSIZE = 50; 4149 int expected[EXPECTEDSIZE]; 4150 int expectedcount = 0; 4151 4152 monkey.setText(ustr); 4153 int i; 4154 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4155 if (expectedcount >= EXPECTEDSIZE) { 4156 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4157 return; 4158 } 4159 expected[expectedcount ++] = i; 4160 } 4161 4162 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4163 } 4164 delete bi; 4165#endif 4166} 4167 4168void RBBITest::TestSentBreaks(void) 4169{ 4170#if !UCONFIG_NO_REGULAR_EXPRESSIONS 4171 Locale locale("en"); 4172 UErrorCode status = U_ZERO_ERROR; 4173 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4174 UChar str[200]; 4175 static const char *strlist[] = 4176 { 4177 "Now\ris\nthe\r\ntime\n\rfor\r\r", 4178 "This\n", 4179 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", 4180 "\"Sentence ending with a quote.\" Bye.", 4181 " (This is it). Testing the sentence iterator. \"This isn't it.\"", 4182 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", 4183 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 4184 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 4185 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 4186 "Don't rock the boat.\\u2029Because I am the daddy, that is why. 
Not on my time (el timo.)!", 4187 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 4188 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 4189 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 4190 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 4191 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 4192 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 4193 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 4194 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 4195 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 4196 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 4197 }; 4198 int loop; 4199 if (U_FAILURE(status)) { 4200 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4201 return; 4202 } 4203 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4204 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0]))); 4205 UnicodeString ustr(str); 4206 4207 RBBISentMonkey monkey; 4208 if (U_FAILURE(monkey.deferredStatus)) { 4209 continue; 4210 } 4211 4212 const int EXPECTEDSIZE = 50; 4213 int expected[EXPECTEDSIZE]; 4214 int expectedcount = 0; 4215 4216 monkey.setText(ustr); 4217 int i; 4218 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4219 if (expectedcount >= EXPECTEDSIZE) { 4220 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4221 return; 4222 } 4223 expected[expectedcount ++] = i; 4224 } 4225 4226 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4227 } 4228 delete bi; 4229#endif 4230} 4231 4232void RBBITest::TestMonkey(char *params) { 4233#if !UCONFIG_NO_REGULAR_EXPRESSIONS 4234 4235 UErrorCode status = U_ZERO_ERROR; 4236 int32_t loopCount = 500; 4237 int32_t seed = 1; 4238 UnicodeString breakType = "all"; 4239 Locale locale("en"); 4240 UBool 
useUText = FALSE; 4241 4242 if (quick == FALSE) { 4243 loopCount = 10000; 4244 } 4245 4246 if (params) { 4247 UnicodeString p(params); 4248 loopCount = getIntParam("loop", p, loopCount); 4249 seed = getIntParam("seed", p, seed); 4250 4251 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 4252 if (m.find()) { 4253 breakType = m.group(1, status); 4254 m.reset(); 4255 p = m.replaceFirst("", status); 4256 } 4257 4258 RegexMatcher u(" *utext", p, 0, status); 4259 if (u.find()) { 4260 useUText = TRUE; 4261 u.reset(); 4262 p = u.replaceFirst("", status); 4263 } 4264 4265 4266 // m.reset(p); 4267 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 4268 // Each option is stripped out of the option string as it is processed. 4269 // All options have been checked. The option string should have been completely emptied.. 4270 char buf[100]; 4271 p.extract(buf, sizeof(buf), NULL, status); 4272 buf[sizeof(buf)-1] = 0; 4273 errln("Unrecognized or extra parameter: %s\n", buf); 4274 return; 4275 } 4276 4277 } 4278 4279 if (breakType == "char" || breakType == "all") { 4280 RBBICharMonkey m; 4281 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4282 if (U_SUCCESS(status)) { 4283 RunMonkey(bi, m, "char", seed, loopCount, useUText); 4284 if (breakType == "all" && useUText==FALSE) { 4285 // Also run a quick test with UText when "all" is specified 4286 RunMonkey(bi, m, "char", seed, loopCount, TRUE); 4287 } 4288 } 4289 else { 4290 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 4291 } 4292 delete bi; 4293 } 4294 4295 if (breakType == "word" || breakType == "all") { 4296 logln("Word Break Monkey Test"); 4297 RBBIWordMonkey m; 4298 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4299 if (U_SUCCESS(status)) { 4300 RunMonkey(bi, m, "word", seed, loopCount, useUText); 4301 } 4302 else { 4303 errcheckln(status, "Creation of word break iterator failed %s", 
u_errorName(status)); 4304 } 4305 delete bi; 4306 } 4307 4308 if (breakType == "line" || breakType == "all") { 4309 logln("Line Break Monkey Test"); 4310 RBBILineMonkey m; 4311 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4312 if (loopCount >= 10) { 4313 loopCount = loopCount / 5; // Line break runs slower than the others. 4314 } 4315 if (U_SUCCESS(status)) { 4316 RunMonkey(bi, m, "line", seed, loopCount, useUText); 4317 } 4318 else { 4319 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4320 } 4321 delete bi; 4322 } 4323 4324 if (breakType == "sent" || breakType == "all" ) { 4325 logln("Sentence Break Monkey Test"); 4326 RBBISentMonkey m; 4327 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4328 if (loopCount >= 10) { 4329 loopCount = loopCount / 10; // Sentence runs slower than the other break types 4330 } 4331 if (U_SUCCESS(status)) { 4332 RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 4333 } 4334 else { 4335 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4336 } 4337 delete bi; 4338 } 4339 4340#endif 4341} 4342 4343// 4344// Run a RBBI monkey test. Common routine, for all break iterator types. 4345// Parameters: 4346// bi - the break iterator to use 4347// mk - MonkeyKind, abstraction for obtaining expected results 4348// name - Name of test (char, word, etc.) 
for use in error messages 4349// seed - Seed for starting random number generator (parameter from user) 4350// numIterations 4351// 4352void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 4353 int32_t numIterations, UBool useUText) { 4354 4355#if !UCONFIG_NO_REGULAR_EXPRESSIONS 4356 4357 const int32_t TESTSTRINGLEN = 500; 4358 UnicodeString testText; 4359 int32_t numCharClasses; 4360 UVector *chClasses; 4361 int expected[TESTSTRINGLEN*2 + 1]; 4362 int expectedCount = 0; 4363 char expectedBreaks[TESTSTRINGLEN*2 + 1]; 4364 char forwardBreaks[TESTSTRINGLEN*2 + 1]; 4365 char reverseBreaks[TESTSTRINGLEN*2+1]; 4366 char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 4367 char followingBreaks[TESTSTRINGLEN*2+1]; 4368 char precedingBreaks[TESTSTRINGLEN*2+1]; 4369 int i; 4370 int loopCount = 0; 4371 4372 m_seed = seed; 4373 4374 numCharClasses = mk.charClasses()->size(); 4375 chClasses = mk.charClasses(); 4376 4377 // Check for errors that occured during the construction of the MonkeyKind object. 4378 // Can't report them where they occured because errln() is a method coming from intlTest, 4379 // and is not visible outside of RBBITest :-( 4380 if (U_FAILURE(mk.deferredStatus)) { 4381 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 4382 return; 4383 } 4384 4385 // Verify that the character classes all have at least one member. 4386 for (i=0; i<numCharClasses; i++) { 4387 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 4388 if (s == NULL || s->size() == 0) { 4389 errln("Character Class #%d is null or of zero size.", i); 4390 return; 4391 } 4392 } 4393 4394 while (loopCount < numIterations || numIterations == -1) { 4395 if (numIterations == -1 && loopCount % 10 == 0) { 4396 // If test is running in an infinite loop, display a periodic tic so 4397 // we can tell that it is making progress. 
4398 fprintf(stderr, "."); 4399 } 4400 // Save current random number seed, so that we can recreate the random numbers 4401 // for this loop iteration in event of an error. 4402 seed = m_seed; 4403 4404 // Populate a test string with data. 4405 testText.truncate(0); 4406 for (i=0; i<TESTSTRINGLEN; i++) { 4407 int32_t aClassNum = m_rand() % numCharClasses; 4408 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 4409 int32_t charIdx = m_rand() % classSet->size(); 4410 UChar32 c = classSet->charAt(charIdx); 4411 if (c < 0) { // TODO: deal with sets containing strings. 4412 errln("c < 0"); 4413 break; 4414 } 4415 testText.append(c); 4416 } 4417 4418 // Calculate the expected results for this test string. 4419 mk.setText(testText); 4420 memset(expectedBreaks, 0, sizeof(expectedBreaks)); 4421 expectedBreaks[0] = 1; 4422 int32_t breakPos = 0; 4423 expectedCount = 0; 4424 for (;;) { 4425 breakPos = mk.next(breakPos); 4426 if (breakPos == -1) { 4427 break; 4428 } 4429 if (breakPos > testText.length()) { 4430 errln("breakPos > testText.length()"); 4431 } 4432 expectedBreaks[breakPos] = 1; 4433 U_ASSERT(expectedCount<testText.length()); 4434 expected[expectedCount ++] = breakPos; 4435 } 4436 4437 // Find the break positions using forward iteration 4438 memset(forwardBreaks, 0, sizeof(forwardBreaks)); 4439 if (useUText) { 4440 UErrorCode status = U_ZERO_ERROR; 4441 UText *testUText = utext_openReplaceable(NULL, &testText, &status); 4442 // testUText = utext_openUnicodeString(testUText, &testText, &status); 4443 bi->setText(testUText, status); 4444 TEST_ASSERT_SUCCESS(status); 4445 utext_close(testUText); // The break iterator does a shallow clone of the UText 4446 // This UText can be closed immediately, so long as the 4447 // testText string continues to exist. 
4448 } else { 4449 bi->setText(testText); 4450 } 4451 4452 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 4453 if (i < 0 || i > testText.length()) { 4454 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4455 break; 4456 } 4457 forwardBreaks[i] = 1; 4458 } 4459 4460 // Find the break positions using reverse iteration 4461 memset(reverseBreaks, 0, sizeof(reverseBreaks)); 4462 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 4463 if (i < 0 || i > testText.length()) { 4464 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4465 break; 4466 } 4467 reverseBreaks[i] = 1; 4468 } 4469 4470 // Find the break positions using isBoundary() tests. 4471 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 4472 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 4473 for (i=0; i<=testText.length(); i++) { 4474 isBoundaryBreaks[i] = bi->isBoundary(i); 4475 } 4476 4477 4478 // Find the break positions using the following() function. 4479 // printf("."); 4480 memset(followingBreaks, 0, sizeof(followingBreaks)); 4481 int32_t lastBreakPos = 0; 4482 followingBreaks[0] = 1; 4483 for (i=0; i<testText.length(); i++) { 4484 breakPos = bi->following(i); 4485 if (breakPos <= i || 4486 breakPos < lastBreakPos || 4487 breakPos > testText.length() || 4488 breakPos > lastBreakPos && lastBreakPos > i ) { 4489 errln("%s break monkey test: " 4490 "Out of range value returned by BreakIterator::following().\n" 4491 "Random seed=%d index=%d; following returned %d; lastbreak=%d", 4492 name, seed, i, breakPos, lastBreakPos); 4493 break; 4494 } 4495 followingBreaks[breakPos] = 1; 4496 lastBreakPos = breakPos; 4497 } 4498 4499 // Find the break positions using the preceding() function. 
4500 memset(precedingBreaks, 0, sizeof(precedingBreaks)); 4501 lastBreakPos = testText.length(); 4502 precedingBreaks[testText.length()] = 1; 4503 for (i=testText.length(); i>0; i--) { 4504 breakPos = bi->preceding(i); 4505 if (breakPos >= i || 4506 breakPos > lastBreakPos || 4507 breakPos < 0 && testText.getChar32Start(i)>0 || 4508 breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i) ) { 4509 errln("%s break monkey test: " 4510 "Out of range value returned by BreakIterator::preceding().\n" 4511 "index=%d; prev returned %d; lastBreak=%d" , 4512 name, i, breakPos, lastBreakPos); 4513 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 4514 precedingBreaks[i] = 2; // Forces an error. 4515 } 4516 } else { 4517 if (breakPos >= 0) { 4518 precedingBreaks[breakPos] = 1; 4519 } 4520 lastBreakPos = breakPos; 4521 } 4522 } 4523 4524 // Compare the expected and actual results. 4525 for (i=0; i<=testText.length(); i++) { 4526 const char *errorType = NULL; 4527 if (forwardBreaks[i] != expectedBreaks[i]) { 4528 errorType = "next()"; 4529 } else if (reverseBreaks[i] != forwardBreaks[i]) { 4530 errorType = "previous()"; 4531 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 4532 errorType = "isBoundary()"; 4533 } else if (followingBreaks[i] != expectedBreaks[i]) { 4534 errorType = "following()"; 4535 } else if (precedingBreaks[i] != expectedBreaks[i]) { 4536 errorType = "preceding()"; 4537 } 4538 4539 4540 if (errorType != NULL) { 4541 // Format a range of the test text that includes the failure as 4542 // a data item that can be included in the rbbi test data file. 4543 4544 // Start of the range is the last point where expected and actual results 4545 // both agreed that there was a break position. 
4546 int startContext = i; 4547 int32_t count = 0; 4548 for (;;) { 4549 if (startContext==0) { break; } 4550 startContext --; 4551 if (expectedBreaks[startContext] != 0) { 4552 if (count == 2) break; 4553 count ++; 4554 } 4555 } 4556 4557 // End of range is two expected breaks past the start position. 4558 int endContext = i + 1; 4559 int ci; 4560 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 4561 for (;;) { 4562 if (endContext >= testText.length()) {break;} 4563 if (expectedBreaks[endContext-1] != 0) { 4564 if (count == 0) break; 4565 count --; 4566 } 4567 endContext ++; 4568 } 4569 } 4570 4571 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 4572 UnicodeString errorText = "<data>"; 4573 /***if (strcmp(errorType, "next()") == 0) { 4574 startContext = 0; 4575 endContext = testText.length(); 4576 4577 printStringBreaks(testText, expected, expectedCount); 4578 }***/ 4579 4580 for (ci=startContext; ci<endContext;) { 4581 UnicodeString hexChars("0123456789abcdef"); 4582 UChar32 c; 4583 int bn; 4584 c = testText.char32At(ci); 4585 if (ci == i) { 4586 // This is the location of the error. 4587 errorText.append("<?>"); 4588 } else if (expectedBreaks[ci] != 0) { 4589 // This a non-error expected break position. 4590 errorText.append("\\"); 4591 } 4592 if (c < 0x10000) { 4593 errorText.append("\\u"); 4594 for (bn=12; bn>=0; bn-=4) { 4595 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4596 } 4597 } else { 4598 errorText.append("\\U"); 4599 for (bn=28; bn>=0; bn-=4) { 4600 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4601 } 4602 } 4603 ci = testText.moveIndex32(ci, 1); 4604 } 4605 errorText.append("\\"); 4606 errorText.append("</data>\n"); 4607 4608 // Output the error 4609 char charErrorTxt[500]; 4610 UErrorCode status = U_ZERO_ERROR; 4611 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 4612 charErrorTxt[sizeof(charErrorTxt)-1] = 0; 4613 errln("%s break monkey test error. %s. 
Operation = %s; Random seed = %d; buf Idx = %d\n%s", 4614 name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), 4615 errorType, seed, i, charErrorTxt); 4616 break; 4617 } 4618 } 4619 4620 loopCount++; 4621 } 4622#endif 4623} 4624 4625// 4626// TestDebug - A place-holder test for debugging purposes. 4627// For putting in fragments of other tests that can be invoked 4628// for tracing without a lot of unwanted extra stuff happening. 4629// 4630void RBBITest::TestDebug(void) { 4631#if 0 4632 UErrorCode status = U_ZERO_ERROR; 4633 int pos = 0; 4634 int ruleStatus = 0; 4635 4636 RuleBasedBreakIterator* bi = 4637 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 4638 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status); 4639 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 4640 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e"); 4641 // UnicodeString s("Aaa. Bcd"); 4642 s = s.unescape(); 4643 bi->setText(s); 4644 UBool r = bi->isBoundary(8); 4645 printf("%s", r?"true":"false"); 4646 return; 4647 pos = bi->last(); 4648 do { 4649 // ruleStatus = bi->getRuleStatus(); 4650 printf("%d\t%d\n", pos, ruleStatus); 4651 pos = bi->previous(); 4652 } while (pos != BreakIterator::DONE); 4653#endif 4654} 4655 4656#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 4657