// rbbitst.cpp revision 6d5deb12725f146643d443090dfa11b206df528a
1/******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1999-2009, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6/************************************************************************ 7* Date Name Description 8* 12/15/99 Madhu Creation. 9* 01/12/2000 Madhu Updated for changed API and added new tests 10************************************************************************/ 11 12#include "unicode/utypes.h" 13 14#if !UCONFIG_NO_BREAK_ITERATION 15 16#include "unicode/utypes.h" 17#include "unicode/brkiter.h" 18#include "unicode/rbbi.h" 19#include "unicode/uchar.h" 20#include "unicode/utf16.h" 21#include "unicode/ucnv.h" 22#include "unicode/schriter.h" 23#include "unicode/uniset.h" 24#include "unicode/regex.h" // TODO: make conditional on regexp being built. 25#include "unicode/ustring.h" 26#include "unicode/utext.h" 27#include "intltest.h" 28#include "rbbitst.h" 29#include <string.h> 30#include "uvector.h" 31#include "uvectr32.h" 32#include "triedict.h" 33#include <string.h> 34#include <stdio.h> 35#include <stdlib.h> 36 37#define TEST_ASSERT(x) {if (!(x)) { \ 38 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 39 40#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 41 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 42 43 44//--------------------------------------------- 45// runIndexedTest 46//--------------------------------------------- 47 48void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 49{ 50 if (exec) logln("TestSuite RuleBasedBreakIterator: "); 51 52 switch (index) { 53 case 0: name = "TestBug4153072"; 54 if(exec) TestBug4153072(); break; 55 case 1: name = "TestJapaneseLineBreak"; 56 if(exec) TestJapaneseLineBreak(); break; 57 case 2: name = "TestStatusReturn"; 58 
if(exec) TestStatusReturn(); break; 59 case 3: name = "TestUnicodeFiles"; 60 if(exec) TestUnicodeFiles(); break; 61 case 4: name = "TestEmptyString"; 62 if(exec) TestEmptyString(); break; 63 64 case 5: name = "TestGetAvailableLocales"; 65 if(exec) TestGetAvailableLocales(); break; 66 67 case 6: name = "TestGetDisplayName"; 68 if(exec) TestGetDisplayName(); break; 69 70 case 7: name = "TestEndBehaviour"; 71 if(exec) TestEndBehaviour(); break; 72 case 8: name = "TestMixedThaiLineBreak"; 73 if(exec) TestMixedThaiLineBreak(); break; 74 case 9: name = "TestThaiLineBreak"; 75 if(exec) TestThaiLineBreak(); break; 76 case 10: name = "TestMaiyamok"; 77 if(exec) TestMaiyamok(); break; 78 case 11: name = "TestWordBreaks"; 79 if(exec) TestWordBreaks(); break; 80 case 12: name = "TestWordBoundary"; 81 if(exec) TestWordBoundary(); break; 82 case 13: name = "TestLineBreaks"; 83 if(exec) TestLineBreaks(); break; 84 case 14: name = "TestSentBreaks"; 85 if(exec) TestSentBreaks(); break; 86 case 15: name = "TestExtended"; 87 if(exec) TestExtended(); break; 88 case 16: name = "TestMonkey"; 89 if(exec) { 90 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 91 TestMonkey(params); 92 #else 93 logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)"); 94 #endif 95 } 96 break; 97 case 17: name = "TestBug3818"; 98 if(exec) TestBug3818(); break; 99 case 18: name = "TestJapaneseWordBreak"; 100 if(exec) TestJapaneseWordBreak(); break; 101 case 19: name = "TestDebug"; 102 if(exec) TestDebug(); break; 103 case 20: name = "TestTrieDict"; 104 if(exec) TestTrieDict(); break; 105 case 21: name = "TestBug5775"; 106 if (exec) TestBug5775(); break; 107 case 22: name = "TestThaiBreaks"; 108 if (exec) TestThaiBreaks(); break; 109 case 23: name = "TestTailoredBreaks"; 110 if (exec) TestTailoredBreaks(); break; 111 112 default: name = ""; break; //needed to end loop 113 } 114} 115 116 117//--------------------------------------------------------------------------- 118// 119// class BITestData Holds a set of Break 
iterator test data and results 120// Includes 121// - the string data to be broken 122// - a vector of the expected break positions. 123// - a vector of source line numbers for the data, 124// (to help see where errors occured.) 125// - The expected break tag values. 126// - Vectors of actual break positions and tag values. 127// - Functions for comparing actual with expected and 128// reporting errors. 129// 130//---------------------------------------------------------------------------- 131class BITestData { 132public: 133 UnicodeString fDataToBreak; 134 UVector fExpectedBreakPositions; 135 UVector fExpectedTags; 136 UVector fLineNum; 137 UVector fActualBreakPositions; // Test Results. 138 UVector fActualTags; 139 140 BITestData(UErrorCode &status); 141 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status); 142 void checkResults(const char *heading, RBBITest *test); 143 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx); 144 void clearResults(); 145}; 146 147// 148// Constructor. 149// 150BITestData::BITestData(UErrorCode &status) 151: fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), 152 fActualTags(status) 153{ 154} 155 156// 157// addDataChunk. Add a section (non-breaking) piece if data to the test data. 158// The macro form collects the line number, which is helpful 159// when tracking down failures. 160// 161// A null data item is inserted at the start of each test's data 162// to put the starting zero into the data list. The position saved for 163// each non-null item is its ending position. 
164// 165#define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); 166void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { 167 if (U_FAILURE(status)) {return;} 168 if (data != NULL) { 169 fDataToBreak.append(CharsToUnicodeString(data)); 170 } 171 fExpectedBreakPositions.addElement(fDataToBreak.length(), status); 172 fExpectedTags.addElement(tag, status); 173 fLineNum.addElement(lineNum, status); 174} 175 176 177// 178// checkResults. Compare the actual and expected break positions, report any differences. 179// 180void BITestData::checkResults(const char *heading, RBBITest *test) { 181 int32_t expectedIndex = 0; 182 int32_t actualIndex = 0; 183 184 for (;;) { 185 // If we've run through both the expected and actual results vectors, we're done. 186 // break out of the loop. 187 if (expectedIndex >= fExpectedBreakPositions.size() && 188 actualIndex >= fActualBreakPositions.size()) { 189 break; 190 } 191 192 193 if (expectedIndex >= fExpectedBreakPositions.size()) { 194 err(heading, test, expectedIndex-1, actualIndex); 195 actualIndex++; 196 continue; 197 } 198 199 if (actualIndex >= fActualBreakPositions.size()) { 200 err(heading, test, expectedIndex, actualIndex-1); 201 expectedIndex++; 202 continue; 203 } 204 205 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { 206 err(heading, test, expectedIndex, actualIndex); 207 // Try to resync the positions of the indices, to avoid a rash of spurious erros. 208 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { 209 actualIndex++; 210 } else { 211 expectedIndex++; 212 } 213 continue; 214 } 215 216 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { 217 test->errln("%s, tag mismatch. 
Test Line = %d, expected tag=%d, got %d", 218 heading, fLineNum.elementAt(expectedIndex), 219 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); 220 } 221 222 actualIndex++; 223 expectedIndex++; 224 } 225} 226 227// 228// err - An error was found. Report it, along with information about where the 229// incorrectly broken test data appeared in the source file. 230// 231void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) 232{ 233 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); 234 int32_t actual = fActualBreakPositions.elementAti(actualIdx); 235 int32_t o = 0; 236 int32_t line = fLineNum.elementAti(expectedIdx); 237 if (expectedIdx > 0) { 238 // The line numbers are off by one because a premature break occurs somewhere 239 // within the previous item, rather than at the start of the current (expected) item. 240 // We want to report the offset of the unexpected break from the start of 241 // this previous item. 242 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); 243 } 244 if (actual < expected) { 245 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); 246 } else { 247 test->errln("%s Failed to find break at end of item from line %d. 
actual break: %d expected break: %d", heading, line, actual, expected); 248 } 249} 250 251 252void BITestData::clearResults() { 253 fActualBreakPositions.removeAllElements(); 254 fActualTags.removeAllElements(); 255} 256 257 258//----------------------------------------------------------------------------------- 259// 260// Cannned Test Characters 261// 262//----------------------------------------------------------------------------------- 263 264static const UChar cannedTestArray[] = { 265 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031, 266 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b, 267 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2, 268 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3, 269 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303, 270 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000, 271 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f, 272 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000 273}; 274 275static UnicodeString* cannedTestChars = 0; 276 277#define halfNA "\\u0928\\u094d\\u200d" 278#define halfSA "\\u0938\\u094d\\u200d" 279#define halfCHA "\\u091a\\u094d\\u200d" 280#define halfKA "\\u0915\\u094d\\u200d" 281#define deadTA "\\u0924\\u094d" 282 283//-------------------------------------------------------------------------------------- 284// 285// RBBITest constructor and destructor 286// 287//-------------------------------------------------------------------------------------- 288 289RBBITest::RBBITest() { 290 UnicodeString temp(cannedTestArray); 291 
cannedTestChars = new UnicodeString(); 292 *cannedTestChars += (UChar)0x0000; 293 *cannedTestChars += temp; 294} 295 296 297RBBITest::~RBBITest() { 298 delete cannedTestChars; 299} 300 301 302static const int T_NUMBER = 100; 303static const int T_LETTER = 200; 304static const int T_H_OR_K = 300; 305static const int T_IDEO = 400; 306 307 308 309 310 311 312//-------------------------------------------------------------------- 313//Testing the BreakIterator for devanagari script 314//-------------------------------------------------------------------- 315 316#define deadRA "\\u0930\\u094d" /*deadform RA = devanagari RA + virama*/ 317#define deadPHA "\\u092b\\u094d" /*deadform PHA = devanagari PHA + virama*/ 318#define deadTTHA "\\u0920\\u094d" 319#define deadPA "\\u092a\\u094d" 320#define deadSA "\\u0938\\u094d" 321#define visarga "\\u0903" /*devanagari visarga looks like a english colon*/ 322 323 324 325 326 327 328//----------------------------------------------------------------------------------- 329// 330// Test for status {tag} return value from break rules. 331// TODO: a more thorough test. 
332// 333//----------------------------------------------------------------------------------- 334void RBBITest::TestStatusReturn() { 335 UnicodeString rulesString1("$Letters = [:L:];\n" 336 "$Numbers = [:N:];\n" 337 "$Letters+{1};\n" 338 "$Numbers+{2};\n" 339 "Help\\ {4}/me\\!;\n" 340 "[^$Letters $Numbers];\n" 341 "!.*;\n", -1, US_INV); 342 UnicodeString testString1 = "abc123..abc Help me Help me!"; 343 // 01234567890123456789012345678 344 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; 345 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; 346 347 UErrorCode status=U_ZERO_ERROR; 348 UParseError parseError; 349 350 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 351 if(U_FAILURE(status)) { 352 dataerrln("FAIL : in construction - %s", u_errorName(status)); 353 } else { 354 int32_t pos; 355 int32_t i = 0; 356 bi->setText(testString1); 357 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { 358 if (pos != bounds1[i]) { 359 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos); 360 break; 361 } 362 363 int tag = bi->getRuleStatus(); 364 if (tag != brkStatus[i]) { 365 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag); 366 break; 367 } 368 i++; 369 } 370 } 371 delete bi; 372} 373 374 375static void printStringBreaks(UnicodeString ustr, int expected[], 376 int expectedcount) 377{ 378 UErrorCode status = U_ZERO_ERROR; 379 char name[100]; 380 printf("code alpha extend alphanum type word sent line name\n"); 381 int j; 382 for (j = 0; j < ustr.length(); j ++) { 383 if (expectedcount > 0) { 384 int k; 385 for (k = 0; k < expectedcount; k ++) { 386 if (j == expected[k]) { 387 printf("------------------------------------------------ %d\n", 388 j); 389 } 390 } 391 } 392 UChar32 c = ustr.char32At(j); 393 if (c > 0xffff) { 394 j ++; 395 } 396 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 397 printf("%7x %5d %6d %8d %4s %4s %4s 
%4s %s\n", (int)c, 398 u_isUAlphabetic(c), 399 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 400 u_isalnum(c), 401 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 402 u_charType(c), 403 U_SHORT_PROPERTY_NAME), 404 u_getPropertyValueName(UCHAR_WORD_BREAK, 405 u_getIntPropertyValue(c, 406 UCHAR_WORD_BREAK), 407 U_SHORT_PROPERTY_NAME), 408 u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 409 u_getIntPropertyValue(c, 410 UCHAR_SENTENCE_BREAK), 411 U_SHORT_PROPERTY_NAME), 412 u_getPropertyValueName(UCHAR_LINE_BREAK, 413 u_getIntPropertyValue(c, 414 UCHAR_LINE_BREAK), 415 U_SHORT_PROPERTY_NAME), 416 name); 417 } 418} 419 420void RBBITest::TestThaiLineBreak() { 421 UErrorCode status = U_ZERO_ERROR; 422 BITestData thaiLineSelection(status); 423 424 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that 425 // represents elided letters at the end of a long word. It should be bound to 426 // the end of the word and not treated as an independent punctuation mark. 427 428 429 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 430 ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status); 431 ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status); 432 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status); 433 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status); 434// ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status); 435// ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status); 436 ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status); 437 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us 438 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status); 439 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status); 440 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, 
status); 441 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status); 442 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status); 443 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status); 444 445 // the one time where the paiyannoi occurs somewhere other than at the end 446 // of a word is in the Thai abbrevation for "etc.", which both begins and 447 // ends with a paiyannoi 448 ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status); 449 ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status); 450 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status); 451 452 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance( 453 Locale("th"), status); 454 if (U_FAILURE(status)) 455 { 456 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status)); 457 return; 458 } 459 460 generalIteratorTest(*e, thaiLineSelection); 461 delete e; 462} 463 464 465 466void RBBITest::TestMixedThaiLineBreak() 467{ 468 UErrorCode status = U_ZERO_ERROR; 469 BITestData thaiLineSelection(status); 470 471 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 472 473 474 // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters 475 // start 476 477 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status); 478 ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status); 479 ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status); 480 ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status); 481 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status); 482 ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status); 483 ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status); 484 ADD_DATACHUNK(thaiLineSelection, 
"\\u0E23\\u0E2D\\u0E1A ", 0, status); 485 ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status); 486 ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status); 487 ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status); 488 ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status); 489 ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status); 490 ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status); 491 ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status); 492 ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status); 493 494 // @suwit - end of changes 495 496 497 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status); 498 if (U_FAILURE(status)) 499 { 500 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status)); 501 return; 502 } 503 504 505 generalIteratorTest(*e, thaiLineSelection); 506 delete e; 507} 508 509 510void RBBITest::TestMaiyamok() 511{ 512 UErrorCode status = U_ZERO_ERROR; 513 BITestData thaiLineSelection(status); 514 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data 515 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous 516 // word". 
Instead of appearing as a word unto itself, however, it's kept together 517 // with the word before it 518 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status); 519 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status); 520 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status); 521 ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status); 522 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status); 523 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status); 524 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status); 525 ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status); 526 ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status); 527 528 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance( 529 Locale("th"), status); 530 531 if (U_FAILURE(status)) 532 { 533 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status)); 534 return; 535 } 536 generalIteratorTest(*e, thaiLineSelection); 537 delete e; 538} 539 540 541 542void RBBITest::TestBug3818() { 543 UErrorCode status = U_ZERO_ERROR; 544 545 // Four Thai words... 
546 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 547 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 548 UnicodeString thaiStr(thaiWordData); 549 550 RuleBasedBreakIterator* bi = 551 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status); 552 if (U_FAILURE(status) || bi == NULL) { 553 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 554 return; 555 } 556 bi->setText(thaiStr); 557 558 int32_t startOfSecondWord = bi->following(1); 559 if (startOfSecondWord != 4) { 560 errln("Fail at file %s, line %d expected start of word at 4, got %d", 561 __FILE__, __LINE__, startOfSecondWord); 562 } 563 startOfSecondWord = bi->following(0); 564 if (startOfSecondWord != 4) { 565 errln("Fail at file %s, line %d expected start of word at 4, got %d", 566 __FILE__, __LINE__, startOfSecondWord); 567 } 568 delete bi; 569} 570 571 572void RBBITest::TestJapaneseWordBreak() { 573 UErrorCode status = U_ZERO_ERROR; 574 BITestData japaneseWordSelection(status); 575 576 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data 577 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2 578 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5 579 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7 580 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10 581 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11 582 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12 583 584 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance( 585 Locale("ja"), status); 586 if (U_FAILURE(status)) 587 { 588 errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n"); 589 return; 590 } 591 592 generalIteratorTest(*e, japaneseWordSelection); 593 
delete e; 594} 595 596void RBBITest::TestTrieDict() { 597 UErrorCode status = U_ZERO_ERROR; 598 599 // 600 // Open and read the test data file. 601 // 602 const char *testDataDirectory = IntlTest::getSourceTestData(status); 603 char testFileName[1000]; 604 if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) { 605 errln("Can't open test data. Path too long."); 606 return; 607 } 608 strcpy(testFileName, testDataDirectory); 609 strcat(testFileName, "riwords.txt"); 610 611 // Items needing deleting at the end 612 MutableTrieDictionary *mutableDict = NULL; 613 CompactTrieDictionary *compactDict = NULL; 614 UnicodeSet *breaks = NULL; 615 UChar *testFile = NULL; 616 StringEnumeration *enumer1 = NULL; 617 StringEnumeration *enumer2 = NULL; 618 MutableTrieDictionary *mutable2 = NULL; 619 StringEnumeration *cloneEnum = NULL; 620 CompactTrieDictionary *compact2 = NULL; 621 622 623 const UnicodeString *originalWord = NULL; 624 const UnicodeString *cloneWord = NULL; 625 UChar *current; 626 UChar *word; 627 UChar uc; 628 int32_t wordLen; 629 int32_t wordCount; 630 int32_t testCount; 631 632 int len; 633 testFile = ReadAndConvertFile(testFileName, len, NULL, status); 634 if (U_FAILURE(status)) { 635 goto cleanup; /* something went wrong, error already output */ 636 } 637 638 mutableDict = new MutableTrieDictionary(0x0E1C, status); 639 if (U_FAILURE(status)) { 640 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status)); 641 goto cleanup; 642 } 643 644 breaks = new UnicodeSet; 645 breaks->add(0x000A); // Line Feed 646 breaks->add(0x000D); // Carriage Return 647 breaks->add(0x2028); // Line Separator 648 breaks->add(0x2029); // Paragraph Separator 649 650 // Now add each non-comment line of the file as a word. 
651 current = testFile; 652 word = current; 653 uc = *current++; 654 wordLen = 0; 655 wordCount = 0; 656 657 while (uc) { 658 if (uc == 0x0023) { // #comment line, skip 659 while (uc && !breaks->contains(uc)) { 660 uc = *current++; 661 } 662 } 663 else while (uc && !breaks->contains(uc)) { 664 ++wordLen; 665 uc = *current++; 666 } 667 if (wordLen > 0) { 668 mutableDict->addWord(word, wordLen, status); 669 if (U_FAILURE(status)) { 670 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status)); 671 goto cleanup; 672 } 673 wordCount += 1; 674 } 675 676 // Find beginning of next line 677 while (uc && breaks->contains(uc)) { 678 uc = *current++; 679 } 680 word = current-1; 681 wordLen = 0; 682 } 683 684 if (wordCount < 50) { 685 errln("Word count (%d) unreasonably small\n", wordCount); 686 goto cleanup; 687 } 688 689 enumer1 = mutableDict->openWords(status); 690 if (U_FAILURE(status)) { 691 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status)); 692 goto cleanup; 693 } 694 695 testCount = 0; 696 if (wordCount != (testCount = enumer1->count(status))) { 697 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 698 testCount, wordCount, u_errorName(status)); 699 goto cleanup; 700 } 701 702 // Now compact it 703 compactDict = new CompactTrieDictionary(*mutableDict, status); 704 if (U_FAILURE(status)) { 705 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status)); 706 goto cleanup; 707 } 708 709 enumer2 = compactDict->openWords(status); 710 if (U_FAILURE(status)) { 711 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status)); 712 goto cleanup; 713 } 714 715 if (wordCount != (testCount = enumer2->count(status))) { 716 errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 717 testCount, wordCount, u_errorName(status)); 718 goto cleanup; 719 } 720 721 if (enumer1->getDynamicClassID() == 
enumer2->getDynamicClassID()) { 722 errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same"); 723 } 724 delete enumer1; 725 enumer1 = NULL; 726 delete enumer2; 727 enumer2 = NULL; 728 729 // Now un-compact it 730 mutable2 = compactDict->cloneMutable(status); 731 if (U_FAILURE(status)) { 732 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status)); 733 goto cleanup; 734 } 735 736 cloneEnum = mutable2->openWords(status); 737 if (U_FAILURE(status)) { 738 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status)); 739 goto cleanup; 740 } 741 742 if (wordCount != (testCount = cloneEnum->count(status))) { 743 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 744 testCount, wordCount, u_errorName(status)); 745 goto cleanup; 746 } 747 748 // Compact original dictionary to clone. Note that we can only compare the same kind of 749 // dictionary as the order of the enumerators is not guaranteed to be the same between 750 // different kinds 751 enumer1 = mutableDict->openWords(status); 752 if (U_FAILURE(status)) { 753 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status)); 754 goto cleanup; 755 } 756 757 originalWord = enumer1->snext(status); 758 cloneWord = cloneEnum->snext(status); 759 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { 760 if (*originalWord != *cloneWord) { 761 errln("Original and cloned MutableTrieDictionary word mismatch\n"); 762 goto cleanup; 763 } 764 originalWord = enumer1->snext(status); 765 cloneWord = cloneEnum->snext(status); 766 } 767 768 if (U_FAILURE(status)) { 769 errln("Enumeration failed: %s\n", u_errorName(status)); 770 goto cleanup; 771 } 772 773 if (originalWord != cloneWord) { 774 errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n"); 775 goto cleanup; 776 } 777 778 // Test the data copying constructor for 
CompactTrieDict, and the data access APIs. 779 compact2 = new CompactTrieDictionary(compactDict->data(), status); 780 if (U_FAILURE(status)) { 781 errln("CompactTrieDictionary(const void *,...) failed\n"); 782 goto cleanup; 783 } 784 785 if (compact2->dataSize() == 0) { 786 errln("CompactTrieDictionary->dataSize() == 0\n"); 787 goto cleanup; 788 } 789 790 // Now count the words via the second dictionary 791 delete enumer1; 792 enumer1 = compact2->openWords(status); 793 if (U_FAILURE(status)) { 794 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status)); 795 goto cleanup; 796 } 797 798 if (wordCount != (testCount = enumer1->count(status))) { 799 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n", 800 testCount, wordCount, u_errorName(status)); 801 goto cleanup; 802 } 803 804cleanup: 805 delete compactDict; 806 delete mutableDict; 807 delete breaks; 808 delete[] testFile; 809 delete enumer1; 810 delete mutable2; 811 delete cloneEnum; 812 delete compact2; 813} 814 815 816//---------------------------------------------------------------------------- 817// 818// generalIteratorTest Given a break iterator and a set of test data, 819// Run the tests and report the results. 820// 821//---------------------------------------------------------------------------- 822void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 823{ 824 825 bi.setText(td.fDataToBreak); 826 827 testFirstAndNext(bi, td); 828 829 testLastAndPrevious(bi, td); 830 831 testFollowing(bi, td); 832 testPreceding(bi, td); 833 testIsBoundary(bi, td); 834 doMultipleSelectionTest(bi, td); 835} 836 837 838// 839// testFirstAndNext. Run the iterator forwards in the obvious first(), next() 840// kind of loop. 
841// 842void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) 843{ 844 UErrorCode status = U_ZERO_ERROR; 845 int32_t p; 846 int32_t lastP = -1; 847 int32_t tag; 848 849 logln("Test first and next"); 850 bi.setText(td.fDataToBreak); 851 td.clearResults(); 852 853 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { 854 td.fActualBreakPositions.addElement(p, status); // Save result. 855 tag = bi.getRuleStatus(); 856 td.fActualTags.addElement(tag, status); 857 if (p <= lastP) { 858 // If the iterator is not making forward progress, stop. 859 // No need to raise an error here, it'll be detected in the normal check of results. 860 break; 861 } 862 lastP = p; 863 } 864 td.checkResults("testFirstAndNext", this); 865} 866 867 868// 869// TestLastAndPrevious. Run the iterator backwards, starting with last(). 870// 871void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) 872{ 873 UErrorCode status = U_ZERO_ERROR; 874 int32_t p; 875 int32_t lastP = 0x7ffffffe; 876 int32_t tag; 877 878 logln("Test last and previous"); 879 bi.setText(td.fDataToBreak); 880 td.clearResults(); 881 882 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { 883 // Save break position. Insert it at start of vector of results, shoving 884 // already-saved results further towards the end. 885 td.fActualBreakPositions.insertElementAt(p, 0, status); 886 // bi.previous(); // TODO: Why does this fix things up???? 887 // bi.next(); 888 tag = bi.getRuleStatus(); 889 td.fActualTags.insertElementAt(tag, 0, status); 890 if (p >= lastP) { 891 // If the iterator is not making progress, stop. 892 // No need to raise an error here, it'll be detected in the normal check of results. 
893 break; 894 } 895 lastP = p; 896 } 897 td.checkResults("testLastAndPrevious", this); 898} 899 900 901void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) 902{ 903 UErrorCode status = U_ZERO_ERROR; 904 int32_t p; 905 int32_t tag; 906 int32_t lastP = -2; // A value that will never be returned as a break position. 907 // cannot be -1; that is returned for DONE. 908 int i; 909 910 logln("testFollowing():"); 911 bi.setText(td.fDataToBreak); 912 td.clearResults(); 913 914 // Save the starting point, since we won't get that out of following. 915 p = bi.first(); 916 td.fActualBreakPositions.addElement(p, status); // Save result. 917 tag = bi.getRuleStatus(); 918 td.fActualTags.addElement(tag, status); 919 920 for (i = 0; i <= td.fDataToBreak.length()+1; i++) { 921 p = bi.following(i); 922 if (p != lastP) { 923 if (p == RuleBasedBreakIterator::DONE) { 924 break; 925 } 926 // We've reached a new break position. Save it. 927 td.fActualBreakPositions.addElement(p, status); // Save result. 928 tag = bi.getRuleStatus(); 929 td.fActualTags.addElement(tag, status); 930 lastP = p; 931 } 932 } 933 // The loop normally exits by means of the break in the middle. 934 // Make sure that the index was at the correct position for the break iterator to have 935 // returned DONE. 936 if (i != td.fDataToBreak.length()) { 937 errln("testFollowing(): iterator returned DONE prematurely."); 938 } 939 940 // Full check of all results. 
941 td.checkResults("testFollowing", this); 942} 943 944 945 946void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { 947 UErrorCode status = U_ZERO_ERROR; 948 int32_t p; 949 int32_t tag; 950 int32_t lastP = 0x7ffffffe; 951 int i; 952 953 logln("testPreceding():"); 954 bi.setText(td.fDataToBreak); 955 td.clearResults(); 956 957 p = bi.last(); 958 td.fActualBreakPositions.addElement(p, status); 959 tag = bi.getRuleStatus(); 960 td.fActualTags.addElement(tag, status); 961 962 for (i = td.fDataToBreak.length(); i>=-1; i--) { 963 p = bi.preceding(i); 964 if (p != lastP) { 965 if (p == RuleBasedBreakIterator::DONE) { 966 break; 967 } 968 // We've reached a new break position. Save it. 969 td.fActualBreakPositions.insertElementAt(p, 0, status); 970 lastP = p; 971 tag = bi.getRuleStatus(); 972 td.fActualTags.insertElementAt(tag, 0, status); 973 } 974 } 975 // The loop normally exits by means of the break in the middle. 976 // Make sure that the index was at the correct position for the break iterator to have 977 // returned DONE. 978 if (i != 0) { 979 errln("testPreceding(): iterator returned DONE prematurely."); 980 } 981 982 // Full check of all results. 983 td.checkResults("testPreceding", this); 984} 985 986 987 988void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { 989 UErrorCode status = U_ZERO_ERROR; 990 int i; 991 int32_t tag; 992 993 logln("testIsBoundary():"); 994 bi.setText(td.fDataToBreak); 995 td.clearResults(); 996 997 for (i = 0; i <= td.fDataToBreak.length(); i++) { 998 if (bi.isBoundary(i)) { 999 td.fActualBreakPositions.addElement(i, status); // Save result. 
1000 tag = bi.getRuleStatus(); 1001 td.fActualTags.addElement(tag, status); 1002 } 1003 } 1004 td.checkResults("testIsBoundary: ", this); 1005} 1006 1007 1008 1009void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) 1010{ 1011 iterator.setText(td.fDataToBreak); 1012 1013 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); 1014 int32_t offset = iterator.first(); 1015 int32_t testOffset; 1016 int32_t count = 0; 1017 1018 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); 1019 1020 if (*testIterator != iterator) 1021 errln("clone() or operator!= failed: two clones compared unequal"); 1022 1023 do { 1024 testOffset = testIterator->first(); 1025 testOffset = testIterator->next(count); 1026 if (offset != testOffset) 1027 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 1028 1029 if (offset != RuleBasedBreakIterator::DONE) { 1030 count++; 1031 offset = iterator.next(); 1032 1033 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { 1034 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); 1035 if (count > 10000 || offset == -1) { 1036 errln("operator== failed too many times. Stopping test."); 1037 if (offset == -1) { 1038 errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 1039 } 1040 return; 1041 } 1042 } 1043 } 1044 } while (offset != RuleBasedBreakIterator::DONE); 1045 1046 // now do it backwards... 
1047 offset = iterator.last(); 1048 count = 0; 1049 1050 do { 1051 testOffset = testIterator->last(); 1052 testOffset = testIterator->next(count); // next() with a negative arg is same as previous 1053 if (offset != testOffset) 1054 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 1055 1056 if (offset != RuleBasedBreakIterator::DONE) { 1057 count--; 1058 offset = iterator.previous(); 1059 } 1060 } while (offset != RuleBasedBreakIterator::DONE); 1061 1062 delete testIterator; 1063} 1064 1065 1066//--------------------------------------------- 1067// 1068// other tests 1069// 1070//--------------------------------------------- 1071void RBBITest::TestEmptyString() 1072{ 1073 UnicodeString text = ""; 1074 UErrorCode status = U_ZERO_ERROR; 1075 1076 BITestData x(status); 1077 ADD_DATACHUNK(x, "", 0, status); // Break at start of data 1078 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 1079 if (U_FAILURE(status)) 1080 { 1081 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); 1082 return; 1083 } 1084 generalIteratorTest(*bi, x); 1085 delete bi; 1086} 1087 1088void RBBITest::TestGetAvailableLocales() 1089{ 1090 int32_t locCount = 0; 1091 const Locale* locList = BreakIterator::getAvailableLocales(locCount); 1092 1093 if (locCount == 0) 1094 dataerrln("getAvailableLocales() returned an empty list!"); 1095 // Just make sure that it's returning good memory. 
1096 int32_t i; 1097 for (i = 0; i < locCount; ++i) { 1098 logln(locList[i].getName()); 1099 } 1100} 1101 1102//Testing the BreakIterator::getDisplayName() function 1103void RBBITest::TestGetDisplayName() 1104{ 1105 UnicodeString result; 1106 1107 BreakIterator::getDisplayName(Locale::getUS(), result); 1108 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 1109 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 1110 + result); 1111 1112 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 1113 if (result != "French (France)") 1114 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 1115 + result); 1116} 1117/** 1118 * Test End Behaviour 1119 * @bug 4068137 1120 */ 1121void RBBITest::TestEndBehaviour() 1122{ 1123 UErrorCode status = U_ZERO_ERROR; 1124 UnicodeString testString("boo."); 1125 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 1126 if (U_FAILURE(status)) 1127 { 1128 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. 
- %s", u_errorName(status)); 1129 return; 1130 } 1131 wb->setText(testString); 1132 1133 if (wb->first() != 0) 1134 errln("Didn't get break at beginning of string."); 1135 if (wb->next() != 3) 1136 errln("Didn't get break before period in \"boo.\""); 1137 if (wb->current() != 4 && wb->next() != 4) 1138 errln("Didn't get break at end of string."); 1139 delete wb; 1140} 1141/* 1142 * @bug 4153072 1143 */ 1144void RBBITest::TestBug4153072() { 1145 UErrorCode status = U_ZERO_ERROR; 1146 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 1147 if (U_FAILURE(status)) 1148 { 1149 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 1150 return; 1151 } 1152 UnicodeString str("...Hello, World!..."); 1153 int32_t begin = 3; 1154 int32_t end = str.length() - 3; 1155 UBool onBoundary; 1156 1157 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 1158 iter->adoptText(textIterator); 1159 int index; 1160 // Note: with the switch to UText, there is no way to restrict the 1161 // iteration range to begin at an index other than zero. 1162 // String character iterators created with a non-zero bound are 1163 // treated by RBBI as being empty. 1164 for (index = -1; index < begin + 1; ++index) { 1165 onBoundary = iter->isBoundary(index); 1166 if (index == 0? !onBoundary : onBoundary) { 1167 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 1168 " and begin index = " + begin); 1169 } 1170 } 1171 delete iter; 1172} 1173 1174 1175// 1176// Test for problem reported by Ashok Matoria on 9 July 2007 1177// One.<kSoftHyphen><kSpace>Two. 1178// 1179// Sentence break at start (0) and then on calling next() it breaks at 1180// 'T' of "Two". Now, at this point if I do next() and 1181// then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 
1182// 1183void RBBITest::TestBug5775() { 1184 UErrorCode status = U_ZERO_ERROR; 1185 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1186 TEST_ASSERT_SUCCESS(status); 1187 if (U_FAILURE(status)) { 1188 return; 1189 } 1190// Check for status first for better handling of no data errors. 1191 TEST_ASSERT(bi != NULL); 1192 if (bi == NULL) { 1193 return; 1194 } 1195 1196 UnicodeString s("One.\\u00ad Two.", -1, US_INV); 1197 // 01234 56789 1198 s = s.unescape(); 1199 bi->setText(s); 1200 int pos = bi->next(); 1201 TEST_ASSERT(pos == 6); 1202 pos = bi->next(); 1203 TEST_ASSERT(pos == 10); 1204 pos = bi->previous(); 1205 TEST_ASSERT(pos == 6); 1206 delete bi; 1207} 1208 1209 1210 1211/** 1212 * Test Japanese Line Break 1213 * @bug 4095322 1214 */ 1215void RBBITest::TestJapaneseLineBreak() 1216{ 1217#if 0 1218 // Test needs updating some more... Dump it for now. 1219 1220 1221 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count 1222 // as opening and closing punctuation for line breaking. 1223 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars 1224 // from these tests. 
6-13-2002 1225 // 1226 UErrorCode status = U_ZERO_ERROR; 1227 UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c"); 1228 UnicodeString precedingChars = CharsToUnicodeString( 1229 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f"); 1230 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e"); 1231 UnicodeString followingChars = CharsToUnicodeString( 1232 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc" 1233 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7" 1234 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034" 1235 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034" 1236 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302"); 1237 BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status); 1238 1239 int32_t i; 1240 if (U_FAILURE(status)) 1241 { 1242 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n"); 1243 return; 1244 } 1245 1246 for (i = 0; i < precedingChars.length(); i++) { 1247 testString.setCharAt(1, precedingChars[i]); 1248 iter->setText(testString); 1249 int32_t j = iter->first(); 1250 if (j != 0) 1251 errln("ja line break failure: failed to start at 0"); 1252 j = iter->next(); 1253 if (j != 1) 1254 errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i]) 1255 + "' (" + ((int)(precedingChars[i])) + ")"); 1256 j = iter->next(); 1257 if (j != 3) 1258 errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i]) 1259 + "' (" + ((int)(precedingChars[i])) + ")"); 1260 } 1261 1262 for (i = 0; i < followingChars.length(); i++) { 1263 testString.setCharAt(1, followingChars[i]); 1264 iter->setText(testString); 1265 int j = iter->first(); 1266 if (j != 0) 1267 errln("ja line break failure: failed to start at 0"); 1268 j = iter->next(); 1269 
if (j != 2) 1270 errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i]) 1271 + "' (" + ((int)(followingChars[i])) + ")"); 1272 j = iter->next(); 1273 if (j != 3) 1274 errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i]) 1275 + "' (" + ((int)(followingChars[i])) + ")"); 1276 } 1277 delete iter; 1278#endif 1279} 1280 1281 1282//------------------------------------------------------------------------------ 1283// 1284// RBBITest::Extended Run RBBI Tests from an external test data file 1285// 1286//------------------------------------------------------------------------------ 1287 1288struct TestParams { 1289 BreakIterator *bi; 1290 UnicodeString dataToBreak; 1291 UVector32 *expectedBreaks; 1292 UVector32 *srcLine; 1293 UVector32 *srcCol; 1294}; 1295 1296void RBBITest::executeTest(TestParams *t) { 1297 int32_t bp; 1298 int32_t prevBP; 1299 int32_t i; 1300 1301 if (t->bi == NULL) { 1302 return; 1303 } 1304 1305 t->bi->setText(t->dataToBreak); 1306 // 1307 // Run the iterator forward 1308 // 1309 prevBP = -1; 1310 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 1311 if (prevBP == bp) { 1312 // Fail for lack of forward progress. 1313 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 1314 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1315 break; 1316 } 1317 1318 // Check that there were we didn't miss an expected break between the last one 1319 // and this one. 1320 for (i=prevBP+1; i<bp; i++) { 1321 if (t->expectedBreaks->elementAti(i) != 0) { 1322 int expected[] = {0, i}; 1323 printStringBreaks(t->dataToBreak, expected, 2); 1324 errln("Forward Iteration, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 1325 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1326 } 1327 } 1328 1329 // Check that the break we did find was expected 1330 if (t->expectedBreaks->elementAti(bp) == 0) { 1331 int expected[] = {0, bp}; 1332 printStringBreaks(t->dataToBreak, expected, 2); 1333 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1334 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1335 } else { 1336 // The break was expected. 1337 // Check that the {nnn} tag value is correct. 1338 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 1339 if (expectedTagVal == -1) { 1340 expectedTagVal = 0; 1341 } 1342 int32_t line = t->srcLine->elementAti(bp); 1343 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 1344 if (rs != expectedTagVal) { 1345 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 1346 " Actual, Expected status = %4d, %4d", 1347 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 1348 } 1349 } 1350 1351 1352 prevBP = bp; 1353 } 1354 1355 // Verify that there were no missed expected breaks after the last one found 1356 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) { 1357 if (t->expectedBreaks->elementAti(i) != 0) { 1358 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1359 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1360 } 1361 } 1362 1363 // 1364 // Run the iterator backwards, verify that the same breaks are found. 1365 // 1366 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen. 1367 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { 1368 if (prevBP == bp) { 1369 // Fail for lack of progress. 1370 errln("Reverse Iteration, no progress. 
Break Pos=%4d File line,col=%4d,%4d", 1371 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1372 break; 1373 } 1374 1375 // Check that there were we didn't miss an expected break between the last one 1376 // and this one. (UVector returns zeros for index out of bounds.) 1377 for (i=prevBP-1; i>bp; i--) { 1378 if (t->expectedBreaks->elementAti(i) != 0) { 1379 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1380 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1381 } 1382 } 1383 1384 // Check that the break we did find was expected 1385 if (t->expectedBreaks->elementAti(bp) == 0) { 1386 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1387 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 1388 } else { 1389 // The break was expected. 1390 // Check that the {nnn} tag value is correct. 1391 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 1392 if (expectedTagVal == -1) { 1393 expectedTagVal = 0; 1394 } 1395 int line = t->srcLine->elementAti(bp); 1396 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 1397 if (rs != expectedTagVal) { 1398 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 1399 " Actual, Expected status = %4d, %4d", 1400 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 1401 } 1402 } 1403 1404 prevBP = bp; 1405 } 1406 1407 // Verify that there were no missed breaks prior to the last one found 1408 for (i=prevBP-1; i>=0; i--) { 1409 if (t->expectedBreaks->elementAti(i) != 0) { 1410 errln("Forward Itertion, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 1411 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 1412 } 1413 } 1414} 1415 1416 1417void RBBITest::TestExtended() { 1418#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1419 UErrorCode status = U_ZERO_ERROR; 1420 Locale locale(""); 1421 1422 UnicodeString rules; 1423 TestParams tp; 1424 tp.bi = NULL; 1425 tp.expectedBreaks = new UVector32(status); 1426 tp.srcLine = new UVector32(status); 1427 tp.srcCol = new UVector32(status); 1428 1429 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status); 1430 if (U_FAILURE(status)) { 1431 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 1432 } 1433 1434 1435 // 1436 // Open and read the test data file. 1437 // 1438 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1439 char testFileName[1000]; 1440 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1441 errln("Can't open test data. 
Path too long."); 1442 return; 1443 } 1444 strcpy(testFileName, testDataDirectory); 1445 strcat(testFileName, "rbbitst.txt"); 1446 1447 int len; 1448 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1449 if (U_FAILURE(status)) { 1450 return; /* something went wrong, error already output */ 1451 } 1452 1453 1454 1455 1456 // 1457 // Put the test data into a UnicodeString 1458 // 1459 UnicodeString testString(FALSE, testFile, len); 1460 1461 enum EParseState{ 1462 PARSE_COMMENT, 1463 PARSE_TAG, 1464 PARSE_DATA, 1465 PARSE_NUM 1466 } 1467 parseState = PARSE_TAG; 1468 1469 EParseState savedState = PARSE_TAG; 1470 1471 static const UChar CH_LF = 0x0a; 1472 static const UChar CH_CR = 0x0d; 1473 static const UChar CH_HASH = 0x23; 1474 /*static const UChar CH_PERIOD = 0x2e;*/ 1475 static const UChar CH_LT = 0x3c; 1476 static const UChar CH_GT = 0x3e; 1477 static const UChar CH_BACKSLASH = 0x5c; 1478 static const UChar CH_BULLET = 0x2022; 1479 1480 int32_t lineNum = 1; 1481 int32_t colStart = 0; 1482 int32_t column = 0; 1483 int32_t charIdx = 0; 1484 1485 int32_t tagValue = 0; // The numeric value of a <nnn> tag. 
1486 1487 for (charIdx = 0; charIdx < len; ) { 1488 status = U_ZERO_ERROR; 1489 UChar c = testString.charAt(charIdx); 1490 charIdx++; 1491 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) { 1492 // treat CRLF as a unit 1493 c = CH_LF; 1494 charIdx++; 1495 } 1496 if (c == CH_LF || c == CH_CR) { 1497 lineNum++; 1498 colStart = charIdx; 1499 } 1500 column = charIdx - colStart + 1; 1501 1502 switch (parseState) { 1503 case PARSE_COMMENT: 1504 if (c == 0x0a || c == 0x0d) { 1505 parseState = savedState; 1506 } 1507 break; 1508 1509 case PARSE_TAG: 1510 { 1511 if (c == CH_HASH) { 1512 parseState = PARSE_COMMENT; 1513 savedState = PARSE_TAG; 1514 break; 1515 } 1516 if (u_isUWhiteSpace(c)) { 1517 break; 1518 } 1519 if (testString.compare(charIdx-1, 6, "<word>") == 0) { 1520 delete tp.bi; 1521 tp.bi = BreakIterator::createWordInstance(locale, status); 1522 charIdx += 5; 1523 break; 1524 } 1525 if (testString.compare(charIdx-1, 6, "<char>") == 0) { 1526 delete tp.bi; 1527 tp.bi = BreakIterator::createCharacterInstance(locale, status); 1528 charIdx += 5; 1529 break; 1530 } 1531 if (testString.compare(charIdx-1, 6, "<line>") == 0) { 1532 delete tp.bi; 1533 tp.bi = BreakIterator::createLineInstance(locale, status); 1534 charIdx += 5; 1535 break; 1536 } 1537 if (testString.compare(charIdx-1, 6, "<sent>") == 0) { 1538 delete tp.bi; 1539 tp.bi = NULL; 1540 tp.bi = BreakIterator::createSentenceInstance(locale, status); 1541 charIdx += 5; 1542 break; 1543 } 1544 if (testString.compare(charIdx-1, 7, "<title>") == 0) { 1545 delete tp.bi; 1546 tp.bi = BreakIterator::createTitleInstance(locale, status); 1547 charIdx += 6; 1548 break; 1549 } 1550 1551 // <locale loc_name> 1552 localeMatcher.reset(testString); 1553 if (localeMatcher.lookingAt(charIdx-1, status)) { 1554 UnicodeString localeName = localeMatcher.group(1, status); 1555 char localeName8[100]; 1556 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 1557 locale = 
Locale::createFromName(localeName8); 1558 charIdx += localeMatcher.group(0, status).length(); 1559 TEST_ASSERT_SUCCESS(status); 1560 break; 1561 } 1562 if (testString.compare(charIdx-1, 6, "<data>") == 0) { 1563 parseState = PARSE_DATA; 1564 charIdx += 5; 1565 tp.dataToBreak = ""; 1566 tp.expectedBreaks->removeAllElements(); 1567 tp.srcCol ->removeAllElements(); 1568 tp.srcLine->removeAllElements(); 1569 break; 1570 } 1571 1572 errln("line %d: Tag expected in test file.", lineNum); 1573 parseState = PARSE_COMMENT; 1574 savedState = PARSE_DATA; 1575 goto end_test; // Stop the test. 1576 } 1577 break; 1578 1579 case PARSE_DATA: 1580 if (c == CH_BULLET) { 1581 int32_t breakIdx = tp.dataToBreak.length(); 1582 tp.expectedBreaks->setSize(breakIdx+1); 1583 tp.expectedBreaks->setElementAt(-1, breakIdx); 1584 tp.srcLine->setSize(breakIdx+1); 1585 tp.srcLine->setElementAt(lineNum, breakIdx); 1586 tp.srcCol ->setSize(breakIdx+1); 1587 tp.srcCol ->setElementAt(column, breakIdx); 1588 break; 1589 } 1590 1591 if (testString.compare(charIdx-1, 7, "</data>") == 0) { 1592 // Add final entry to mappings from break location to source file position. 1593 // Need one extra because last break position returned is after the 1594 // last char in the data, not at the last char. 1595 tp.srcLine->addElement(lineNum, status); 1596 tp.srcCol ->addElement(column, status); 1597 1598 parseState = PARSE_TAG; 1599 charIdx += 6; 1600 1601 // RUN THE TEST! 1602 executeTest(&tp); 1603 break; 1604 } 1605 1606 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { 1607 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 1608 // Get the code point from the name and insert it into the test data. 1609 // (Damn, no API takes names in Unicode !!! 
1610 // we've got to take it back to char *) 1611 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx); 1612 int32_t nameLength = nameEndIdx - (charIdx+2); 1613 char charNameBuf[200]; 1614 UChar32 theChar = -1; 1615 if (nameEndIdx != -1) { 1616 UErrorCode status = U_ZERO_ERROR; 1617 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 1618 charNameBuf[sizeof(charNameBuf)-1] = 0; 1619 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 1620 if (U_FAILURE(status)) { 1621 theChar = -1; 1622 } 1623 } 1624 if (theChar == -1) { 1625 errln("Error in named character in test file at line %d, col %d", 1626 lineNum, column); 1627 } else { 1628 // Named code point was recognized. Insert it 1629 // into the test data. 1630 tp.dataToBreak.append(theChar); 1631 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1632 tp.srcLine->addElement(lineNum, status); 1633 tp.srcCol ->addElement(column, status); 1634 } 1635 } 1636 if (nameEndIdx > charIdx) { 1637 charIdx = nameEndIdx+1; 1638 1639 } 1640 break; 1641 } 1642 1643 1644 1645 1646 if (testString.compare(charIdx-1, 2, "<>") == 0) { 1647 charIdx++; 1648 int32_t breakIdx = tp.dataToBreak.length(); 1649 tp.expectedBreaks->setSize(breakIdx+1); 1650 tp.expectedBreaks->setElementAt(-1, breakIdx); 1651 tp.srcLine->setSize(breakIdx+1); 1652 tp.srcLine->setElementAt(lineNum, breakIdx); 1653 tp.srcCol ->setSize(breakIdx+1); 1654 tp.srcCol ->setElementAt(column, breakIdx); 1655 break; 1656 } 1657 1658 if (c == CH_LT) { 1659 tagValue = 0; 1660 parseState = PARSE_NUM; 1661 break; 1662 } 1663 1664 if (c == CH_HASH && column==3) { // TODO: why is column off so far? 1665 parseState = PARSE_COMMENT; 1666 savedState = PARSE_DATA; 1667 break; 1668 } 1669 1670 if (c == CH_BACKSLASH) { 1671 // Check for \ at end of line, a line continuation. 
1672 // Advance over (discard) the newline 1673 UChar32 cp = testString.char32At(charIdx); 1674 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) { 1675 // We have a CR LF 1676 // Need an extra increment of the input ptr to move over both of them 1677 charIdx++; 1678 } 1679 if (cp == CH_LF || cp == CH_CR) { 1680 lineNum++; 1681 colStart = charIdx; 1682 charIdx++; 1683 break; 1684 } 1685 1686 // Let unescape handle the back slash. 1687 cp = testString.unescapeAt(charIdx); 1688 if (cp != -1) { 1689 // Escape sequence was recognized. Insert the char 1690 // into the test data. 1691 tp.dataToBreak.append(cp); 1692 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1693 tp.srcLine->addElement(lineNum, status); 1694 tp.srcCol ->addElement(column, status); 1695 } 1696 break; 1697 } 1698 1699 1700 // Not a recognized backslash escape sequence. 1701 // Take the next char as a literal. 1702 // TODO: Should this be an error? 1703 c = testString.charAt(charIdx); 1704 charIdx = testString.moveIndex32(charIdx, 1); 1705 } 1706 1707 // Normal, non-escaped data char. 1708 tp.dataToBreak.append(c); 1709 1710 // Save the mapping from offset in the data to line/column numbers in 1711 // the original input file. Will be used for better error messages only. 1712 // If there's an expected break before this char, the slot in the mapping 1713 // vector will already be set for this char; don't overwrite it. 1714 if (tp.dataToBreak.length() > tp.srcLine->size()) { 1715 tp.srcLine->addElement(lineNum, status); 1716 tp.srcCol ->addElement(column, status); 1717 } 1718 break; 1719 1720 1721 case PARSE_NUM: 1722 // We are parsing an expected numeric tag value, like <1234>, 1723 // within a chunk of data. 1724 if (u_isUWhiteSpace(c)) { 1725 break; 1726 } 1727 1728 if (c == CH_GT) { 1729 // Finished the number. Add the info to the expected break data, 1730 // and switch parse state back to doing plain data. 
1731 parseState = PARSE_DATA; 1732 if (tagValue == 0) { 1733 tagValue = -1; 1734 } 1735 int32_t breakIdx = tp.dataToBreak.length(); 1736 tp.expectedBreaks->setSize(breakIdx+1); 1737 tp.expectedBreaks->setElementAt(tagValue, breakIdx); 1738 tp.srcLine->setSize(breakIdx+1); 1739 tp.srcLine->setElementAt(lineNum, breakIdx); 1740 tp.srcCol ->setSize(breakIdx+1); 1741 tp.srcCol ->setElementAt(column, breakIdx); 1742 break; 1743 } 1744 1745 if (u_isdigit(c)) { 1746 tagValue = tagValue*10 + u_charDigitValue(c); 1747 break; 1748 } 1749 1750 errln("Syntax Error in test file at line %d, col %d", 1751 lineNum, column); 1752 parseState = PARSE_COMMENT; 1753 goto end_test; // Stop the test 1754 break; 1755 } 1756 1757 1758 if (U_FAILURE(status)) { 1759 errln("ICU Error %s while parsing test file at line %d.", 1760 u_errorName(status), lineNum); 1761 status = U_ZERO_ERROR; 1762 goto end_test; // Stop the test 1763 } 1764 1765 } 1766 1767end_test: 1768 delete tp.bi; 1769 delete tp.expectedBreaks; 1770 delete tp.srcLine; 1771 delete tp.srcCol; 1772 delete [] testFile; 1773#endif 1774} 1775 1776void RBBITest::TestThaiBreaks() { 1777 UErrorCode status=U_ZERO_ERROR; 1778 BreakIterator* b; 1779 Locale locale = Locale("th"); 1780 int32_t p, index; 1781 UChar c[]= { 1782 0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, 1783 0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, 1784 0x0E16, 0x0E49, 0x0E33 1785 }; 1786 int32_t expectedWordResult[] = { 1787 2, 3, 6, 10, 11, 15, 17, 20, 22 1788 }; 1789 int32_t expectedLineResult[] = { 1790 3, 6, 11, 15, 17, 20, 22 1791 }; 1792 int32_t size = sizeof(c)/sizeof(UChar); 1793 UnicodeString text=UnicodeString(c); 1794 1795 b = BreakIterator::createWordInstance(locale, status); 1796 if (U_FAILURE(status)) { 1797 errcheckln(status, "Unable to create thai word break iterator. 
- %s", u_errorName(status)); 1798 return; 1799 } 1800 b->setText(text); 1801 p = index = 0; 1802 while ((p=b->next())!=BreakIterator::DONE && p < size) { 1803 if (p != expectedWordResult[index++]) { 1804 errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p); 1805 } 1806 } 1807 delete b; 1808 1809 b = BreakIterator::createLineInstance(locale, status); 1810 if (U_FAILURE(status)) { 1811 printf("Unable to create thai line break iterator.\n"); 1812 return; 1813 } 1814 b->setText(text); 1815 p = index = 0; 1816 while ((p=b->next())!=BreakIterator::DONE && p < size) { 1817 if (p != expectedLineResult[index++]) { 1818 errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p); 1819 } 1820 } 1821 1822 delete b; 1823} 1824 1825// UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX" 1826// Words don't include colon or period (cldrbug #1969). 1827static const char posxWordText[] = "Can't have breaks in xx:yy or struct.field for CS-types."; 1828static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 }; 1829static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 }; 1830 1831// UBreakIteratorType UBRK_WORD, Locale "ja" 1832// Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009). 
// Escaped test text; tbItems pairs it with the two offset arrays that follow:
//   ...TOffsets are passed as the tailored offsets, ...ROffsets as the root offsets.
static const char    jaWordText[]     = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
                                        "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 };
static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };

// UBreakIteratorType UBRK_SENTENCE, Locale "el"
// Add break after Greek question mark (cldrbug #2069).
// Same layout as above: escaped text, tailored offsets, root offsets.
static const char    elSentText[]     = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
                                        "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 };
static const int32_t elSentROffsets[] = {        20, 27, 35, 36 };

// UBreakIteratorType UBRK_CHARACTER, Locale "th"
// Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
1847static const char thCharText[] = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 " 1848 "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) " 1849 "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 "; 1850static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 1851 12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28, 1852 29, 30, 32, 33, 35, 37, 38, 39, 40, 41 }; 1853static const int32_t thCharROffsets[] = { 1, 3, 5, 6, 7, 8, 9, 11, 1854 12, 13, 15, 17, 19, 20, 22, 24, 26, 27, 28, 1855 29, 32, 33, 35, 37, 38, 40, 41 }; 1856 1857typedef struct { 1858 UBreakIteratorType type; 1859 const char * locale; 1860 const char * escapedText; 1861 const int32_t * tailoredOffsets; 1862 int32_t tailoredOffsetsCount; 1863 const int32_t * rootOffsets; 1864 int32_t rootOffsetsCount; 1865} TailoredBreakItem; 1866 1867#define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0])) 1868 1869static const TailoredBreakItem tbItems[] = { 1870 { UBRK_WORD, "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) }, 1871 { UBRK_WORD, "ja", jaWordText, ARRAY_PTR_LEN(jaWordTOffsets), ARRAY_PTR_LEN(jaWordROffsets) }, 1872 { UBRK_SENTENCE, "el", elSentText, ARRAY_PTR_LEN(elSentTOffsets), ARRAY_PTR_LEN(elSentROffsets) }, 1873 { UBRK_CHARACTER, "th", thCharText, ARRAY_PTR_LEN(thCharTOffsets), ARRAY_PTR_LEN(thCharROffsets) }, 1874 { UBRK_CHARACTER, NULL, NULL, NULL,0, NULL,0 } // terminator 1875}; 1876 1877static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) { 1878 while (count-- > 0) { 1879 int writeCount; 1880 sprintf(buffer, /* buflen, */ " %d%n", *offsets++, &writeCount); /* wants to be snprintf */ 1881 buffer += writeCount; 1882 buflen -= writeCount; 1883 } 1884} 1885 1886enum { kMaxOffsetCount = 128 }; 1887 1888void RBBITest::TBTest(BreakIterator* brkitr, int type, const char 
*locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) { 1889 brkitr->setText( CharsToUnicodeString(escapedText) ); 1890 int32_t foundOffsets[kMaxOffsetCount]; 1891 int32_t offset, foundOffsetsCount = 0; 1892 // do forwards iteration test 1893 while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) { 1894 foundOffsets[foundOffsetsCount++] = offset; 1895 } 1896 if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) { 1897 // log error for forwards test 1898 char formatExpect[512], formatFound[512]; 1899 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets); 1900 formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets); 1901 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n", 1902 type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound); 1903 } else { 1904 // do backwards iteration test 1905 --foundOffsetsCount; // back off one from the end offset 1906 while ( foundOffsetsCount > 0 ) { 1907 offset = brkitr->previous(); 1908 if ( offset != foundOffsets[--foundOffsetsCount] ) { 1909 // log error for backwards test 1910 char formatExpect[512]; 1911 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets); 1912 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n", 1913 type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]); 1914 break; 1915 } 1916 } 1917 } 1918} 1919 1920void RBBITest::TestTailoredBreaks() { 1921 const TailoredBreakItem * tbItemPtr; 1922 Locale rootLocale = Locale("root"); 1923 for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) { 1924 Locale testLocale = Locale(tbItemPtr->locale); 1925 BreakIterator * tailoredBrkiter; 1926 
BreakIterator * rootBrkiter; 1927 UErrorCode status = U_ZERO_ERROR; 1928 switch (tbItemPtr->type) { 1929 case UBRK_CHARACTER: 1930 tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status); 1931 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status); 1932 break; 1933 case UBRK_WORD: 1934 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status); 1935 rootBrkiter = BreakIterator::createWordInstance(rootLocale, status); 1936 break; 1937 case UBRK_LINE: 1938 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status); 1939 rootBrkiter = BreakIterator::createLineInstance(rootLocale, status); 1940 break; 1941 case UBRK_SENTENCE: 1942 tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status); 1943 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status); 1944 break; 1945 default: 1946 status = U_UNSUPPORTED_ERROR; 1947 break; 1948 } 1949 if (U_FAILURE(status)) { 1950 errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status)); 1951 continue; 1952 } 1953 TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount); 1954 TBTest(rootBrkiter, (int)(tbItemPtr->type), "root", tbItemPtr->escapedText, tbItemPtr->rootOffsets, tbItemPtr->rootOffsetsCount); 1955 1956 delete rootBrkiter; 1957 delete tailoredBrkiter; 1958 } 1959} 1960 1961 1962//------------------------------------------------------------------------------- 1963// 1964// ReadAndConvertFile Read a text data file, convert it to UChars, and 1965// return the datain one big UChar * buffer, which the caller must delete. 1966// 1967// parameters: 1968// fileName: the name of the file, with no directory part. The test data directory 1969// is assumed. 1970// ulen an out parameter, receives the actual length (in UChars) of the file data. 
1971// encoding The file encoding. If the file contains a BOM, that will override the encoding 1972// specified here. The BOM, if it exists, will be stripped from the returned data. 1973// Pass NULL for the system default encoding. 1974// status 1975// returns: 1976// The file data, converted to UChar. 1977// The caller must delete this when done with 1978// delete [] theBuffer; 1979// 1980// TODO: This is a clone of RegexTest::ReadAndConvertFile. 1981// Move this function to some common place. 1982// 1983//-------------------------------------------------------------------------------- 1984UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 1985 UChar *retPtr = NULL; 1986 char *fileBuf = NULL; 1987 UConverter* conv = NULL; 1988 FILE *f = NULL; 1989 1990 ulen = 0; 1991 if (U_FAILURE(status)) { 1992 return retPtr; 1993 } 1994 1995 // 1996 // Open the file. 1997 // 1998 f = fopen(fileName, "rb"); 1999 if (f == 0) { 2000 dataerrln("Error opening test data file %s\n", fileName); 2001 status = U_FILE_ACCESS_ERROR; 2002 return NULL; 2003 } 2004 // 2005 // Read it in 2006 // 2007 int fileSize; 2008 int amt_read; 2009 2010 fseek( f, 0, SEEK_END); 2011 fileSize = ftell(f); 2012 fileBuf = new char[fileSize]; 2013 fseek(f, 0, SEEK_SET); 2014 amt_read = fread(fileBuf, 1, fileSize, f); 2015 if (amt_read != fileSize || fileSize <= 0) { 2016 errln("Error reading test data file."); 2017 goto cleanUpAndReturn; 2018 } 2019 2020 // 2021 // Look for a Unicode Signature (BOM) on the data just read 2022 // 2023 int32_t signatureLength; 2024 const char * fileBufC; 2025 const char* bomEncoding; 2026 2027 fileBufC = fileBuf; 2028 bomEncoding = ucnv_detectUnicodeSignature( 2029 fileBuf, fileSize, &signatureLength, &status); 2030 if(bomEncoding!=NULL ){ 2031 fileBufC += signatureLength; 2032 fileSize -= signatureLength; 2033 encoding = bomEncoding; 2034 } 2035 2036 // 2037 // Open a converter to take the rule file to UTF-16 2038 // 
2039 conv = ucnv_open(encoding, &status); 2040 if (U_FAILURE(status)) { 2041 goto cleanUpAndReturn; 2042 } 2043 2044 // 2045 // Convert the rules to UChar. 2046 // Preflight first to determine required buffer size. 2047 // 2048 ulen = ucnv_toUChars(conv, 2049 NULL, // dest, 2050 0, // destCapacity, 2051 fileBufC, 2052 fileSize, 2053 &status); 2054 if (status == U_BUFFER_OVERFLOW_ERROR) { 2055 // Buffer Overflow is expected from the preflight operation. 2056 status = U_ZERO_ERROR; 2057 2058 retPtr = new UChar[ulen+1]; 2059 ucnv_toUChars(conv, 2060 retPtr, // dest, 2061 ulen+1, 2062 fileBufC, 2063 fileSize, 2064 &status); 2065 } 2066 2067cleanUpAndReturn: 2068 fclose(f); 2069 delete []fileBuf; 2070 ucnv_close(conv); 2071 if (U_FAILURE(status)) { 2072 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 2073 delete retPtr; 2074 retPtr = 0; 2075 ulen = 0; 2076 }; 2077 return retPtr; 2078} 2079 2080 2081 2082//-------------------------------------------------------------------------------------------- 2083// 2084// Run tests from each of the boundary test data files distributed by the Unicode Consortium 2085// 2086//------------------------------------------------------------------------------------------- 2087void RBBITest::TestUnicodeFiles() { 2088 RuleBasedBreakIterator *bi; 2089 UErrorCode status = U_ZERO_ERROR; 2090 2091 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getDefault(), status); 2092 TEST_ASSERT_SUCCESS(status); 2093 if (U_SUCCESS(status)) { 2094 runUnicodeTestData("GraphemeBreakTest.txt", bi); 2095 } 2096 delete bi; 2097 2098 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status); 2099 TEST_ASSERT_SUCCESS(status); 2100 if (U_SUCCESS(status)) { 2101 runUnicodeTestData("WordBreakTest.txt", bi); 2102 } 2103 delete bi; 2104 2105 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 2106 TEST_ASSERT_SUCCESS(status); 2107 if 
(U_SUCCESS(status)) { 2108 runUnicodeTestData("SentenceBreakTest.txt", bi); 2109 } 2110 delete bi; 2111 2112 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 2113 TEST_ASSERT_SUCCESS(status); 2114 if (U_SUCCESS(status)) { 2115 runUnicodeTestData("LineBreakTest.txt", bi); 2116 } 2117 delete bi; 2118} 2119 2120 2121//-------------------------------------------------------------------------------------------- 2122// 2123// Run tests from one of the boundary test data files distributed by the Unicode Consortium 2124// 2125//------------------------------------------------------------------------------------------- 2126void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { 2127#if !UCONFIG_NO_REGULAR_EXPRESSIONS 2128 UErrorCode status = U_ZERO_ERROR; 2129 2130 // 2131 // Open and read the test data file, put it into a UnicodeString. 2132 // 2133 const char *testDataDirectory = IntlTest::getSourceTestData(status); 2134 char testFileName[1000]; 2135 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 2136 dataerrln("Can't open test data. Path too long."); 2137 return; 2138 } 2139 strcpy(testFileName, testDataDirectory); 2140 strcat(testFileName, fileName); 2141 2142 logln("Opening data file %s\n", fileName); 2143 2144 int len; 2145 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 2146 if (status != U_FILE_ACCESS_ERROR) { 2147 TEST_ASSERT_SUCCESS(status); 2148 TEST_ASSERT(testFile != NULL); 2149 } 2150 if (U_FAILURE(status) || testFile == NULL) { 2151 return; /* something went wrong, error already output */ 2152 } 2153 UnicodeString testFileAsString(TRUE, testFile, len); 2154 2155 // 2156 // Parse the test data file using a regular expression. 2157 // Each kind of token is recognized in its own capture group; what type of item was scanned 2158 // is identified by which group had a match. 
2159 // 2160 // Caputure Group # 1 2 3 4 5 2161 // Parses this item: divide x hex digits comment \n unrecognized \n 2162 // 2163 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 2164 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 2165 UnicodeString testString; 2166 UVector32 breakPositions(status); 2167 int lineNumber = 1; 2168 TEST_ASSERT_SUCCESS(status); 2169 if (U_FAILURE(status)) { 2170 return; 2171 } 2172 2173 // 2174 // Scan through each test case, building up the string to be broken in testString, 2175 // and the positions that should be boundaries in the breakPositions vector. 2176 // 2177 while (tokenMatcher.find()) { 2178 if (tokenMatcher.start(1, status) >= 0) { 2179 // Scanned a divide sign, indicating a break position in the test data. 2180 if (testString.length()>0) { 2181 breakPositions.addElement(testString.length(), status); 2182 } 2183 } 2184 else if (tokenMatcher.start(2, status) >= 0) { 2185 // Scanned an 'x', meaning no break at this position in the test data 2186 // Nothing to be done here. 2187 } 2188 else if (tokenMatcher.start(3, status) >= 0) { 2189 // Scanned Hex digits. Convert them to binary, append to the character data string. 2190 const UnicodeString &hexNumber = tokenMatcher.group(3, status); 2191 int length = hexNumber.length(); 2192 if (length<=8) { 2193 char buf[10]; 2194 hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 2195 UChar32 c = (UChar32)strtol(buf, NULL, 16); 2196 if (c<=0x10ffff) { 2197 testString.append(c); 2198 } else { 2199 errln("Error: Unicode Character value out of range. 
\'%s\', line %d.\n", 2200 fileName, lineNumber); 2201 } 2202 } else { 2203 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 2204 fileName, lineNumber); 2205 } 2206 } 2207 else if (tokenMatcher.start(4, status) >= 0) { 2208 // Scanned to end of a line, possibly skipping over a comment in the process. 2209 // If the line from the file contained test data, run the test now. 2210 // 2211 if (testString.length() > 0) { 2212 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 2213 } 2214 2215 // Clear out this test case. 2216 // The string and breakPositions vector will be refilled as the next 2217 // test case is parsed. 2218 testString.remove(); 2219 breakPositions.removeAllElements(); 2220 lineNumber++; 2221 } else { 2222 // Scanner catchall. Something unrecognized appeared on the line. 2223 char token[16]; 2224 UnicodeString uToken = tokenMatcher.group(0, status); 2225 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 2226 token[sizeof(token)-1] = 0; 2227 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token); 2228 2229 // Clean up, in preparation for continuing with the next line. 2230 testString.remove(); 2231 breakPositions.removeAllElements(); 2232 lineNumber++; 2233 } 2234 TEST_ASSERT_SUCCESS(status); 2235 if (U_FAILURE(status)) { 2236 break; 2237 } 2238 } 2239 2240 delete [] testFile; 2241 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 2242} 2243 2244//-------------------------------------------------------------------------------------------- 2245// 2246// checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 2247// test data files. Do only a simple, forward-only check - 2248// this test is mostly to check that ICU and the Unicode 2249// data agree with each other. 
2250// 2251//-------------------------------------------------------------------------------------------- 2252void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 2253 const UnicodeString &testString, // Text data to be broken 2254 UVector32 *breakPositions, // Positions where breaks should be found. 2255 RuleBasedBreakIterator *bi) { 2256 int32_t pos; // Break Position in the test string 2257 int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 2258 int32_t expectedPos; // Expected break position (index into test string) 2259 2260 bi->setText(testString); 2261 pos = bi->first(); 2262 pos = bi->next(); 2263 2264 while (pos != BreakIterator::DONE) { 2265 if (expectedI >= breakPositions->size()) { 2266 errln("Test file \"%s\", line %d, unexpected break found at position %d", 2267 testFileName, lineNumber, pos); 2268 break; 2269 } 2270 expectedPos = breakPositions->elementAti(expectedI); 2271 if (pos < expectedPos) { 2272 errln("Test file \"%s\", line %d, unexpected break found at position %d", 2273 testFileName, lineNumber, pos); 2274 break; 2275 } 2276 if (pos > expectedPos) { 2277 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 2278 testFileName, lineNumber, expectedPos); 2279 break; 2280 } 2281 pos = bi->next(); 2282 expectedI++; 2283 } 2284 2285 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) { 2286 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 2287 testFileName, lineNumber, breakPositions->elementAti(expectedI)); 2288 } 2289} 2290 2291 2292 2293#if !UCONFIG_NO_REGULAR_EXPRESSIONS 2294//--------------------------------------------------------------------------------------- 2295// 2296// classs RBBIMonkeyKind 2297// 2298// Monkey Test for Break Iteration 2299// Abstract interface class. Concrete derived classes independently 2300// implement the break rules for different iterator types. 
2301// 2302// The Monkey Test itself uses doesn't know which type of break iterator it is 2303// testing, but works purely in terms of the interface defined here. 2304// 2305//--------------------------------------------------------------------------------------- 2306class RBBIMonkeyKind { 2307public: 2308 // Return a UVector of UnicodeSets, representing the character classes used 2309 // for this type of iterator. 2310 virtual UVector *charClasses() = 0; 2311 2312 // Set the test text on which subsequent calls to next() will operate 2313 virtual void setText(const UnicodeString &s) = 0; 2314 2315 // Find the next break postion, starting from the prev break position, or from zero. 2316 // Return -1 after reaching end of string. 2317 virtual int32_t next(int32_t i) = 0; 2318 2319 virtual ~RBBIMonkeyKind(); 2320 UErrorCode deferredStatus; 2321 2322 2323protected: 2324 RBBIMonkeyKind(); 2325 2326private: 2327}; 2328 2329RBBIMonkeyKind::RBBIMonkeyKind() { 2330 deferredStatus = U_ZERO_ERROR; 2331} 2332 2333RBBIMonkeyKind::~RBBIMonkeyKind() { 2334} 2335 2336 2337//---------------------------------------------------------------------------------------- 2338// 2339// Random Numbers. Similar to standard lib rand() and srand() 2340// Not using library to 2341// 1. Get same results on all platforms. 2342// 2. Get access to current seed, to more easily reproduce failures. 2343// 2344//--------------------------------------------------------------------------------------- 2345static uint32_t m_seed = 1; 2346 2347static uint32_t m_rand() 2348{ 2349 m_seed = m_seed * 1103515245 + 12345; 2350 return (uint32_t)(m_seed/65536) % 32768; 2351} 2352 2353 2354//------------------------------------------------------------------------------------------ 2355// 2356// class RBBICharMonkey Character (Grapheme Cluster) specific implementation 2357// of RBBIMonkeyKind. 
2358// 2359//------------------------------------------------------------------------------------------ 2360class RBBICharMonkey: public RBBIMonkeyKind { 2361public: 2362 RBBICharMonkey(); 2363 virtual ~RBBICharMonkey(); 2364 virtual UVector *charClasses(); 2365 virtual void setText(const UnicodeString &s); 2366 virtual int32_t next(int32_t i); 2367private: 2368 UVector *fSets; 2369 2370 UnicodeSet *fCRLFSet; 2371 UnicodeSet *fControlSet; 2372 UnicodeSet *fExtendSet; 2373 UnicodeSet *fPrependSet; 2374 UnicodeSet *fSpacingSet; 2375 UnicodeSet *fLSet; 2376 UnicodeSet *fVSet; 2377 UnicodeSet *fTSet; 2378 UnicodeSet *fLVSet; 2379 UnicodeSet *fLVTSet; 2380 UnicodeSet *fHangulSet; 2381 UnicodeSet *fAnySet; 2382 2383 const UnicodeString *fText; 2384}; 2385 2386 2387RBBICharMonkey::RBBICharMonkey() { 2388 UErrorCode status = U_ZERO_ERROR; 2389 2390 fText = NULL; 2391 2392 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status); 2393 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status); 2394 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status); 2395 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status); 2396 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status); 2397 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status); 2398 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status); 2399 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status); 2400 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status); 2401 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status); 2402 fHangulSet = new UnicodeSet(); 2403 fHangulSet->addAll(*fLSet); 2404 fHangulSet->addAll(*fVSet); 2405 
fHangulSet->addAll(*fTSet); 2406 fHangulSet->addAll(*fLVSet); 2407 fHangulSet->addAll(*fLVTSet); 2408 fAnySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status); 2409 2410 fSets = new UVector(status); 2411 fSets->addElement(fCRLFSet, status); 2412 fSets->addElement(fControlSet, status); 2413 fSets->addElement(fExtendSet, status); 2414 fSets->addElement(fPrependSet, status); 2415 fSets->addElement(fSpacingSet, status); 2416 fSets->addElement(fHangulSet, status); 2417 fSets->addElement(fAnySet, status); 2418 if (U_FAILURE(status)) { 2419 deferredStatus = status; 2420 } 2421} 2422 2423 2424void RBBICharMonkey::setText(const UnicodeString &s) { 2425 fText = &s; 2426} 2427 2428 2429 2430int32_t RBBICharMonkey::next(int32_t prevPos) { 2431 int p0, p1, p2, p3; // Indices of the significant code points around the 2432 // break position being tested. The candidate break 2433 // location is before p2. 2434 2435 int breakPos = -1; 2436 2437 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2438 2439 if (U_FAILURE(deferredStatus)) { 2440 return -1; 2441 } 2442 2443 // Previous break at end of string. return DONE. 2444 if (prevPos >= fText->length()) { 2445 return -1; 2446 } 2447 p0 = p1 = p2 = p3 = prevPos; 2448 c3 = fText->char32At(prevPos); 2449 c0 = c1 = c2 = 0; 2450 2451 // Loop runs once per "significant" character position in the input text. 2452 for (;;) { 2453 // Move all of the positions forward in the input string. 2454 p0 = p1; c0 = c1; 2455 p1 = p2; c1 = c2; 2456 p2 = p3; c2 = c3; 2457 2458 // Advancd p3 by one codepoint 2459 p3 = fText->moveIndex32(p3, 1); 2460 c3 = fText->char32At(p3); 2461 2462 if (p1 == p2) { 2463 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2464 continue; 2465 } 2466 if (p2 == fText->length()) { 2467 // Reached end of string. Always a break position. 
2468 break; 2469 } 2470 2471 // Rule GB3 CR x LF 2472 // No Extend or Format characters may appear between the CR and LF, 2473 // which requires the additional check for p2 immediately following p1. 2474 // 2475 if (c1==0x0D && c2==0x0A && p1==(p2-1)) { 2476 continue; 2477 } 2478 2479 // Rule (GB4). ( Control | CR | LF ) <break> 2480 if (fControlSet->contains(c1) || 2481 c1 == 0x0D || 2482 c1 == 0x0A) { 2483 break; 2484 } 2485 2486 // Rule (GB5) <break> ( Control | CR | LF ) 2487 // 2488 if (fControlSet->contains(c2) || 2489 c2 == 0x0D || 2490 c2 == 0x0A) { 2491 break; 2492 } 2493 2494 2495 // Rule (GB6) L x ( L | V | LV | LVT ) 2496 if (fLSet->contains(c1) && 2497 (fLSet->contains(c2) || 2498 fVSet->contains(c2) || 2499 fLVSet->contains(c2) || 2500 fLVTSet->contains(c2))) { 2501 continue; 2502 } 2503 2504 // Rule (GB7) ( LV | V ) x ( V | T ) 2505 if ((fLVSet->contains(c1) || fVSet->contains(c1)) && 2506 (fVSet->contains(c2) || fTSet->contains(c2))) { 2507 continue; 2508 } 2509 2510 // Rule (GB8) ( LVT | T) x T 2511 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) && 2512 fTSet->contains(c2)) { 2513 continue; 2514 } 2515 2516 // Rule (GB9) Numeric x ALetter 2517 if (fExtendSet->contains(c2)) { 2518 continue; 2519 } 2520 2521 // Rule (GB9a) x SpacingMark 2522 if (fSpacingSet->contains(c2)) { 2523 continue; 2524 } 2525 2526 // Rule (GB9b) Prepend x 2527 if (fPrependSet->contains(c1)) { 2528 continue; 2529 } 2530 2531 // Rule (GB10) Any <break> Any 2532 break; 2533 } 2534 2535 breakPos = p2; 2536 return breakPos; 2537} 2538 2539 2540 2541UVector *RBBICharMonkey::charClasses() { 2542 return fSets; 2543} 2544 2545 2546RBBICharMonkey::~RBBICharMonkey() { 2547 delete fSets; 2548 delete fCRLFSet; 2549 delete fControlSet; 2550 delete fExtendSet; 2551 delete fPrependSet; 2552 delete fSpacingSet; 2553 delete fLSet; 2554 delete fVSet; 2555 delete fTSet; 2556 delete fLVSet; 2557 delete fLVTSet; 2558 delete fHangulSet; 2559 delete fAnySet; 2560} 2561 
2562//------------------------------------------------------------------------------------------ 2563// 2564// class RBBIWordMonkey Word Break specific implementation 2565// of RBBIMonkeyKind. 2566// 2567//------------------------------------------------------------------------------------------ 2568class RBBIWordMonkey: public RBBIMonkeyKind { 2569public: 2570 RBBIWordMonkey(); 2571 virtual ~RBBIWordMonkey(); 2572 virtual UVector *charClasses(); 2573 virtual void setText(const UnicodeString &s); 2574 virtual int32_t next(int32_t i); 2575private: 2576 UVector *fSets; 2577 2578 UnicodeSet *fCRSet; 2579 UnicodeSet *fLFSet; 2580 UnicodeSet *fNewlineSet; 2581 UnicodeSet *fKatakanaSet; 2582 UnicodeSet *fALetterSet; 2583 UnicodeSet *fMidNumLetSet; 2584 UnicodeSet *fMidLetterSet; 2585 UnicodeSet *fMidNumSet; 2586 UnicodeSet *fNumericSet; 2587 UnicodeSet *fFormatSet; 2588 UnicodeSet *fOtherSet; 2589 UnicodeSet *fExtendSet; 2590 UnicodeSet *fExtendNumLetSet; 2591 2592 RegexMatcher *fMatcher; 2593 2594 const UnicodeString *fText; 2595}; 2596 2597 2598RBBIWordMonkey::RBBIWordMonkey() 2599{ 2600 UErrorCode status = U_ZERO_ERROR; 2601 2602 fSets = new UVector(status); 2603 2604 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); 2605 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); 2606 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); 2607 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 2608 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); 2609 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); 2610 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); 2611 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); 2612 fNumericSet = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); 2613 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); 2614 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); 2615 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); 2616 2617 fOtherSet = new UnicodeSet(); 2618 if(U_FAILURE(status)) { 2619 deferredStatus = status; 2620 return; 2621 } 2622 2623 fOtherSet->complement(); 2624 fOtherSet->removeAll(*fCRSet); 2625 fOtherSet->removeAll(*fLFSet); 2626 fOtherSet->removeAll(*fNewlineSet); 2627 fOtherSet->removeAll(*fKatakanaSet); 2628 fOtherSet->removeAll(*fALetterSet); 2629 fOtherSet->removeAll(*fMidLetterSet); 2630 fOtherSet->removeAll(*fMidNumSet); 2631 fOtherSet->removeAll(*fNumericSet); 2632 fOtherSet->removeAll(*fExtendNumLetSet); 2633 fOtherSet->removeAll(*fFormatSet); 2634 fOtherSet->removeAll(*fExtendSet); 2635 // Inhibit dictionary characters from being tested at all. 
2636 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); 2637 2638 fSets->addElement(fCRSet, status); 2639 fSets->addElement(fLFSet, status); 2640 fSets->addElement(fNewlineSet, status); 2641 fSets->addElement(fALetterSet, status); 2642 fSets->addElement(fKatakanaSet, status); 2643 fSets->addElement(fMidLetterSet, status); 2644 fSets->addElement(fMidNumLetSet, status); 2645 fSets->addElement(fMidNumSet, status); 2646 fSets->addElement(fNumericSet, status); 2647 fSets->addElement(fFormatSet, status); 2648 fSets->addElement(fExtendSet, status); 2649 fSets->addElement(fOtherSet, status); 2650 fSets->addElement(fExtendNumLetSet, status); 2651 2652 if (U_FAILURE(status)) { 2653 deferredStatus = status; 2654 } 2655} 2656 2657void RBBIWordMonkey::setText(const UnicodeString &s) { 2658 fText = &s; 2659} 2660 2661 2662int32_t RBBIWordMonkey::next(int32_t prevPos) { 2663 int p0, p1, p2, p3; // Indices of the significant code points around the 2664 // break position being tested. The candidate break 2665 // location is before p2. 2666 2667 int breakPos = -1; 2668 2669 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2670 2671 if (U_FAILURE(deferredStatus)) { 2672 return -1; 2673 } 2674 2675 // Prev break at end of string. return DONE. 2676 if (prevPos >= fText->length()) { 2677 return -1; 2678 } 2679 p0 = p1 = p2 = p3 = prevPos; 2680 c3 = fText->char32At(prevPos); 2681 c0 = c1 = c2 = 0; 2682 2683 // Loop runs once per "significant" character position in the input text. 2684 for (;;) { 2685 // Move all of the positions forward in the input string. 2686 p0 = p1; c0 = c1; 2687 p1 = p2; c1 = c2; 2688 p2 = p3; c2 = c3; 2689 2690 // Advancd p3 by X(Extend | Format)* Rule 4 2691 // But do not advance over Extend & Format following a new line. 
(Unicode 5.1 change) 2692 do { 2693 p3 = fText->moveIndex32(p3, 1); 2694 c3 = fText->char32At(p3); 2695 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2696 break; 2697 }; 2698 } 2699 while (fFormatSet->contains(c3) || fExtendSet->contains(c3)); 2700 2701 2702 if (p1 == p2) { 2703 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2704 continue; 2705 } 2706 if (p2 == fText->length()) { 2707 // Reached end of string. Always a break position. 2708 break; 2709 } 2710 2711 // Rule (3) CR x LF 2712 // No Extend or Format characters may appear between the CR and LF, 2713 // which requires the additional check for p2 immediately following p1. 2714 // 2715 if (c1==0x0D && c2==0x0A) { 2716 continue; 2717 } 2718 2719 // Rule (3a) Break before and after newlines (including CR and LF) 2720 // 2721 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { 2722 break; 2723 }; 2724 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2725 break; 2726 }; 2727 2728 // Rule (5). 
ALetter x ALetter 2729 if (fALetterSet->contains(c1) && 2730 fALetterSet->contains(c2)) { 2731 continue; 2732 } 2733 2734 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter 2735 // 2736 if ( fALetterSet->contains(c1) && 2737 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) && 2738 fALetterSet->contains(c3)) { 2739 continue; 2740 } 2741 2742 2743 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter 2744 if (fALetterSet->contains(c0) && 2745 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1)) && 2746 fALetterSet->contains(c2)) { 2747 continue; 2748 } 2749 2750 // Rule (8) Numeric x Numeric 2751 if (fNumericSet->contains(c1) && 2752 fNumericSet->contains(c2)) { 2753 continue; 2754 } 2755 2756 // Rule (9) ALetter x Numeric 2757 if (fALetterSet->contains(c1) && 2758 fNumericSet->contains(c2)) { 2759 continue; 2760 } 2761 2762 // Rule (10) Numeric x ALetter 2763 if (fNumericSet->contains(c1) && 2764 fALetterSet->contains(c2)) { 2765 continue; 2766 } 2767 2768 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric 2769 if (fNumericSet->contains(c0) && 2770 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1)) && 2771 fNumericSet->contains(c2)) { 2772 continue; 2773 } 2774 2775 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric 2776 if (fNumericSet->contains(c1) && 2777 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2)) && 2778 fNumericSet->contains(c3)) { 2779 continue; 2780 } 2781 2782 // Rule (13) Katakana x Katakana 2783 if (fKatakanaSet->contains(c1) && 2784 fKatakanaSet->contains(c2)) { 2785 continue; 2786 } 2787 2788 // Rule 13a 2789 if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) || 2790 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && 2791 fExtendNumLetSet->contains(c2)) { 2792 continue; 2793 } 2794 2795 // Rule 13b 2796 if (fExtendNumLetSet->contains(c1) && 2797 (fALetterSet->contains(c2) || fNumericSet->contains(c2) || 2798 fKatakanaSet->contains(c2))) { 2799 continue; 2800 } 2801 2802 // Rule 14. 
//
//  RBBISentMonkey
//
//  Sentence-break implementation of RBBIMonkeyKind.  next() locates sentence
//  boundaries by testing the text directly against the UnicodeSets declared
//  below, one set per Sentence_Break character class.
//
class RBBISentMonkey: public RBBIMonkeyKind {
public:
    RBBISentMonkey();
    virtual          ~RBBISentMonkey();
    virtual  UVector *charClasses();                     // Returns fSets.
    virtual  void     setText(const UnicodeString &s);   // Stores the address of s in fText; no copy is made.
    virtual  int32_t  next(int32_t i);                   // Break following index i, or -1.
private:
    int               moveBack(int posFrom);      // Index of the preceding significant code point,
                                                  //   skipping Format and Extend; -1 when posFrom <= 0.
    int               moveForward(int posFrom);   // Index of the following significant code point,
                                                  //   skipping Format and Extend; the text length
                                                  //   when posFrom is at or past the end.
    UChar32           cAt(int pos);               // Code point at pos, or -1 when pos is out of range.

    UVector      *fSets;          // The character class sets below, as returned by charClasses().

    UnicodeSet  *fSepSet;         // Sentence_Break = Sep, plus U+000A and U+000D.
    UnicodeSet  *fFormatSet;      // Sentence_Break = Format
    UnicodeSet  *fSpSet;          // Sentence_Break = Sp
    UnicodeSet  *fLowerSet;       // Sentence_Break = Lower
    UnicodeSet  *fUpperSet;       // Sentence_Break = Upper
    UnicodeSet  *fOLetterSet;     // Sentence_Break = OLetter
    UnicodeSet  *fNumericSet;     // Sentence_Break = Numeric
    UnicodeSet  *fATermSet;       // Sentence_Break = ATerm
    UnicodeSet  *fSContinueSet;   // Sentence_Break = SContinue
    UnicodeSet  *fSTermSet;       // Sentence_Break = STerm
    UnicodeSet  *fCloseSet;       // Sentence_Break = Close
    UnicodeSet  *fOtherSet;       // Everything not contained in any of the other sets.
    UnicodeSet  *fExtendSet;      // Sentence_Break = Extend

    const UnicodeString  *fText;  // Text under test.  Set by setText(); not deleted by the destructor.

};
For the monkey impl, 2882 // they remain in SEP, since Sep always appears with CR and LF in the rules. 2883 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); 2884 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); 2885 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); 2886 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); 2887 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); 2888 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); 2889 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); 2890 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); 2891 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); 2892 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); 2893 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); 2894 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); 2895 fOtherSet = new UnicodeSet(); 2896 2897 if(U_FAILURE(status)) { 2898 deferredStatus = status; 2899 return; 2900 } 2901 2902 fOtherSet->complement(); 2903 fOtherSet->removeAll(*fSepSet); 2904 fOtherSet->removeAll(*fFormatSet); 2905 fOtherSet->removeAll(*fSpSet); 2906 fOtherSet->removeAll(*fLowerSet); 2907 fOtherSet->removeAll(*fUpperSet); 2908 fOtherSet->removeAll(*fOLetterSet); 2909 fOtherSet->removeAll(*fNumericSet); 2910 fOtherSet->removeAll(*fATermSet); 2911 fOtherSet->removeAll(*fSContinueSet); 2912 fOtherSet->removeAll(*fSTermSet); 2913 fOtherSet->removeAll(*fCloseSet); 2914 fOtherSet->removeAll(*fExtendSet); 2915 2916 fSets->addElement(fSepSet, status); 2917 
//
//  next()    Find the sentence break that follows prevPos by applying the
//            sentence break rules, in order, directly to fText.
//
//     prevPos   Index of the previous break; scanning starts here.
//     returns   Index of the next break, or -1 when deferredStatus holds a
//               failure or prevPos is at or beyond the end of the text.
//
//  The loop keeps a sliding window of four "significant" positions, p0..p3,
//  with their code points c0..c3.  The candidate break is always just before p2.
//  "continue" means "no break here, advance the window"; "break" leaves the
//  loop with the break position in p2.
//
int32_t RBBISentMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 c;

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    // All four window positions start at prevPos; only c3 holds a real code point.
    // c0..c2 are 0 until real characters have been shifted into them.
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by  X(Extend | Format)*   Rule 4
        //   moveForward() skips Format and Extend characters; cAt() yields -1 past the end.
        p3 = moveForward(p3);
        c3 = cAt(p3);

        // Rule (3)  CR x LF
        //   p2 == p1+1 requires the LF to follow the CR immediately, with no
        //   skipped Extend or Format characters in between.
        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
            continue;
        }

        // Rule (4).   Sep  <break>
        if (fSepSet->contains(c1)) {
            p2 = p1+1;   // Separators don't combine with Extend or Format.
            break;
        }

        if (p2 >= fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        if (p2 == prevPos) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }

        // Rule (6).   ATerm x Numeric
        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (7).  Upper ATerm  x  Upper
        if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
            continue;
        }

        // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
        //           Note:  STerm | ATerm are added to the negated part of the expression by a
        //                  note to the Unicode 5.0 documents.
        //   First scan backwards from p1 over Sp* then Close*, looking for the ATerm.
        int p8 = p1;
        while (fSpSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        while (fCloseSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        if (fATermSet->contains(cAt(p8))) {
            // Then scan forwards from p2 to the first character in the negated set
            //   (or the end of the text, where cAt() returns -1).
            p8=p2;
            for (;;) {
                c = cAt(p8);
                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
                    fLowerSet->contains(c) || fSepSet->contains(c) ||
                    fATermSet->contains(c) || fSTermSet->contains(c))  {
                    break;
                }
                p8 = moveForward(p8);
            }
            if (fLowerSet->contains(cAt(p8)))  {
                continue;
            }
        }

        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
            p8 = p1;
            while (fSpSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            while (fCloseSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            c = cAt(p8);
            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
                continue;
            }
        }

        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
        //   CR and LF need no separate test: the constructor adds them to fSepSet.
        int p9 = p1;
        while (fCloseSet->contains(cAt(p9))) {
            p9 = moveBack(p9);
        }
        c = cAt(p9);
        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (10)  (STerm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
        int p10 = p1;
        while (fSpSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        while (fCloseSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
        //   Scan backwards over at most one Sep, then Sp*, then Close*.
        int p11 = p1;
        if (fSepSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fSpSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fCloseSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
            break;
        }

        // Rule (12)  Any x Any
        continue;
    }
    breakPos = p2;
    return breakPos;
}
UnicodeSet *fOP; 3201 UnicodeSet *fQU; 3202 UnicodeSet *fIS; 3203 UnicodeSet *fNU; 3204 UnicodeSet *fPO; 3205 UnicodeSet *fPR; 3206 UnicodeSet *fSY; 3207 UnicodeSet *fAI; 3208 UnicodeSet *fAL; 3209 UnicodeSet *fID; 3210 UnicodeSet *fSA; 3211 UnicodeSet *fXX; 3212 3213 BreakIterator *fCharBI; 3214 3215 const UnicodeString *fText; 3216 int32_t *fOrigPositions; 3217 3218 RegexMatcher *fNumberMatcher; 3219 RegexMatcher *fLB11Matcher; 3220}; 3221 3222 3223RBBILineMonkey::RBBILineMonkey() 3224{ 3225 UErrorCode status = U_ZERO_ERROR; 3226 3227 fSets = new UVector(status); 3228 3229 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); 3230 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); 3231 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); 3232 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); 3233 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); 3234 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); 3235 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); 3236 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); 3237 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); 3238 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); 3239 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); 3240 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); 3241 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); 3242 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); 3243 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); 3244 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); 3245 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status); 3246 fEX = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 3247 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 3248 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 3249 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 3250 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 3251 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 3252 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 3253 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 3254 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 3255 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 3256 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 3257 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 3258 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 3259 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 3260 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 3261 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 3262 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status); 3263 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 3264 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 3265 3266 if (U_FAILURE(status)) { 3267 deferredStatus = status; 3268 fCharBI = NULL; 3269 fNumberMatcher = NULL; 3270 return; 3271 } 3272 3273 fAL->addAll(*fXX); // Default behavior for XX is identical to AL 3274 fAL->addAll(*fAI); // Default behavior for AI is identical to AL 3275 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL 3276 fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 
3277 3278 fSets->addElement(fBK, status); 3279 fSets->addElement(fCR, status); 3280 fSets->addElement(fLF, status); 3281 fSets->addElement(fCM, status); 3282 fSets->addElement(fNL, status); 3283 fSets->addElement(fWJ, status); 3284 fSets->addElement(fZW, status); 3285 fSets->addElement(fGL, status); 3286 fSets->addElement(fCB, status); 3287 fSets->addElement(fSP, status); 3288 fSets->addElement(fB2, status); 3289 fSets->addElement(fBA, status); 3290 fSets->addElement(fBB, status); 3291 fSets->addElement(fHY, status); 3292 fSets->addElement(fH2, status); 3293 fSets->addElement(fH3, status); 3294 fSets->addElement(fCL, status); 3295 fSets->addElement(fEX, status); 3296 fSets->addElement(fIN, status); 3297 fSets->addElement(fJL, status); 3298 fSets->addElement(fJT, status); 3299 fSets->addElement(fJV, status); 3300 fSets->addElement(fNS, status); 3301 fSets->addElement(fOP, status); 3302 fSets->addElement(fQU, status); 3303 fSets->addElement(fIS, status); 3304 fSets->addElement(fNU, status); 3305 fSets->addElement(fPO, status); 3306 fSets->addElement(fPR, status); 3307 fSets->addElement(fSY, status); 3308 fSets->addElement(fAI, status); 3309 fSets->addElement(fAL, status); 3310 fSets->addElement(fID, status); 3311 fSets->addElement(fWJ, status); 3312 fSets->addElement(fSA, status); 3313 fSets->addElement(fSG, status); 3314 3315 const char *rules = 3316 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" 3317 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" 3318 "\\p{Line_Break=NU}\\p{Line_Break=CM}*" 3319 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*" 3320 "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?" 
3321 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"; 3322 3323 fNumberMatcher = new RegexMatcher( 3324 UnicodeString(rules, -1, US_INV), 0, status); 3325 3326 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 3327 3328 if (U_FAILURE(status)) { 3329 deferredStatus = status; 3330 } 3331} 3332 3333 3334void RBBILineMonkey::setText(const UnicodeString &s) { 3335 fText = &s; 3336 fCharBI->setText(s); 3337 fNumberMatcher->reset(s); 3338} 3339 3340// 3341// rule9Adjust 3342// Line Break TR rules 9 and 10 implementation. 3343// This deals with combining marks and other sequences that 3344// that must be treated as if they were something other than what they actually are. 3345// 3346// This is factored out into a separate function because it must be applied twice for 3347// each potential break, once to the chars before the position being checked, then 3348// again to the text following the possible break. 3349// 3350void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) { 3351 if (pos == -1) { 3352 // Invalid initial position. Happens during the warmup iteration of the 3353 // main loop in next(). 3354 return; 3355 } 3356 3357 int32_t nPos = *nextPos; 3358 3359 // LB 9 Keep combining sequences together. 3360 // advance over any CM class chars. Note that Line Break CM is different 3361 // from the normal Grapheme Extend property. 3362 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d || 3363 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) { 3364 for (;;) { 3365 *nextChar = fText->char32At(nPos); 3366 if (!fCM->contains(*nextChar)) { 3367 break; 3368 } 3369 nPos = fText->moveIndex32(nPos, 1); 3370 } 3371 } 3372 3373 3374 // LB 9 Treat X CM* as if it were x. 3375 // No explicit action required. 
//
//  next()    Find the line break opportunity that follows startPos by applying
//            the line break rules, in order, directly to fText.
//
//     startPos  Index at which scanning starts.
//     returns   Index of the next break, or -1 when deferredStatus holds a
//               failure or startPos is at or beyond the end of the text.
//
//  Each loop iteration tests one candidate position: the boundary between
//  prevChar (at prevPos) and thisChar (at pos).  "continue" means "no break
//  here, move on to the next position"; "break" leaves the loop with the
//  break position in pos.
//
int32_t RBBILineMonkey::next(int32_t startPos) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t    pos;       //  Index of the char following a potential break position
    UChar32    thisChar;  //  Character at above position "pos"

    int32_t    prevPos;   //  Index of the char preceding a potential break position
    UChar32    prevChar;  //  Character at above position.  Note that prevChar
                          //   and thisChar may not be adjacent because combining
                          //   characters between them will be ignored.

    int32_t    nextPos;   //  Index of the next character following pos.
                          //     Usually skips over combining marks.
    int32_t    nextCPPos; //  Index of the code point following "pos."
                          //     May point to a combining mark.
    int32_t    tPos;      //  temp value.
    UChar32    c;

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    if (startPos >= fText->length()) {
        return -1;
    }


    // Initial values for loop.  Loop will run the first time without finding breaks,
    //                           while the invalid values shift out and the "this" and
    //                           "prev" positions are filled in with good values.
    pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
    thisChar = prevChar  = 0;
    nextPos  = nextCPPos = startPos;


    // Loop runs once per position in the test text, until a break position
    //  is found.
    for (;;) {
        prevPos   = pos;
        prevChar  = thisChar;

        pos       = nextPos;
        thisChar  = fText->char32At(pos);

        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;

        // Rule LB2 - Break at end of text.
        if (pos >= fText->length()) {
            break;
        }

        // Rule LB 9 - adjust for combining sequences.
        //             We do this one out-of-order because the adjustment does not change anything
        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
        //             be applied.
        //             The first call adjusts the (prevChar, pos) pair and may move pos forward;
        //             nextPos is then recomputed from the new pos before the second call
        //             adjusts the (thisChar, nextPos) pair.
        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
        c = fText->char32At(nextPos);
        rule9Adjust(pos,     &thisChar, &nextPos, &c);

        // If the loop is still warming up - if we haven't shifted the initial
        //   -1 positions out of prevPos yet - loop back to advance the
        //    position in the input without any further looking for breaks.
        if (prevPos == -1) {
            continue;
        }

        // LB 4  Always break after hard line breaks,
        if (fBK->contains(prevChar)) {
            break;
        }

        // LB 5  Break after CR, LF, NL, but not inside CR LF
        //       The three characters are tested as literals (U+000D, U+000A, U+0085),
        //       not through the fCR / fLF / fNL sets.
        if (prevChar == 0x0d && thisChar == 0x0a) {
            continue;
        }
        if (prevChar == 0x0d ||
            prevChar == 0x0a ||
            prevChar == 0x85)  {
            break;
        }

        // LB 6  Don't break before hard line breaks
        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
            fBK->contains(thisChar)) {
                continue;
        }


        // LB 7  Don't break before spaces or zero-width space.
        if (fSP->contains(thisChar)) {
            continue;
        }

        if (fZW->contains(thisChar)) {
            continue;
        }

        // LB 8  Break after zero width space
        if (fZW->contains(prevChar)) {
            break;
        }

        // LB 9, 10  Already done, at top of loop.
        //


        // LB 11  Do not break before or after WORD JOINER and related characters.
        //    x  WJ
        //    WJ  x
        //
        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
            continue;
        }

        // LB 12
        //    GL  x
        if (fGL->contains(prevChar)) {
            continue;
        }

        // LB 12a
        //    [^SP BA HY] x GL
        if (!(fSP->contains(prevChar) ||
              fBA->contains(prevChar) ||
              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
            continue;
        }



        // LB 13  Don't break before closings.
        //        NU x CL  and NU x IS are not matched here so that they will
        //        fall into LB 17 and the more general number regular expression.
        //        NOTE(review): "LB 17" looks like older rule numbering; in this function the
        //        number regular expression is applied under the label LB 25 - confirm.
        //        The condition relies on && binding tighter than ||: EX is tested alone,
        //        while CL, IS and SY are each paired with "prevChar is not NU".
        //
        if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
                                        fEX->contains(thisChar) ||
            !fNU->contains(prevChar) && fIS->contains(thisChar) ||
            !fNU->contains(prevChar) && fSY->contains(thisChar))    {
            continue;
        }

        // LB 14 Don't break after OP SP*
        //       Scan backwards, checking for this sequence.
        //       The OP char could include combining marks, so we actually check for
        //           OP CM* SP*
        //       Another Twist: The Rule 67 fixes may have changed a SP CM
        //       sequence into a ID char, so before scanning back through spaces,
        //       verify that prevChar is indeed a space.  The prevChar variable
        //       may differ from fText[prevPos]
        //       NOTE(review): "Rule 67" presumably refers to what rule9Adjust() now
        //       implements as LB 9 / LB 10 - confirm.
        tPos = prevPos;
        if (fSP->contains(prevChar)) {
            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
        }
        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
            tPos=fText->moveIndex32(tPos, -1);
        }
        if (fOP->contains(fText->char32At(tPos))) {
            continue;
        }


        // LB 15    QU SP* x OP
        if (fOP->contains(thisChar)) {
            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
            // Note: this block-local tPos hides the function-level tPos declared above.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fQU->contains(fText->char32At(tPos))) {
                continue;
            }
        }



        // LB 16   CL SP* x NS
        //    Scan backwards for SP* CM* CL
        if (fNS->contains(thisChar)) {
            // Note: this block-local tPos hides the function-level tPos declared above.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fCL->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 17        B2 SP* x B2
        if (fB2->contains(thisChar)) {
            //  Scan backwards, checking for the B2 CM* SP* sequence.
            tPos = prevPos;
            if (fSP->contains(prevChar)) {
                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                    tPos=fText->moveIndex32(tPos, -1);
                }
            }
            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
            if (fB2->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 18    break after space
        if (fSP->contains(prevChar)) {
            break;
        }

        // LB 19
        //    x   QU
        //    QU  x
        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
            continue;
        }

        // LB 20  Break around a CB
        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
            break;
        }

        // LB 21   x BA,  x HY,  x NS,  BB x
        if (fBA->contains(thisChar) ||
            fHY->contains(thisChar) ||
            fNS->contains(thisChar) ||
            fBB->contains(prevChar) )   {
            continue;
        }

        // LB 22   (AL | ID | IN | NU) x IN
        if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
            fID->contains(prevChar) && fIN->contains(thisChar) ||
            fIN->contains(prevChar) && fIN->contains(thisChar) ||
            fNU->contains(prevChar) && fIN->contains(thisChar) )   {
            continue;
        }


        // LB 23    ID x PO
        //          AL x NU
        //          NU x AL
        if (fID->contains(prevChar) && fPO->contains(thisChar) ||
            fAL->contains(prevChar) && fNU->contains(thisChar) ||
            fNU->contains(prevChar) && fAL->contains(thisChar) )   {
            continue;
        }

        // LB 24  Do not break between prefix and letters or ideographs.
        //         PR x ID
        //         PR x AL
        //         PO x AL
        if (fPR->contains(prevChar) && fID->contains(thisChar) ||
            fPR->contains(prevChar) && fAL->contains(thisChar) ||
            fPO->contains(prevChar) && fAL->contains(thisChar) )   {
            continue;
        }



        // LB 25    Numbers
        //          fNumberMatcher was pointed at the text by setText(); try to match the
        //          number pattern starting at prevPos.
        if (fNumberMatcher->lookingAt(prevPos, status)) {
            if (U_FAILURE(status)) {
                break;
            }
            // Matched a number.  But could have been just a single digit, which would
            //    not represent a "no break here" between prevChar and thisChar
            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
            if (numEndIdx > pos) {
                // Number match includes at least our two chars being checked
                if (numEndIdx > nextPos) {
                    // Number match includes additional chars.  Update pos and nextPos
                    //   so that next loop iteration will continue at the end of the number,
                    //   checking for breaks between last char in number & whatever follows.
                    pos = nextPos = numEndIdx;
                    // Back up to the last character of the number that is not a combining mark.
                    do {
                        pos = fText->moveIndex32(pos, -1);
                        thisChar = fText->char32At(pos);
                    } while (fCM->contains(thisChar));
                }
                continue;
            }
        }


        // LB 26 Do not break a Korean syllable.
        //         JL x (JL | JV | H2 | H3)
        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
                                        fJV->contains(thisChar) ||
                                        fH2->contains(thisChar) ||
                                        fH3->contains(thisChar))) {
                                            continue;
                                        }

        //         (JV | H2) x (JV | JT)
        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
                continue;
        }

        //         (JT | H3) x JT
        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
            fJT->contains(thisChar)) {
                continue;
        }

        // LB 27 Treat a Korean Syllable Block the same as ID.
        //         (JL | JV | JT | H2 | H3) x IN
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fIN->contains(thisChar)) {
                continue;
            }
        //         (JL | JV | JT | H2 | H3) x PO
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fPO->contains(thisChar)) {
                continue;
            }
        //         PR x (JL | JV | JT | H2 | H3)
        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
                continue;
            }



        // LB 28 Do not break between alphabetics ("at").
        if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
            continue;
        }

        // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
        if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
            continue;
        }

        // LB 31    Break everywhere else
        break;

    }

    return pos;
}
3795// params 3796// seed=nnnnn Random number starting seed. 3797// Setting the seed allows errors to be reproduced. 3798// loop=nnn Looping count. Controls running time. 3799// -1: run forever. 3800// 0 or greater: run length. 3801// 3802// type = char | word | line | sent | title 3803// 3804//------------------------------------------------------------------------------------------- 3805 3806static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 3807 int32_t val = defaultVal; 3808 name.append(" *= *(-?\\d+)"); 3809 UErrorCode status = U_ZERO_ERROR; 3810 RegexMatcher m(name, params, 0, status); 3811 if (m.find()) { 3812 // The param exists. Convert the string to an int. 3813 char valString[100]; 3814 int32_t paramLength = m.end(1, status) - m.start(1, status); 3815 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 3816 paramLength = (int32_t)(sizeof(valString)-2); 3817 } 3818 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 3819 val = strtol(valString, NULL, 10); 3820 3821 // Delete this parameter from the params string. 
3822 m.reset(); 3823 params = m.replaceFirst("", status); 3824 } 3825 U_ASSERT(U_SUCCESS(status)); 3826 return val; 3827} 3828#endif 3829 3830static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 3831 BreakIterator *bi, 3832 int expected[], 3833 int expectedcount) 3834{ 3835 int count = 0; 3836 int i = 0; 3837 int forward[50]; 3838 bi->setText(ustr); 3839 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3840 forward[count] = i; 3841 if (count < expectedcount && expected[count] != i) { 3842 test->errln("break forward test failed: expected %d but got %d", 3843 expected[count], i); 3844 break; 3845 } 3846 count ++; 3847 } 3848 if (count != expectedcount) { 3849 printStringBreaks(ustr, expected, expectedcount); 3850 test->errln("break forward test failed: missed %d match", 3851 expectedcount - count); 3852 return; 3853 } 3854 // testing boundaries 3855 for (i = 1; i < expectedcount; i ++) { 3856 int j = expected[i - 1]; 3857 if (!bi->isBoundary(j)) { 3858 printStringBreaks(ustr, expected, expectedcount); 3859 test->errln("isBoundary() failed. Expected boundary at position %d", j); 3860 return; 3861 } 3862 for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 3863 if (bi->isBoundary(j)) { 3864 printStringBreaks(ustr, expected, expectedcount); 3865 test->errln("isBoundary() failed. 
Not expecting boundary at position %d", j); 3866 return; 3867 } 3868 } 3869 } 3870 3871 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3872 count --; 3873 if (forward[count] != i) { 3874 test->errln("happy break test previous() failed: expected %d but got %d", 3875 forward[count], i); 3876 break; 3877 } 3878 } 3879 if (count != 0) { 3880 printStringBreaks(ustr, expected, expectedcount); 3881 test->errln("break test previous() failed: missed a match"); 3882 return; 3883 } 3884 3885 // testing preceding 3886 for (i = 0; i < expectedcount - 1; i ++) { 3887 // int j = expected[i] + 1; 3888 int j = ustr.moveIndex32(expected[i], 1); 3889 for (; j <= expected[i + 1]; j ++) { 3890 if (bi->preceding(j) != expected[i]) { 3891 printStringBreaks(ustr, expected, expectedcount); 3892 test->errln("preceding(): Not expecting boundary at position %d", j); 3893 return; 3894 } 3895 } 3896 } 3897} 3898 3899void RBBITest::TestWordBreaks(void) 3900{ 3901#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3902 3903 Locale locale("en"); 3904 UErrorCode status = U_ZERO_ERROR; 3905 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3906 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3907 static const char *strlist[] = 3908 { 3909 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 3910 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b", 3911 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 3912 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 3913 "\\u90ca\\u3588\\u009c\\u0953\\u194b", 3914 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3915 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 3916 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e", 3917 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3918 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3919 
"\\u2027\\U000e0067\\u0a47\\u00b7", 3920 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3921 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3922 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3923 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 3924 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3925 "\\u0027\\u11af\\U000e0057\\u0602", 3926 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3927 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3928 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3929 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3930 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3931 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 3932 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3933 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3934 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3935 "\\u58f4\\U000e0049\\u20e7\\u2027", 3936 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3937 "\\ua183\\u102d\\u0bec\\u003a", 3938 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3939 "\\u003a\\u0e57\\u0fad\\u002e", 3940 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3941 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3942 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 3943 "\\u003a\\u0664\\u00b7\\u1fba", 3944 "\\u003b\\u0027\\u00b7\\u47a3", 3945 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b", 3946 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 3947 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 3948 }; 3949 int loop; 3950 if (U_FAILURE(status)) { 3951 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3952 return; 3953 } 3954 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3955 // printf("looping %d\n", loop); 3956 UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 3957 // RBBICharMonkey monkey; 3958 RBBIWordMonkey monkey; 3959 3960 int expected[50]; 3961 int 
expectedcount = 0; 3962 3963 monkey.setText(ustr); 3964 int i; 3965 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3966 expected[expectedcount ++] = i; 3967 } 3968 3969 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3970 } 3971 delete bi; 3972#endif 3973} 3974 3975void RBBITest::TestWordBoundary(void) 3976{ 3977 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 3978 Locale locale("en"); 3979 UErrorCode status = U_ZERO_ERROR; 3980 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3981 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3982 UChar str[50]; 3983 static const char *strlist[] = 3984 { 3985 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3986 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3987 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3988 "\\u2027\\U000e0067\\u0a47\\u00b7", 3989 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3990 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3991 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3992 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 3993 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3994 "\\u0027\\u11af\\U000e0057\\u0602", 3995 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3996 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3997 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3998 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3999 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 4000 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 4001 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 4002 "\\u0233\\U000e0020\\u0a69\\u0d6a", 4003 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 4004 "\\u58f4\\U000e0049\\u20e7\\u2027", 4005 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 4006 "\\ua183\\u102d\\u0bec\\u003a", 4007 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 4008 "\\u003a\\u0e57\\u0fad\\u002e", 4009 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 4010 
"\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 4011 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 4012 "\\u003a\\u0664\\u00b7\\u1fba", 4013 "\\u003b\\u0027\\u00b7\\u47a3", 4014 }; 4015 int loop; 4016 if (U_FAILURE(status)) { 4017 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4018 return; 4019 } 4020 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4021 // printf("looping %d\n", loop); 4022 u_unescape(strlist[loop], str, 20); 4023 UnicodeString ustr(str); 4024 int forward[50]; 4025 int count = 0; 4026 4027 bi->setText(ustr); 4028 int prev = 0; 4029 int i; 4030 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 4031 forward[count ++] = i; 4032 if (i > prev) { 4033 int j; 4034 for (j = prev + 1; j < i; j ++) { 4035 if (bi->isBoundary(j)) { 4036 printStringBreaks(ustr, forward, count); 4037 errln("happy boundary test failed: expected %d not a boundary", 4038 j); 4039 return; 4040 } 4041 } 4042 } 4043 if (!bi->isBoundary(i)) { 4044 printStringBreaks(ustr, forward, count); 4045 errln("happy boundary test failed: expected %d a boundary", 4046 i); 4047 return; 4048 } 4049 prev = i; 4050 } 4051 } 4052 delete bi; 4053} 4054 4055void RBBITest::TestLineBreaks(void) 4056{ 4057#if !UCONFIG_NO_REGULAR_EXPRESSIONS 4058 Locale locale("en"); 4059 UErrorCode status = U_ZERO_ERROR; 4060 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4061 const int32_t STRSIZE = 50; 4062 UChar str[STRSIZE]; 4063 static const char *strlist[] = 4064 { 4065 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 4066 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 4067 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 4068 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 4069 "u2014\\U000e0105\\u118c\\u000a\\u07f8", 4070 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 
4071 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 4072 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 4073 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 4074 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 4075 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", 4076 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 4077 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 4078 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 4079 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 4080 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 4081 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 4082 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 4083 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 4084 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 4085 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 4086 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 4087 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 4088 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 4089 
"\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 4090 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 4091 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", 4092 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 4093 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 4094 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 4095 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 4096 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 4097 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", 4098 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 4099 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 4100 "\\u2014\\u0020\\u000a\\u17c5\\u24fc", 4101 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 4102 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 4103 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 4104 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 4105 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 4106 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 4107 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" 4108 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" 4109 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", 4110 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 4111 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 4112 }; 4113 int loop; 4114 TEST_ASSERT_SUCCESS(status); 4115 if (U_FAILURE(status)) { 4116 return; 4117 } 4118 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4119 // printf("looping %d\n", loop); 4120 int32_t t = u_unescape(strlist[loop], str, STRSIZE); 4121 if (t >= STRSIZE) { 4122 TEST_ASSERT(FALSE); 
4123 continue; 4124 } 4125 4126 4127 UnicodeString ustr(str); 4128 RBBILineMonkey monkey; 4129 if (U_FAILURE(monkey.deferredStatus)) { 4130 continue; 4131 } 4132 4133 const int EXPECTEDSIZE = 50; 4134 int expected[EXPECTEDSIZE]; 4135 int expectedcount = 0; 4136 4137 monkey.setText(ustr); 4138 int i; 4139 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4140 if (expectedcount >= EXPECTEDSIZE) { 4141 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4142 return; 4143 } 4144 expected[expectedcount ++] = i; 4145 } 4146 4147 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4148 } 4149 delete bi; 4150#endif 4151} 4152 4153void RBBITest::TestSentBreaks(void) 4154{ 4155#if !UCONFIG_NO_REGULAR_EXPRESSIONS 4156 Locale locale("en"); 4157 UErrorCode status = U_ZERO_ERROR; 4158 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4159 UChar str[200]; 4160 static const char *strlist[] = 4161 { 4162 "Now\ris\nthe\r\ntime\n\rfor\r\r", 4163 "This\n", 4164 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", 4165 "\"Sentence ending with a quote.\" Bye.", 4166 " (This is it). Testing the sentence iterator. \"This isn't it.\"", 4167 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", 4168 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 4169 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 4170 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 4171 "Don't rock the boat.\\u2029Because I am the daddy, that is why. 
Not on my time (el timo.)!", 4172 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 4173 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 4174 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 4175 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 4176 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 4177 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 4178 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 4179 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 4180 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 4181 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 4182 }; 4183 int loop; 4184 if (U_FAILURE(status)) { 4185 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4186 return; 4187 } 4188 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4189 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0]))); 4190 UnicodeString ustr(str); 4191 4192 RBBISentMonkey monkey; 4193 if (U_FAILURE(monkey.deferredStatus)) { 4194 continue; 4195 } 4196 4197 const int EXPECTEDSIZE = 50; 4198 int expected[EXPECTEDSIZE]; 4199 int expectedcount = 0; 4200 4201 monkey.setText(ustr); 4202 int i; 4203 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4204 if (expectedcount >= EXPECTEDSIZE) { 4205 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4206 return; 4207 } 4208 expected[expectedcount ++] = i; 4209 } 4210 4211 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4212 } 4213 delete bi; 4214#endif 4215} 4216 4217void RBBITest::TestMonkey(char *params) { 4218#if !UCONFIG_NO_REGULAR_EXPRESSIONS 4219 4220 UErrorCode status = U_ZERO_ERROR; 4221 int32_t loopCount = 500; 4222 int32_t seed = 1; 4223 UnicodeString breakType = "all"; 4224 Locale locale("en"); 4225 UBool 
useUText = FALSE; 4226 4227 if (quick == FALSE) { 4228 loopCount = 10000; 4229 } 4230 4231 if (params) { 4232 UnicodeString p(params); 4233 loopCount = getIntParam("loop", p, loopCount); 4234 seed = getIntParam("seed", p, seed); 4235 4236 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 4237 if (m.find()) { 4238 breakType = m.group(1, status); 4239 m.reset(); 4240 p = m.replaceFirst("", status); 4241 } 4242 4243 RegexMatcher u(" *utext", p, 0, status); 4244 if (u.find()) { 4245 useUText = TRUE; 4246 u.reset(); 4247 p = u.replaceFirst("", status); 4248 } 4249 4250 4251 // m.reset(p); 4252 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 4253 // Each option is stripped out of the option string as it is processed. 4254 // All options have been checked. The option string should have been completely emptied.. 4255 char buf[100]; 4256 p.extract(buf, sizeof(buf), NULL, status); 4257 buf[sizeof(buf)-1] = 0; 4258 errln("Unrecognized or extra parameter: %s\n", buf); 4259 return; 4260 } 4261 4262 } 4263 4264 if (breakType == "char" || breakType == "all") { 4265 RBBICharMonkey m; 4266 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4267 if (U_SUCCESS(status)) { 4268 RunMonkey(bi, m, "char", seed, loopCount, useUText); 4269 if (breakType == "all" && useUText==FALSE) { 4270 // Also run a quick test with UText when "all" is specified 4271 RunMonkey(bi, m, "char", seed, loopCount, TRUE); 4272 } 4273 } 4274 else { 4275 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 4276 } 4277 delete bi; 4278 } 4279 4280 if (breakType == "word" || breakType == "all") { 4281 logln("Word Break Monkey Test"); 4282 RBBIWordMonkey m; 4283 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4284 if (U_SUCCESS(status)) { 4285 RunMonkey(bi, m, "word", seed, loopCount, useUText); 4286 } 4287 else { 4288 errcheckln(status, "Creation of word break iterator failed %s", 
u_errorName(status)); 4289 } 4290 delete bi; 4291 } 4292 4293 if (breakType == "line" || breakType == "all") { 4294 logln("Line Break Monkey Test"); 4295 RBBILineMonkey m; 4296 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4297 if (loopCount >= 10) { 4298 loopCount = loopCount / 5; // Line break runs slower than the others. 4299 } 4300 if (U_SUCCESS(status)) { 4301 RunMonkey(bi, m, "line", seed, loopCount, useUText); 4302 } 4303 else { 4304 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4305 } 4306 delete bi; 4307 } 4308 4309 if (breakType == "sent" || breakType == "all" ) { 4310 logln("Sentence Break Monkey Test"); 4311 RBBISentMonkey m; 4312 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4313 if (loopCount >= 10) { 4314 loopCount = loopCount / 10; // Sentence runs slower than the other break types 4315 } 4316 if (U_SUCCESS(status)) { 4317 RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 4318 } 4319 else { 4320 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4321 } 4322 delete bi; 4323 } 4324 4325#endif 4326} 4327 4328// 4329// Run a RBBI monkey test. Common routine, for all break iterator types. 4330// Parameters: 4331// bi - the break iterator to use 4332// mk - MonkeyKind, abstraction for obtaining expected results 4333// name - Name of test (char, word, etc.) 
for use in error messages 4334// seed - Seed for starting random number generator (parameter from user) 4335// numIterations 4336// 4337void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 4338 int32_t numIterations, UBool useUText) { 4339 4340#if !UCONFIG_NO_REGULAR_EXPRESSIONS 4341 4342 const int32_t TESTSTRINGLEN = 500; 4343 UnicodeString testText; 4344 int32_t numCharClasses; 4345 UVector *chClasses; 4346 int expected[TESTSTRINGLEN*2 + 1]; 4347 int expectedCount = 0; 4348 char expectedBreaks[TESTSTRINGLEN*2 + 1]; 4349 char forwardBreaks[TESTSTRINGLEN*2 + 1]; 4350 char reverseBreaks[TESTSTRINGLEN*2+1]; 4351 char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 4352 char followingBreaks[TESTSTRINGLEN*2+1]; 4353 char precedingBreaks[TESTSTRINGLEN*2+1]; 4354 int i; 4355 int loopCount = 0; 4356 4357 m_seed = seed; 4358 4359 numCharClasses = mk.charClasses()->size(); 4360 chClasses = mk.charClasses(); 4361 4362 // Check for errors that occured during the construction of the MonkeyKind object. 4363 // Can't report them where they occured because errln() is a method coming from intlTest, 4364 // and is not visible outside of RBBITest :-( 4365 if (U_FAILURE(mk.deferredStatus)) { 4366 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 4367 return; 4368 } 4369 4370 // Verify that the character classes all have at least one member. 4371 for (i=0; i<numCharClasses; i++) { 4372 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 4373 if (s == NULL || s->size() == 0) { 4374 errln("Character Class #%d is null or of zero size.", i); 4375 return; 4376 } 4377 } 4378 4379 while (loopCount < numIterations || numIterations == -1) { 4380 if (numIterations == -1 && loopCount % 10 == 0) { 4381 // If test is running in an infinite loop, display a periodic tic so 4382 // we can tell that it is making progress. 
4383 fprintf(stderr, "."); 4384 } 4385 // Save current random number seed, so that we can recreate the random numbers 4386 // for this loop iteration in event of an error. 4387 seed = m_seed; 4388 4389 // Populate a test string with data. 4390 testText.truncate(0); 4391 for (i=0; i<TESTSTRINGLEN; i++) { 4392 int32_t aClassNum = m_rand() % numCharClasses; 4393 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 4394 int32_t charIdx = m_rand() % classSet->size(); 4395 UChar32 c = classSet->charAt(charIdx); 4396 if (c < 0) { // TODO: deal with sets containing strings. 4397 errln("c < 0"); 4398 break; 4399 } 4400 testText.append(c); 4401 } 4402 4403 // Calculate the expected results for this test string. 4404 mk.setText(testText); 4405 memset(expectedBreaks, 0, sizeof(expectedBreaks)); 4406 expectedBreaks[0] = 1; 4407 int32_t breakPos = 0; 4408 expectedCount = 0; 4409 for (;;) { 4410 breakPos = mk.next(breakPos); 4411 if (breakPos == -1) { 4412 break; 4413 } 4414 if (breakPos > testText.length()) { 4415 errln("breakPos > testText.length()"); 4416 } 4417 expectedBreaks[breakPos] = 1; 4418 U_ASSERT(expectedCount<testText.length()); 4419 expected[expectedCount ++] = breakPos; 4420 } 4421 4422 // Find the break positions using forward iteration 4423 memset(forwardBreaks, 0, sizeof(forwardBreaks)); 4424 if (useUText) { 4425 UErrorCode status = U_ZERO_ERROR; 4426 UText *testUText = utext_openReplaceable(NULL, &testText, &status); 4427 // testUText = utext_openUnicodeString(testUText, &testText, &status); 4428 bi->setText(testUText, status); 4429 TEST_ASSERT_SUCCESS(status); 4430 utext_close(testUText); // The break iterator does a shallow clone of the UText 4431 // This UText can be closed immediately, so long as the 4432 // testText string continues to exist. 
4433 } else { 4434 bi->setText(testText); 4435 } 4436 4437 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 4438 if (i < 0 || i > testText.length()) { 4439 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4440 break; 4441 } 4442 forwardBreaks[i] = 1; 4443 } 4444 4445 // Find the break positions using reverse iteration 4446 memset(reverseBreaks, 0, sizeof(reverseBreaks)); 4447 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 4448 if (i < 0 || i > testText.length()) { 4449 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4450 break; 4451 } 4452 reverseBreaks[i] = 1; 4453 } 4454 4455 // Find the break positions using isBoundary() tests. 4456 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 4457 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 4458 for (i=0; i<=testText.length(); i++) { 4459 isBoundaryBreaks[i] = bi->isBoundary(i); 4460 } 4461 4462 4463 // Find the break positions using the following() function. 4464 // printf("."); 4465 memset(followingBreaks, 0, sizeof(followingBreaks)); 4466 int32_t lastBreakPos = 0; 4467 followingBreaks[0] = 1; 4468 for (i=0; i<testText.length(); i++) { 4469 breakPos = bi->following(i); 4470 if (breakPos <= i || 4471 breakPos < lastBreakPos || 4472 breakPos > testText.length() || 4473 breakPos > lastBreakPos && lastBreakPos > i ) { 4474 errln("%s break monkey test: " 4475 "Out of range value returned by BreakIterator::following().\n" 4476 "Random seed=%d index=%d; following returned %d; lastbreak=%d", 4477 name, seed, i, breakPos, lastBreakPos); 4478 break; 4479 } 4480 followingBreaks[breakPos] = 1; 4481 lastBreakPos = breakPos; 4482 } 4483 4484 // Find the break positions using the preceding() function. 
4485 memset(precedingBreaks, 0, sizeof(precedingBreaks)); 4486 lastBreakPos = testText.length(); 4487 precedingBreaks[testText.length()] = 1; 4488 for (i=testText.length(); i>0; i--) { 4489 breakPos = bi->preceding(i); 4490 if (breakPos >= i || 4491 breakPos > lastBreakPos || 4492 breakPos < 0 && testText.getChar32Start(i)>0 || 4493 breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i) ) { 4494 errln("%s break monkey test: " 4495 "Out of range value returned by BreakIterator::preceding().\n" 4496 "index=%d; prev returned %d; lastBreak=%d" , 4497 name, i, breakPos, lastBreakPos); 4498 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 4499 precedingBreaks[i] = 2; // Forces an error. 4500 } 4501 } else { 4502 if (breakPos >= 0) { 4503 precedingBreaks[breakPos] = 1; 4504 } 4505 lastBreakPos = breakPos; 4506 } 4507 } 4508 4509 // Compare the expected and actual results. 4510 for (i=0; i<=testText.length(); i++) { 4511 const char *errorType = NULL; 4512 if (forwardBreaks[i] != expectedBreaks[i]) { 4513 errorType = "next()"; 4514 } else if (reverseBreaks[i] != forwardBreaks[i]) { 4515 errorType = "previous()"; 4516 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 4517 errorType = "isBoundary()"; 4518 } else if (followingBreaks[i] != expectedBreaks[i]) { 4519 errorType = "following()"; 4520 } else if (precedingBreaks[i] != expectedBreaks[i]) { 4521 errorType = "preceding()"; 4522 } 4523 4524 4525 if (errorType != NULL) { 4526 // Format a range of the test text that includes the failure as 4527 // a data item that can be included in the rbbi test data file. 4528 4529 // Start of the range is the last point where expected and actual results 4530 // both agreed that there was a break position. 
4531 int startContext = i; 4532 int32_t count = 0; 4533 for (;;) { 4534 if (startContext==0) { break; } 4535 startContext --; 4536 if (expectedBreaks[startContext] != 0) { 4537 if (count == 2) break; 4538 count ++; 4539 } 4540 } 4541 4542 // End of range is two expected breaks past the start position. 4543 int endContext = i + 1; 4544 int ci; 4545 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 4546 for (;;) { 4547 if (endContext >= testText.length()) {break;} 4548 if (expectedBreaks[endContext-1] != 0) { 4549 if (count == 0) break; 4550 count --; 4551 } 4552 endContext ++; 4553 } 4554 } 4555 4556 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 4557 UnicodeString errorText = "<data>"; 4558 /***if (strcmp(errorType, "next()") == 0) { 4559 startContext = 0; 4560 endContext = testText.length(); 4561 4562 printStringBreaks(testText, expected, expectedCount); 4563 }***/ 4564 4565 for (ci=startContext; ci<endContext;) { 4566 UnicodeString hexChars("0123456789abcdef"); 4567 UChar32 c; 4568 int bn; 4569 c = testText.char32At(ci); 4570 if (ci == i) { 4571 // This is the location of the error. 4572 errorText.append("<?>"); 4573 } else if (expectedBreaks[ci] != 0) { 4574 // This a non-error expected break position. 4575 errorText.append("\\"); 4576 } 4577 if (c < 0x10000) { 4578 errorText.append("\\u"); 4579 for (bn=12; bn>=0; bn-=4) { 4580 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4581 } 4582 } else { 4583 errorText.append("\\U"); 4584 for (bn=28; bn>=0; bn-=4) { 4585 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4586 } 4587 } 4588 ci = testText.moveIndex32(ci, 1); 4589 } 4590 errorText.append("\\"); 4591 errorText.append("</data>\n"); 4592 4593 // Output the error 4594 char charErrorTxt[500]; 4595 UErrorCode status = U_ZERO_ERROR; 4596 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 4597 charErrorTxt[sizeof(charErrorTxt)-1] = 0; 4598 errln("%s break monkey test error. %s. 
Operation = %s; Random seed = %d; buf Idx = %d\n%s", 4599 name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), 4600 errorType, seed, i, charErrorTxt); 4601 break; 4602 } 4603 } 4604 4605 loopCount++; 4606 } 4607#endif 4608} 4609 4610// 4611// TestDebug - A place-holder test for debugging purposes. 4612// For putting in fragments of other tests that can be invoked 4613// for tracing without a lot of unwanted extra stuff happening. 4614// 4615void RBBITest::TestDebug(void) { 4616#if 0 4617 UErrorCode status = U_ZERO_ERROR; 4618 int pos = 0; 4619 int ruleStatus = 0; 4620 4621 RuleBasedBreakIterator* bi = 4622 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 4623 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status); 4624 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 4625 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e"); 4626 // UnicodeString s("Aaa. Bcd"); 4627 s = s.unescape(); 4628 bi->setText(s); 4629 UBool r = bi->isBoundary(8); 4630 printf("%s", r?"true":"false"); 4631 return; 4632 pos = bi->last(); 4633 do { 4634 // ruleStatus = bi->getRuleStatus(); 4635 printf("%d\t%d\n", pos, ruleStatus); 4636 pos = bi->previous(); 4637 } while (pos != BreakIterator::DONE); 4638#endif 4639} 4640 4641#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 4642