1/******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1999-2013, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6/************************************************************************ 7* Date Name Description 8* 12/15/99 Madhu Creation. 9* 01/12/2000 Madhu Updated for changed API and added new tests 10************************************************************************/ 11 12#include "utypeinfo.h" // for 'typeid' to work 13 14#include "unicode/utypes.h" 15 16#if !UCONFIG_NO_BREAK_ITERATION 17 18#include "unicode/utypes.h" 19#include "unicode/brkiter.h" 20#include "unicode/rbbi.h" 21#include "unicode/uchar.h" 22#include "unicode/utf16.h" 23#include "unicode/ucnv.h" 24#include "unicode/schriter.h" 25#include "unicode/uniset.h" 26#if !UCONFIG_NO_REGULAR_EXPRESSIONS 27#include "unicode/regex.h" 28#endif 29#include "unicode/ustring.h" 30#include "unicode/utext.h" 31#include "intltest.h" 32#include "rbbitst.h" 33#include <string.h> 34#include "uvector.h" 35#include "uvectr32.h" 36#include <string.h> 37#include <stdio.h> 38#include <stdlib.h> 39#include "unicode/numfmt.h" 40#include "unicode/uscript.h" 41 42#define TEST_ASSERT(x) {if (!(x)) { \ 43 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 44 45#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 46 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 47 48 49//--------------------------------------------- 50// runIndexedTest 51//--------------------------------------------- 52 53 54// Note: Before adding new tests to this file, check whether the desired test data can 55// simply be added to the file testdata/rbbitest.txt. 
In most cases it can, 56// it's much less work than writing a new test, diagnostic output in the event of failures 57// is good, and the test data file will is shared with ICU4J, so eventually the test 58// will run there as well, without additional effort. 59 60void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 61{ 62 if (exec) logln("TestSuite RuleBasedBreakIterator: "); 63 64 switch (index) { 65#if !UCONFIG_NO_FILE_IO 66 case 0: name = "TestBug4153072"; 67 if(exec) TestBug4153072(); break; 68#else 69 case 0: name = "skip"; 70 break; 71#endif 72 73 case 1: name = "skip"; 74 break; 75 case 2: name = "TestStatusReturn"; 76 if(exec) TestStatusReturn(); break; 77 78#if !UCONFIG_NO_FILE_IO 79 case 3: name = "TestUnicodeFiles"; 80 if(exec) TestUnicodeFiles(); break; 81 case 4: name = "TestEmptyString"; 82 if(exec) TestEmptyString(); break; 83#else 84 case 3: case 4: name = "skip"; 85 break; 86#endif 87 88 case 5: name = "TestGetAvailableLocales"; 89 if(exec) TestGetAvailableLocales(); break; 90 91 case 6: name = "TestGetDisplayName"; 92 if(exec) TestGetDisplayName(); break; 93 94#if !UCONFIG_NO_FILE_IO 95 case 7: name = "TestEndBehaviour"; 96 if(exec) TestEndBehaviour(); break; 97 case 8: case 9: case 10: name = "skip"; 98 break; 99 case 11: name = "TestWordBreaks"; 100 if(exec) TestWordBreaks(); break; 101 case 12: name = "TestWordBoundary"; 102 if(exec) TestWordBoundary(); break; 103 case 13: name = "TestLineBreaks"; 104 if(exec) TestLineBreaks(); break; 105 case 14: name = "TestSentBreaks"; 106 if(exec) TestSentBreaks(); break; 107 case 15: name = "TestExtended"; 108 if(exec) TestExtended(); break; 109#else 110 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip"; 111 break; 112#endif 113 114#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 115 case 16: 116 name = "TestMonkey"; if(exec) TestMonkey(params); break; 117#else 118 case 16: 119 name = "skip"; break; 120#endif 121 
122#if !UCONFIG_NO_FILE_IO 123 case 17: name = "TestBug3818"; 124 if(exec) TestBug3818(); break; 125#else 126 case 17: name = "skip"; 127 break; 128#endif 129 130 case 18: name = "skip"; 131 break; 132 case 19: name = "TestDebug"; 133 if(exec) TestDebug(); break; 134 case 20: name = "skip"; 135 break; 136 137#if !UCONFIG_NO_FILE_IO 138 case 21: name = "TestBug5775"; 139 if (exec) TestBug5775(); break; 140#else 141 case 21: name = "skip"; 142 break; 143#endif 144 145 case 22: name = "TestBug9983"; 146 if (exec) TestBug9983(); break; 147 case 23: name = "TestDictRules"; 148 if (exec) TestDictRules(); break; 149 case 24: name = "TestBug5532"; 150 if (exec) TestBug5532(); break; 151 default: name = ""; break; //needed to end loop 152 } 153} 154 155 156//--------------------------------------------------------------------------- 157// 158// class BITestData Holds a set of Break iterator test data and results 159// Includes 160// - the string data to be broken 161// - a vector of the expected break positions. 162// - a vector of source line numbers for the data, 163// (to help see where errors occured.) 164// - The expected break tag values. 165// - Vectors of actual break positions and tag values. 166// - Functions for comparing actual with expected and 167// reporting errors. 168// 169//---------------------------------------------------------------------------- 170class BITestData { 171public: 172 UnicodeString fDataToBreak; 173 UVector fExpectedBreakPositions; 174 UVector fExpectedTags; 175 UVector fLineNum; 176 UVector fActualBreakPositions; // Test Results. 177 UVector fActualTags; 178 179 BITestData(UErrorCode &status); 180 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status); 181 void checkResults(const char *heading, RBBITest *test); 182 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx); 183 void clearResults(); 184}; 185 186// 187// Constructor. 
188// 189BITestData::BITestData(UErrorCode &status) 190: fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), 191 fActualTags(status) 192{ 193} 194 195// 196// addDataChunk. Add a section (non-breaking) piece if data to the test data. 197// The macro form collects the line number, which is helpful 198// when tracking down failures. 199// 200// A null data item is inserted at the start of each test's data 201// to put the starting zero into the data list. The position saved for 202// each non-null item is its ending position. 203// 204#define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); 205void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { 206 if (U_FAILURE(status)) {return;} 207 if (data != NULL) { 208 fDataToBreak.append(CharsToUnicodeString(data)); 209 } 210 fExpectedBreakPositions.addElement(fDataToBreak.length(), status); 211 fExpectedTags.addElement(tag, status); 212 fLineNum.addElement(lineNum, status); 213} 214 215 216// 217// checkResults. Compare the actual and expected break positions, report any differences. 218// 219void BITestData::checkResults(const char *heading, RBBITest *test) { 220 int32_t expectedIndex = 0; 221 int32_t actualIndex = 0; 222 223 for (;;) { 224 // If we've run through both the expected and actual results vectors, we're done. 225 // break out of the loop. 
226 if (expectedIndex >= fExpectedBreakPositions.size() && 227 actualIndex >= fActualBreakPositions.size()) { 228 break; 229 } 230 231 232 if (expectedIndex >= fExpectedBreakPositions.size()) { 233 err(heading, test, expectedIndex-1, actualIndex); 234 actualIndex++; 235 continue; 236 } 237 238 if (actualIndex >= fActualBreakPositions.size()) { 239 err(heading, test, expectedIndex, actualIndex-1); 240 expectedIndex++; 241 continue; 242 } 243 244 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { 245 err(heading, test, expectedIndex, actualIndex); 246 // Try to resync the positions of the indices, to avoid a rash of spurious erros. 247 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { 248 actualIndex++; 249 } else { 250 expectedIndex++; 251 } 252 continue; 253 } 254 255 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { 256 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d", 257 heading, fLineNum.elementAt(expectedIndex), 258 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); 259 } 260 261 actualIndex++; 262 expectedIndex++; 263 } 264} 265 266// 267// err - An error was found. Report it, along with information about where the 268// incorrectly broken test data appeared in the source file. 269// 270void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) 271{ 272 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); 273 int32_t actual = fActualBreakPositions.elementAti(actualIdx); 274 int32_t o = 0; 275 int32_t line = fLineNum.elementAti(expectedIdx); 276 if (expectedIdx > 0) { 277 // The line numbers are off by one because a premature break occurs somewhere 278 // within the previous item, rather than at the start of the current (expected) item. 
279 // We want to report the offset of the unexpected break from the start of 280 // this previous item. 281 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); 282 } 283 if (actual < expected) { 284 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); 285 } else { 286 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected); 287 } 288} 289 290 291void BITestData::clearResults() { 292 fActualBreakPositions.removeAllElements(); 293 fActualTags.removeAllElements(); 294} 295 296 297//-------------------------------------------------------------------------------------- 298// 299// RBBITest constructor and destructor 300// 301//-------------------------------------------------------------------------------------- 302 303RBBITest::RBBITest() { 304} 305 306 307RBBITest::~RBBITest() { 308} 309 310//----------------------------------------------------------------------------------- 311// 312// Test for status {tag} return value from break rules. 313// TODO: a more thorough test. 
314// 315//----------------------------------------------------------------------------------- 316void RBBITest::TestStatusReturn() { 317 UnicodeString rulesString1("$Letters = [:L:];\n" 318 "$Numbers = [:N:];\n" 319 "$Letters+{1};\n" 320 "$Numbers+{2};\n" 321 "Help\\ {4}/me\\!;\n" 322 "[^$Letters $Numbers];\n" 323 "!.*;\n", -1, US_INV); 324 UnicodeString testString1 = "abc123..abc Help me Help me!"; 325 // 01234567890123456789012345678 326 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; 327 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; 328 329 UErrorCode status=U_ZERO_ERROR; 330 UParseError parseError; 331 332 BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 333 if(U_FAILURE(status)) { 334 dataerrln("FAIL : in construction - %s", u_errorName(status)); 335 } else { 336 int32_t pos; 337 int32_t i = 0; 338 bi->setText(testString1); 339 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { 340 if (pos != bounds1[i]) { 341 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos); 342 break; 343 } 344 345 int tag = bi->getRuleStatus(); 346 if (tag != brkStatus[i]) { 347 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag); 348 break; 349 } 350 i++; 351 } 352 } 353 delete bi; 354} 355 356 357static void printStringBreaks(UnicodeString ustr, int expected[], 358 int expectedcount) 359{ 360 UErrorCode status = U_ZERO_ERROR; 361 char name[100]; 362 printf("code alpha extend alphanum type word sent line name\n"); 363 int j; 364 for (j = 0; j < ustr.length(); j ++) { 365 if (expectedcount > 0) { 366 int k; 367 for (k = 0; k < expectedcount; k ++) { 368 if (j == expected[k]) { 369 printf("------------------------------------------------ %d\n", 370 j); 371 } 372 } 373 } 374 UChar32 c = ustr.char32At(j); 375 if (c > 0xffff) { 376 j ++; 377 } 378 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 379 printf("%7x %5d %6d %8d %4s %4s %4s %4s 
%s\n", (int)c, 380 u_isUAlphabetic(c), 381 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 382 u_isalnum(c), 383 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 384 u_charType(c), 385 U_SHORT_PROPERTY_NAME), 386 u_getPropertyValueName(UCHAR_WORD_BREAK, 387 u_getIntPropertyValue(c, 388 UCHAR_WORD_BREAK), 389 U_SHORT_PROPERTY_NAME), 390 u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 391 u_getIntPropertyValue(c, 392 UCHAR_SENTENCE_BREAK), 393 U_SHORT_PROPERTY_NAME), 394 u_getPropertyValueName(UCHAR_LINE_BREAK, 395 u_getIntPropertyValue(c, 396 UCHAR_LINE_BREAK), 397 U_SHORT_PROPERTY_NAME), 398 name); 399 } 400} 401 402 403void RBBITest::TestBug3818() { 404 UErrorCode status = U_ZERO_ERROR; 405 406 // Four Thai words... 407 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 408 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 409 UnicodeString thaiStr(thaiWordData); 410 411 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status); 412 if (U_FAILURE(status) || bi == NULL) { 413 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 414 return; 415 } 416 bi->setText(thaiStr); 417 418 int32_t startOfSecondWord = bi->following(1); 419 if (startOfSecondWord != 4) { 420 errln("Fail at file %s, line %d expected start of word at 4, got %d", 421 __FILE__, __LINE__, startOfSecondWord); 422 } 423 startOfSecondWord = bi->following(0); 424 if (startOfSecondWord != 4) { 425 errln("Fail at file %s, line %d expected start of word at 4, got %d", 426 __FILE__, __LINE__, startOfSecondWord); 427 } 428 delete bi; 429} 430 431//---------------------------------------------------------------------------- 432// 433// generalIteratorTest Given a break iterator and a set of test data, 434// Run the tests and report the results. 
435// 436//---------------------------------------------------------------------------- 437void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 438{ 439 440 bi.setText(td.fDataToBreak); 441 442 testFirstAndNext(bi, td); 443 444 testLastAndPrevious(bi, td); 445 446 testFollowing(bi, td); 447 testPreceding(bi, td); 448 testIsBoundary(bi, td); 449 doMultipleSelectionTest(bi, td); 450} 451 452 453// 454// testFirstAndNext. Run the iterator forwards in the obvious first(), next() 455// kind of loop. 456// 457void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) 458{ 459 UErrorCode status = U_ZERO_ERROR; 460 int32_t p; 461 int32_t lastP = -1; 462 int32_t tag; 463 464 logln("Test first and next"); 465 bi.setText(td.fDataToBreak); 466 td.clearResults(); 467 468 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { 469 td.fActualBreakPositions.addElement(p, status); // Save result. 470 tag = bi.getRuleStatus(); 471 td.fActualTags.addElement(tag, status); 472 if (p <= lastP) { 473 // If the iterator is not making forward progress, stop. 474 // No need to raise an error here, it'll be detected in the normal check of results. 475 break; 476 } 477 lastP = p; 478 } 479 td.checkResults("testFirstAndNext", this); 480} 481 482 483// 484// TestLastAndPrevious. Run the iterator backwards, starting with last(). 485// 486void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) 487{ 488 UErrorCode status = U_ZERO_ERROR; 489 int32_t p; 490 int32_t lastP = 0x7ffffffe; 491 int32_t tag; 492 493 logln("Test last and previous"); 494 bi.setText(td.fDataToBreak); 495 td.clearResults(); 496 497 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { 498 // Save break position. Insert it at start of vector of results, shoving 499 // already-saved results further towards the end. 500 td.fActualBreakPositions.insertElementAt(p, 0, status); 501 // bi.previous(); // TODO: Why does this fix things up???? 
502 // bi.next(); 503 tag = bi.getRuleStatus(); 504 td.fActualTags.insertElementAt(tag, 0, status); 505 if (p >= lastP) { 506 // If the iterator is not making progress, stop. 507 // No need to raise an error here, it'll be detected in the normal check of results. 508 break; 509 } 510 lastP = p; 511 } 512 td.checkResults("testLastAndPrevious", this); 513} 514 515 516void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) 517{ 518 UErrorCode status = U_ZERO_ERROR; 519 int32_t p; 520 int32_t tag; 521 int32_t lastP = -2; // A value that will never be returned as a break position. 522 // cannot be -1; that is returned for DONE. 523 int i; 524 525 logln("testFollowing():"); 526 bi.setText(td.fDataToBreak); 527 td.clearResults(); 528 529 // Save the starting point, since we won't get that out of following. 530 p = bi.first(); 531 td.fActualBreakPositions.addElement(p, status); // Save result. 532 tag = bi.getRuleStatus(); 533 td.fActualTags.addElement(tag, status); 534 535 for (i = 0; i <= td.fDataToBreak.length()+1; i++) { 536 p = bi.following(i); 537 if (p != lastP) { 538 if (p == RuleBasedBreakIterator::DONE) { 539 break; 540 } 541 // We've reached a new break position. Save it. 542 td.fActualBreakPositions.addElement(p, status); // Save result. 543 tag = bi.getRuleStatus(); 544 td.fActualTags.addElement(tag, status); 545 lastP = p; 546 } 547 } 548 // The loop normally exits by means of the break in the middle. 549 // Make sure that the index was at the correct position for the break iterator to have 550 // returned DONE. 551 if (i != td.fDataToBreak.length()) { 552 errln("testFollowing(): iterator returned DONE prematurely."); 553 } 554 555 // Full check of all results. 
556 td.checkResults("testFollowing", this); 557} 558 559 560 561void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { 562 UErrorCode status = U_ZERO_ERROR; 563 int32_t p; 564 int32_t tag; 565 int32_t lastP = 0x7ffffffe; 566 int i; 567 568 logln("testPreceding():"); 569 bi.setText(td.fDataToBreak); 570 td.clearResults(); 571 572 p = bi.last(); 573 td.fActualBreakPositions.addElement(p, status); 574 tag = bi.getRuleStatus(); 575 td.fActualTags.addElement(tag, status); 576 577 for (i = td.fDataToBreak.length(); i>=-1; i--) { 578 p = bi.preceding(i); 579 if (p != lastP) { 580 if (p == RuleBasedBreakIterator::DONE) { 581 break; 582 } 583 // We've reached a new break position. Save it. 584 td.fActualBreakPositions.insertElementAt(p, 0, status); 585 lastP = p; 586 tag = bi.getRuleStatus(); 587 td.fActualTags.insertElementAt(tag, 0, status); 588 } 589 } 590 // The loop normally exits by means of the break in the middle. 591 // Make sure that the index was at the correct position for the break iterator to have 592 // returned DONE. 593 if (i != 0) { 594 errln("testPreceding(): iterator returned DONE prematurely."); 595 } 596 597 // Full check of all results. 598 td.checkResults("testPreceding", this); 599} 600 601 602 603void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { 604 UErrorCode status = U_ZERO_ERROR; 605 int i; 606 int32_t tag; 607 608 logln("testIsBoundary():"); 609 bi.setText(td.fDataToBreak); 610 td.clearResults(); 611 612 for (i = 0; i <= td.fDataToBreak.length(); i++) { 613 if (bi.isBoundary(i)) { 614 td.fActualBreakPositions.addElement(i, status); // Save result. 
615 tag = bi.getRuleStatus(); 616 td.fActualTags.addElement(tag, status); 617 } 618 } 619 td.checkResults("testIsBoundary: ", this); 620} 621 622 623 624void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) 625{ 626 iterator.setText(td.fDataToBreak); 627 628 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); 629 int32_t offset = iterator.first(); 630 int32_t testOffset; 631 int32_t count = 0; 632 633 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); 634 635 if (*testIterator != iterator) 636 errln("clone() or operator!= failed: two clones compared unequal"); 637 638 do { 639 testOffset = testIterator->first(); 640 testOffset = testIterator->next(count); 641 if (offset != testOffset) 642 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 643 644 if (offset != RuleBasedBreakIterator::DONE) { 645 count++; 646 offset = iterator.next(); 647 648 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { 649 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); 650 if (count > 10000 || offset == -1) { 651 errln("operator== failed too many times. Stopping test."); 652 if (offset == -1) { 653 errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 654 } 655 return; 656 } 657 } 658 } 659 } while (offset != RuleBasedBreakIterator::DONE); 660 661 // now do it backwards... 
662 offset = iterator.last(); 663 count = 0; 664 665 do { 666 testOffset = testIterator->last(); 667 testOffset = testIterator->next(count); // next() with a negative arg is same as previous 668 if (offset != testOffset) 669 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 670 671 if (offset != RuleBasedBreakIterator::DONE) { 672 count--; 673 offset = iterator.previous(); 674 } 675 } while (offset != RuleBasedBreakIterator::DONE); 676 677 delete testIterator; 678} 679 680 681//--------------------------------------------- 682// 683// other tests 684// 685//--------------------------------------------- 686void RBBITest::TestEmptyString() 687{ 688 UnicodeString text = ""; 689 UErrorCode status = U_ZERO_ERROR; 690 691 BITestData x(status); 692 ADD_DATACHUNK(x, "", 0, status); // Break at start of data 693 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 694 if (U_FAILURE(status)) 695 { 696 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); 697 return; 698 } 699 generalIteratorTest(*bi, x); 700 delete bi; 701} 702 703void RBBITest::TestGetAvailableLocales() 704{ 705 int32_t locCount = 0; 706 const Locale* locList = BreakIterator::getAvailableLocales(locCount); 707 708 if (locCount == 0) 709 dataerrln("getAvailableLocales() returned an empty list!"); 710 // Just make sure that it's returning good memory. 
711 int32_t i; 712 for (i = 0; i < locCount; ++i) { 713 logln(locList[i].getName()); 714 } 715} 716 717//Testing the BreakIterator::getDisplayName() function 718void RBBITest::TestGetDisplayName() 719{ 720 UnicodeString result; 721 722 BreakIterator::getDisplayName(Locale::getUS(), result); 723 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 724 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 725 + result); 726 727 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 728 if (result != "French (France)") 729 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 730 + result); 731} 732/** 733 * Test End Behaviour 734 * @bug 4068137 735 */ 736void RBBITest::TestEndBehaviour() 737{ 738 UErrorCode status = U_ZERO_ERROR; 739 UnicodeString testString("boo."); 740 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 741 if (U_FAILURE(status)) 742 { 743 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. 
- %s", u_errorName(status)); 744 return; 745 } 746 wb->setText(testString); 747 748 if (wb->first() != 0) 749 errln("Didn't get break at beginning of string."); 750 if (wb->next() != 3) 751 errln("Didn't get break before period in \"boo.\""); 752 if (wb->current() != 4 && wb->next() != 4) 753 errln("Didn't get break at end of string."); 754 delete wb; 755} 756/* 757 * @bug 4153072 758 */ 759void RBBITest::TestBug4153072() { 760 UErrorCode status = U_ZERO_ERROR; 761 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 762 if (U_FAILURE(status)) 763 { 764 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 765 return; 766 } 767 UnicodeString str("...Hello, World!..."); 768 int32_t begin = 3; 769 int32_t end = str.length() - 3; 770 UBool onBoundary; 771 772 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 773 iter->adoptText(textIterator); 774 int index; 775 // Note: with the switch to UText, there is no way to restrict the 776 // iteration range to begin at an index other than zero. 777 // String character iterators created with a non-zero bound are 778 // treated by RBBI as being empty. 779 for (index = -1; index < begin + 1; ++index) { 780 onBoundary = iter->isBoundary(index); 781 if (index == 0? !onBoundary : onBoundary) { 782 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 783 " and begin index = " + begin); 784 } 785 } 786 delete iter; 787} 788 789 790// 791// Test for problem reported by Ashok Matoria on 9 July 2007 792// One.<kSoftHyphen><kSpace>Two. 793// 794// Sentence break at start (0) and then on calling next() it breaks at 795// 'T' of "Two". Now, at this point if I do next() and 796// then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 
797// 798void RBBITest::TestBug5775() { 799 UErrorCode status = U_ZERO_ERROR; 800 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 801 TEST_ASSERT_SUCCESS(status); 802 if (U_FAILURE(status)) { 803 return; 804 } 805// Check for status first for better handling of no data errors. 806 TEST_ASSERT(bi != NULL); 807 if (bi == NULL) { 808 return; 809 } 810 811 UnicodeString s("One.\\u00ad Two.", -1, US_INV); 812 // 01234 56789 813 s = s.unescape(); 814 bi->setText(s); 815 int pos = bi->next(); 816 TEST_ASSERT(pos == 6); 817 pos = bi->next(); 818 TEST_ASSERT(pos == 10); 819 pos = bi->previous(); 820 TEST_ASSERT(pos == 6); 821 delete bi; 822} 823 824 825 826//------------------------------------------------------------------------------ 827// 828// RBBITest::Extended Run RBBI Tests from an external test data file 829// 830//------------------------------------------------------------------------------ 831 832struct TestParams { 833 BreakIterator *bi; 834 UnicodeString dataToBreak; 835 UVector32 *expectedBreaks; 836 UVector32 *srcLine; 837 UVector32 *srcCol; 838}; 839 840void RBBITest::executeTest(TestParams *t) { 841 int32_t bp; 842 int32_t prevBP; 843 int32_t i; 844 845 if (t->bi == NULL) { 846 return; 847 } 848 849 t->bi->setText(t->dataToBreak); 850 // 851 // Run the iterator forward 852 // 853 prevBP = -1; 854 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 855 if (prevBP == bp) { 856 // Fail for lack of forward progress. 857 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 858 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 859 break; 860 } 861 862 // Check that there were we didn't miss an expected break between the last one 863 // and this one. 
864 for (i=prevBP+1; i<bp; i++) { 865 if (t->expectedBreaks->elementAti(i) != 0) { 866 int expected[] = {0, i}; 867 printStringBreaks(t->dataToBreak, expected, 2); 868 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 869 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 870 } 871 } 872 873 // Check that the break we did find was expected 874 if (t->expectedBreaks->elementAti(bp) == 0) { 875 int expected[] = {0, bp}; 876 printStringBreaks(t->dataToBreak, expected, 2); 877 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 878 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 879 } else { 880 // The break was expected. 881 // Check that the {nnn} tag value is correct. 882 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 883 if (expectedTagVal == -1) { 884 expectedTagVal = 0; 885 } 886 int32_t line = t->srcLine->elementAti(bp); 887 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 888 if (rs != expectedTagVal) { 889 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 890 " Actual, Expected status = %4d, %4d", 891 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 892 } 893 } 894 895 896 prevBP = bp; 897 } 898 899 // Verify that there were no missed expected breaks after the last one found 900 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) { 901 if (t->expectedBreaks->elementAti(i) != 0) { 902 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 903 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 904 } 905 } 906 907 // 908 // Run the iterator backwards, verify that the same breaks are found. 909 // 910 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen. 911 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { 912 if (prevBP == bp) { 913 // Fail for lack of progress. 
914 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d", 915 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 916 break; 917 } 918 919 // Check that there were we didn't miss an expected break between the last one 920 // and this one. (UVector returns zeros for index out of bounds.) 921 for (i=prevBP-1; i>bp; i--) { 922 if (t->expectedBreaks->elementAti(i) != 0) { 923 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 924 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 925 } 926 } 927 928 // Check that the break we did find was expected 929 if (t->expectedBreaks->elementAti(bp) == 0) { 930 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 931 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); 932 } else { 933 // The break was expected. 934 // Check that the {nnn} tag value is correct. 935 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp); 936 if (expectedTagVal == -1) { 937 expectedTagVal = 0; 938 } 939 int line = t->srcLine->elementAti(bp); 940 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 941 if (rs != expectedTagVal) { 942 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 943 " Actual, Expected status = %4d, %4d", 944 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); 945 } 946 } 947 948 prevBP = bp; 949 } 950 951 // Verify that there were no missed breaks prior to the last one found 952 for (i=prevBP-1; i>=0; i--) { 953 if (t->expectedBreaks->elementAti(i) != 0) { 954 errln("Forward Itertion, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 955 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); 956 } 957 } 958 959 // Check isBoundary() 960 for (i=0; i<t->expectedBreaks->size(); i++) { 961 UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0); 962 UBool boundaryFound = t->bi->isBoundary(i); 963 if (boundaryExpected != boundaryFound) { 964 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n" 965 " Expected, Actual= %s, %s", 966 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), 967 boundaryExpected ? "true":"false", boundaryFound? "true" : "false"); 968 } 969 } 970 971 // Check following() 972 for (i=0; i<t->expectedBreaks->size(); i++) { 973 int32_t actualBreak = t->bi->following(i); 974 int32_t expectedBreak = BreakIterator::DONE; 975 for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) { 976 if (t->expectedBreaks->elementAti(j) != 0) { 977 expectedBreak = j; 978 break; 979 } 980 } 981 if (expectedBreak != actualBreak) { 982 errln("following(%d) incorrect. File line,col= %4d,%4d\n" 983 " Expected, Actual= %d, %d", 984 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak); 985 } 986 } 987 988 // Check preceding() 989 for (i=t->expectedBreaks->size(); i>=0; i--) { 990 int32_t actualBreak = t->bi->preceding(i); 991 int32_t expectedBreak = BreakIterator::DONE; 992 993 for (int32_t j=i-1; j >= 0; j--) { 994 if (t->expectedBreaks->elementAti(j) != 0) { 995 expectedBreak = j; 996 break; 997 } 998 } 999 if (expectedBreak != actualBreak) { 1000 errln("preceding(%d) incorrect. 
File line,col= %4d,%4d\n" 1001 " Expected, Actual= %d, %d", 1002 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak); 1003 } 1004 } 1005} 1006 1007 1008void RBBITest::TestExtended() { 1009#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1010 UErrorCode status = U_ZERO_ERROR; 1011 Locale locale(""); 1012 1013 UnicodeString rules; 1014 TestParams tp; 1015 tp.bi = NULL; 1016 tp.expectedBreaks = new UVector32(status); 1017 tp.srcLine = new UVector32(status); 1018 tp.srcCol = new UVector32(status); 1019 1020 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status); 1021 if (U_FAILURE(status)) { 1022 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 1023 } 1024 1025 1026 // 1027 // Open and read the test data file. 1028 // 1029 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1030 char testFileName[1000]; 1031 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1032 errln("Can't open test data. 
Path too long."); 1033 return; 1034 } 1035 strcpy(testFileName, testDataDirectory); 1036 strcat(testFileName, "rbbitst.txt"); 1037 1038 int len; 1039 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1040 if (U_FAILURE(status)) { 1041 return; /* something went wrong, error already output */ 1042 } 1043 1044 1045 1046 1047 // 1048 // Put the test data into a UnicodeString 1049 // 1050 UnicodeString testString(FALSE, testFile, len); 1051 1052 enum EParseState{ 1053 PARSE_COMMENT, 1054 PARSE_TAG, 1055 PARSE_DATA, 1056 PARSE_NUM 1057 } 1058 parseState = PARSE_TAG; 1059 1060 EParseState savedState = PARSE_TAG; 1061 1062 static const UChar CH_LF = 0x0a; 1063 static const UChar CH_CR = 0x0d; 1064 static const UChar CH_HASH = 0x23; 1065 /*static const UChar CH_PERIOD = 0x2e;*/ 1066 static const UChar CH_LT = 0x3c; 1067 static const UChar CH_GT = 0x3e; 1068 static const UChar CH_BACKSLASH = 0x5c; 1069 static const UChar CH_BULLET = 0x2022; 1070 1071 int32_t lineNum = 1; 1072 int32_t colStart = 0; 1073 int32_t column = 0; 1074 int32_t charIdx = 0; 1075 1076 int32_t tagValue = 0; // The numeric value of a <nnn> tag. 
1077 1078 for (charIdx = 0; charIdx < len; ) { 1079 status = U_ZERO_ERROR; 1080 UChar c = testString.charAt(charIdx); 1081 charIdx++; 1082 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) { 1083 // treat CRLF as a unit 1084 c = CH_LF; 1085 charIdx++; 1086 } 1087 if (c == CH_LF || c == CH_CR) { 1088 lineNum++; 1089 colStart = charIdx; 1090 } 1091 column = charIdx - colStart + 1; 1092 1093 switch (parseState) { 1094 case PARSE_COMMENT: 1095 if (c == 0x0a || c == 0x0d) { 1096 parseState = savedState; 1097 } 1098 break; 1099 1100 case PARSE_TAG: 1101 { 1102 if (c == CH_HASH) { 1103 parseState = PARSE_COMMENT; 1104 savedState = PARSE_TAG; 1105 break; 1106 } 1107 if (u_isUWhiteSpace(c)) { 1108 break; 1109 } 1110 if (testString.compare(charIdx-1, 6, "<word>") == 0) { 1111 delete tp.bi; 1112 tp.bi = BreakIterator::createWordInstance(locale, status); 1113 charIdx += 5; 1114 break; 1115 } 1116 if (testString.compare(charIdx-1, 6, "<char>") == 0) { 1117 delete tp.bi; 1118 tp.bi = BreakIterator::createCharacterInstance(locale, status); 1119 charIdx += 5; 1120 break; 1121 } 1122 if (testString.compare(charIdx-1, 6, "<line>") == 0) { 1123 delete tp.bi; 1124 tp.bi = BreakIterator::createLineInstance(locale, status); 1125 charIdx += 5; 1126 break; 1127 } 1128 if (testString.compare(charIdx-1, 6, "<sent>") == 0) { 1129 delete tp.bi; 1130 tp.bi = NULL; 1131 tp.bi = BreakIterator::createSentenceInstance(locale, status); 1132 charIdx += 5; 1133 break; 1134 } 1135 if (testString.compare(charIdx-1, 7, "<title>") == 0) { 1136 delete tp.bi; 1137 tp.bi = BreakIterator::createTitleInstance(locale, status); 1138 charIdx += 6; 1139 break; 1140 } 1141 1142 // <locale loc_name> 1143 localeMatcher.reset(testString); 1144 if (localeMatcher.lookingAt(charIdx-1, status)) { 1145 UnicodeString localeName = localeMatcher.group(1, status); 1146 char localeName8[100]; 1147 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 1148 locale = 
Locale::createFromName(localeName8); 1149 charIdx += localeMatcher.group(0, status).length() - 1; 1150 TEST_ASSERT_SUCCESS(status); 1151 break; 1152 } 1153 if (testString.compare(charIdx-1, 6, "<data>") == 0) { 1154 parseState = PARSE_DATA; 1155 charIdx += 5; 1156 tp.dataToBreak = ""; 1157 tp.expectedBreaks->removeAllElements(); 1158 tp.srcCol ->removeAllElements(); 1159 tp.srcLine->removeAllElements(); 1160 break; 1161 } 1162 1163 errln("line %d: Tag expected in test file.", lineNum); 1164 parseState = PARSE_COMMENT; 1165 savedState = PARSE_DATA; 1166 goto end_test; // Stop the test. 1167 } 1168 break; 1169 1170 case PARSE_DATA: 1171 if (c == CH_BULLET) { 1172 int32_t breakIdx = tp.dataToBreak.length(); 1173 tp.expectedBreaks->setSize(breakIdx+1); 1174 tp.expectedBreaks->setElementAt(-1, breakIdx); 1175 tp.srcLine->setSize(breakIdx+1); 1176 tp.srcLine->setElementAt(lineNum, breakIdx); 1177 tp.srcCol ->setSize(breakIdx+1); 1178 tp.srcCol ->setElementAt(column, breakIdx); 1179 break; 1180 } 1181 1182 if (testString.compare(charIdx-1, 7, "</data>") == 0) { 1183 // Add final entry to mappings from break location to source file position. 1184 // Need one extra because last break position returned is after the 1185 // last char in the data, not at the last char. 1186 tp.srcLine->addElement(lineNum, status); 1187 tp.srcCol ->addElement(column, status); 1188 1189 parseState = PARSE_TAG; 1190 charIdx += 6; 1191 1192 // RUN THE TEST! 1193 executeTest(&tp); 1194 break; 1195 } 1196 1197 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { 1198 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 1199 // Get the code point from the name and insert it into the test data. 1200 // (Damn, no API takes names in Unicode !!! 
1201 // we've got to take it back to char *) 1202 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx); 1203 int32_t nameLength = nameEndIdx - (charIdx+2); 1204 char charNameBuf[200]; 1205 UChar32 theChar = -1; 1206 if (nameEndIdx != -1) { 1207 UErrorCode status = U_ZERO_ERROR; 1208 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 1209 charNameBuf[sizeof(charNameBuf)-1] = 0; 1210 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 1211 if (U_FAILURE(status)) { 1212 theChar = -1; 1213 } 1214 } 1215 if (theChar == -1) { 1216 errln("Error in named character in test file at line %d, col %d", 1217 lineNum, column); 1218 } else { 1219 // Named code point was recognized. Insert it 1220 // into the test data. 1221 tp.dataToBreak.append(theChar); 1222 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1223 tp.srcLine->addElement(lineNum, status); 1224 tp.srcCol ->addElement(column, status); 1225 } 1226 } 1227 if (nameEndIdx > charIdx) { 1228 charIdx = nameEndIdx+1; 1229 1230 } 1231 break; 1232 } 1233 1234 1235 1236 1237 if (testString.compare(charIdx-1, 2, "<>") == 0) { 1238 charIdx++; 1239 int32_t breakIdx = tp.dataToBreak.length(); 1240 tp.expectedBreaks->setSize(breakIdx+1); 1241 tp.expectedBreaks->setElementAt(-1, breakIdx); 1242 tp.srcLine->setSize(breakIdx+1); 1243 tp.srcLine->setElementAt(lineNum, breakIdx); 1244 tp.srcCol ->setSize(breakIdx+1); 1245 tp.srcCol ->setElementAt(column, breakIdx); 1246 break; 1247 } 1248 1249 if (c == CH_LT) { 1250 tagValue = 0; 1251 parseState = PARSE_NUM; 1252 break; 1253 } 1254 1255 if (c == CH_HASH && column==3) { // TODO: why is column off so far? 1256 parseState = PARSE_COMMENT; 1257 savedState = PARSE_DATA; 1258 break; 1259 } 1260 1261 if (c == CH_BACKSLASH) { 1262 // Check for \ at end of line, a line continuation. 
1263 // Advance over (discard) the newline 1264 UChar32 cp = testString.char32At(charIdx); 1265 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) { 1266 // We have a CR LF 1267 // Need an extra increment of the input ptr to move over both of them 1268 charIdx++; 1269 } 1270 if (cp == CH_LF || cp == CH_CR) { 1271 lineNum++; 1272 colStart = charIdx; 1273 charIdx++; 1274 break; 1275 } 1276 1277 // Let unescape handle the back slash. 1278 cp = testString.unescapeAt(charIdx); 1279 if (cp != -1) { 1280 // Escape sequence was recognized. Insert the char 1281 // into the test data. 1282 tp.dataToBreak.append(cp); 1283 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1284 tp.srcLine->addElement(lineNum, status); 1285 tp.srcCol ->addElement(column, status); 1286 } 1287 break; 1288 } 1289 1290 1291 // Not a recognized backslash escape sequence. 1292 // Take the next char as a literal. 1293 // TODO: Should this be an error? 1294 c = testString.charAt(charIdx); 1295 charIdx = testString.moveIndex32(charIdx, 1); 1296 } 1297 1298 // Normal, non-escaped data char. 1299 tp.dataToBreak.append(c); 1300 1301 // Save the mapping from offset in the data to line/column numbers in 1302 // the original input file. Will be used for better error messages only. 1303 // If there's an expected break before this char, the slot in the mapping 1304 // vector will already be set for this char; don't overwrite it. 1305 if (tp.dataToBreak.length() > tp.srcLine->size()) { 1306 tp.srcLine->addElement(lineNum, status); 1307 tp.srcCol ->addElement(column, status); 1308 } 1309 break; 1310 1311 1312 case PARSE_NUM: 1313 // We are parsing an expected numeric tag value, like <1234>, 1314 // within a chunk of data. 1315 if (u_isUWhiteSpace(c)) { 1316 break; 1317 } 1318 1319 if (c == CH_GT) { 1320 // Finished the number. Add the info to the expected break data, 1321 // and switch parse state back to doing plain data. 
1322 parseState = PARSE_DATA; 1323 if (tagValue == 0) { 1324 tagValue = -1; 1325 } 1326 int32_t breakIdx = tp.dataToBreak.length(); 1327 tp.expectedBreaks->setSize(breakIdx+1); 1328 tp.expectedBreaks->setElementAt(tagValue, breakIdx); 1329 tp.srcLine->setSize(breakIdx+1); 1330 tp.srcLine->setElementAt(lineNum, breakIdx); 1331 tp.srcCol ->setSize(breakIdx+1); 1332 tp.srcCol ->setElementAt(column, breakIdx); 1333 break; 1334 } 1335 1336 if (u_isdigit(c)) { 1337 tagValue = tagValue*10 + u_charDigitValue(c); 1338 break; 1339 } 1340 1341 errln("Syntax Error in test file at line %d, col %d", 1342 lineNum, column); 1343 parseState = PARSE_COMMENT; 1344 goto end_test; // Stop the test 1345 break; 1346 } 1347 1348 1349 if (U_FAILURE(status)) { 1350 dataerrln("ICU Error %s while parsing test file at line %d.", 1351 u_errorName(status), lineNum); 1352 status = U_ZERO_ERROR; 1353 goto end_test; // Stop the test 1354 } 1355 1356 } 1357 1358end_test: 1359 delete tp.bi; 1360 delete tp.expectedBreaks; 1361 delete tp.srcLine; 1362 delete tp.srcCol; 1363 delete [] testFile; 1364#endif 1365} 1366 1367 1368//------------------------------------------------------------------------------- 1369// 1370// TestDictRules create a break iterator from source rules that includes a 1371// dictionary range. Regression for bug #7130. Source rules 1372// do not declare a break iterator type (word, line, sentence, etc. 1373// but the dictionary code, without a type, would loop. 
1374// 1375//------------------------------------------------------------------------------- 1376void RBBITest::TestDictRules() { 1377 const char *rules = "$dictionary = [a-z]; \n" 1378 "!!forward; \n" 1379 "$dictionary $dictionary; \n" 1380 "!!reverse; \n" 1381 "$dictionary $dictionary; \n"; 1382 const char *text = "aa"; 1383 UErrorCode status = U_ZERO_ERROR; 1384 UParseError parseError; 1385 1386 RuleBasedBreakIterator bi(rules, parseError, status); 1387 if (U_SUCCESS(status)) { 1388 UnicodeString utext = text; 1389 bi.setText(utext); 1390 int32_t position; 1391 int32_t loops; 1392 for (loops = 0; loops<10; loops++) { 1393 position = bi.next(); 1394 if (position == RuleBasedBreakIterator::DONE) { 1395 break; 1396 } 1397 } 1398 TEST_ASSERT(loops == 1); 1399 } else { 1400 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status)); 1401 } 1402} 1403 1404 1405 1406//------------------------------------------------------------------------------- 1407// 1408// ReadAndConvertFile Read a text data file, convert it to UChars, and 1409// return the datain one big UChar * buffer, which the caller must delete. 1410// 1411// parameters: 1412// fileName: the name of the file, with no directory part. The test data directory 1413// is assumed. 1414// ulen an out parameter, receives the actual length (in UChars) of the file data. 1415// encoding The file encoding. If the file contains a BOM, that will override the encoding 1416// specified here. The BOM, if it exists, will be stripped from the returned data. 1417// Pass NULL for the system default encoding. 1418// status 1419// returns: 1420// The file data, converted to UChar. 1421// The caller must delete this when done with 1422// delete [] theBuffer; 1423// 1424// TODO: This is a clone of RegexTest::ReadAndConvertFile. 1425// Move this function to some common place. 
1426// 1427//-------------------------------------------------------------------------------- 1428UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 1429 UChar *retPtr = NULL; 1430 char *fileBuf = NULL; 1431 UConverter* conv = NULL; 1432 FILE *f = NULL; 1433 1434 ulen = 0; 1435 if (U_FAILURE(status)) { 1436 return retPtr; 1437 } 1438 1439 // 1440 // Open the file. 1441 // 1442 f = fopen(fileName, "rb"); 1443 if (f == 0) { 1444 dataerrln("Error opening test data file %s\n", fileName); 1445 status = U_FILE_ACCESS_ERROR; 1446 return NULL; 1447 } 1448 // 1449 // Read it in 1450 // 1451 int fileSize; 1452 int amt_read; 1453 1454 fseek( f, 0, SEEK_END); 1455 fileSize = ftell(f); 1456 fileBuf = new char[fileSize]; 1457 fseek(f, 0, SEEK_SET); 1458 amt_read = fread(fileBuf, 1, fileSize, f); 1459 if (amt_read != fileSize || fileSize <= 0) { 1460 errln("Error reading test data file."); 1461 goto cleanUpAndReturn; 1462 } 1463 1464 // 1465 // Look for a Unicode Signature (BOM) on the data just read 1466 // 1467 int32_t signatureLength; 1468 const char * fileBufC; 1469 const char* bomEncoding; 1470 1471 fileBufC = fileBuf; 1472 bomEncoding = ucnv_detectUnicodeSignature( 1473 fileBuf, fileSize, &signatureLength, &status); 1474 if(bomEncoding!=NULL ){ 1475 fileBufC += signatureLength; 1476 fileSize -= signatureLength; 1477 encoding = bomEncoding; 1478 } 1479 1480 // 1481 // Open a converter to take the rule file to UTF-16 1482 // 1483 conv = ucnv_open(encoding, &status); 1484 if (U_FAILURE(status)) { 1485 goto cleanUpAndReturn; 1486 } 1487 1488 // 1489 // Convert the rules to UChar. 1490 // Preflight first to determine required buffer size. 1491 // 1492 ulen = ucnv_toUChars(conv, 1493 NULL, // dest, 1494 0, // destCapacity, 1495 fileBufC, 1496 fileSize, 1497 &status); 1498 if (status == U_BUFFER_OVERFLOW_ERROR) { 1499 // Buffer Overflow is expected from the preflight operation. 
1500 status = U_ZERO_ERROR; 1501 1502 retPtr = new UChar[ulen+1]; 1503 ucnv_toUChars(conv, 1504 retPtr, // dest, 1505 ulen+1, 1506 fileBufC, 1507 fileSize, 1508 &status); 1509 } 1510 1511cleanUpAndReturn: 1512 fclose(f); 1513 delete []fileBuf; 1514 ucnv_close(conv); 1515 if (U_FAILURE(status)) { 1516 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 1517 delete []retPtr; 1518 retPtr = 0; 1519 ulen = 0; 1520 }; 1521 return retPtr; 1522} 1523 1524 1525 1526//-------------------------------------------------------------------------------------------- 1527// 1528// Run tests from each of the boundary test data files distributed by the Unicode Consortium 1529// 1530//------------------------------------------------------------------------------------------- 1531void RBBITest::TestUnicodeFiles() { 1532 RuleBasedBreakIterator *bi; 1533 UErrorCode status = U_ZERO_ERROR; 1534 1535 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 1536 TEST_ASSERT_SUCCESS(status); 1537 if (U_SUCCESS(status)) { 1538 runUnicodeTestData("GraphemeBreakTest.txt", bi); 1539 } 1540 delete bi; 1541 1542 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); 1543 TEST_ASSERT_SUCCESS(status); 1544 if (U_SUCCESS(status)) { 1545 runUnicodeTestData("WordBreakTest.txt", bi); 1546 } 1547 delete bi; 1548 1549 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1550 TEST_ASSERT_SUCCESS(status); 1551 if (U_SUCCESS(status)) { 1552 runUnicodeTestData("SentenceBreakTest.txt", bi); 1553 } 1554 delete bi; 1555 1556 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); 1557 TEST_ASSERT_SUCCESS(status); 1558 if (U_SUCCESS(status)) { 1559 runUnicodeTestData("LineBreakTest.txt", bi); 1560 } 1561 delete bi; 1562} 1563 1564 1565//-------------------------------------------------------------------------------------------- 
1566// 1567// Run tests from one of the boundary test data files distributed by the Unicode Consortium 1568// 1569//------------------------------------------------------------------------------------------- 1570void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { 1571#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1572 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270 1573 UBool isTicket7270Fixed = !logKnownIssue("7270"); 1574 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt"); 1575 UErrorCode status = U_ZERO_ERROR; 1576 1577 // 1578 // Open and read the test data file, put it into a UnicodeString. 1579 // 1580 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1581 char testFileName[1000]; 1582 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1583 dataerrln("Can't open test data. Path too long."); 1584 return; 1585 } 1586 strcpy(testFileName, testDataDirectory); 1587 strcat(testFileName, fileName); 1588 1589 logln("Opening data file %s\n", fileName); 1590 1591 int len; 1592 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1593 if (status != U_FILE_ACCESS_ERROR) { 1594 TEST_ASSERT_SUCCESS(status); 1595 TEST_ASSERT(testFile != NULL); 1596 } 1597 if (U_FAILURE(status) || testFile == NULL) { 1598 return; /* something went wrong, error already output */ 1599 } 1600 UnicodeString testFileAsString(TRUE, testFile, len); 1601 1602 // 1603 // Parse the test data file using a regular expression. 1604 // Each kind of token is recognized in its own capture group; what type of item was scanned 1605 // is identified by which group had a match. 
1606 // 1607 // Caputure Group # 1 2 3 4 5 1608 // Parses this item: divide x hex digits comment \n unrecognized \n 1609 // 1610 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 1611 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 1612 UnicodeString testString; 1613 UVector32 breakPositions(status); 1614 int lineNumber = 1; 1615 TEST_ASSERT_SUCCESS(status); 1616 if (U_FAILURE(status)) { 1617 return; 1618 } 1619 1620 // 1621 // Scan through each test case, building up the string to be broken in testString, 1622 // and the positions that should be boundaries in the breakPositions vector. 1623 // 1624 int spin = 0; 1625 while (tokenMatcher.find()) { 1626 if(tokenMatcher.hitEnd()) { 1627 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for. 1628 This occurred when the text file was corrupt (wasn't marked as UTF-8) 1629 and caused an infinite loop here on EBCDIC systems! 1630 */ 1631 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin); 1632 // return; 1633 } 1634 if (tokenMatcher.start(1, status) >= 0) { 1635 // Scanned a divide sign, indicating a break position in the test data. 1636 if (testString.length()>0) { 1637 breakPositions.addElement(testString.length(), status); 1638 } 1639 } 1640 else if (tokenMatcher.start(2, status) >= 0) { 1641 // Scanned an 'x', meaning no break at this position in the test data 1642 // Nothing to be done here. 1643 } 1644 else if (tokenMatcher.start(3, status) >= 0) { 1645 // Scanned Hex digits. Convert them to binary, append to the character data string. 
1646 const UnicodeString &hexNumber = tokenMatcher.group(3, status); 1647 int length = hexNumber.length(); 1648 if (length<=8) { 1649 char buf[10]; 1650 hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 1651 UChar32 c = (UChar32)strtol(buf, NULL, 16); 1652 if (c<=0x10ffff) { 1653 testString.append(c); 1654 } else { 1655 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n", 1656 fileName, lineNumber); 1657 } 1658 } else { 1659 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 1660 fileName, lineNumber); 1661 } 1662 } 1663 else if (tokenMatcher.start(4, status) >= 0) { 1664 // Scanned to end of a line, possibly skipping over a comment in the process. 1665 // If the line from the file contained test data, run the test now. 1666 // 1667 if (testString.length() > 0) { 1668// TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data. 1669// Rule 8 1670// ZW SP* <break> 1671// is not yet implemented. 1672if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber || 1673 5202 == lineNumber || 1674 5214 == lineNumber || 1675 5246 == lineNumber || 1676 5298 == lineNumber || 1677 5302 == lineNumber ))) { 1678 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 1679} 1680 } 1681 1682 // Clear out this test case. 1683 // The string and breakPositions vector will be refilled as the next 1684 // test case is parsed. 1685 testString.remove(); 1686 breakPositions.removeAllElements(); 1687 lineNumber++; 1688 } else { 1689 // Scanner catchall. Something unrecognized appeared on the line. 1690 char token[16]; 1691 UnicodeString uToken = tokenMatcher.group(0, status); 1692 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 1693 token[sizeof(token)-1] = 0; 1694 errln("Syntax error in test data file \'%s\', line %d. 
Scanning \"%s\"\n", fileName, lineNumber, token); 1695 1696 // Clean up, in preparation for continuing with the next line. 1697 testString.remove(); 1698 breakPositions.removeAllElements(); 1699 lineNumber++; 1700 } 1701 TEST_ASSERT_SUCCESS(status); 1702 if (U_FAILURE(status)) { 1703 break; 1704 } 1705 } 1706 1707 delete [] testFile; 1708 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1709} 1710 1711//-------------------------------------------------------------------------------------------- 1712// 1713// checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 1714// test data files. Do only a simple, forward-only check - 1715// this test is mostly to check that ICU and the Unicode 1716// data agree with each other. 1717// 1718//-------------------------------------------------------------------------------------------- 1719void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 1720 const UnicodeString &testString, // Text data to be broken 1721 UVector32 *breakPositions, // Positions where breaks should be found. 1722 RuleBasedBreakIterator *bi) { 1723 int32_t pos; // Break Position in the test string 1724 int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 
1725 int32_t expectedPos; // Expected break position (index into test string) 1726 1727 bi->setText(testString); 1728 pos = bi->first(); 1729 pos = bi->next(); 1730 1731 while (pos != BreakIterator::DONE) { 1732 if (expectedI >= breakPositions->size()) { 1733 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1734 testFileName, lineNumber, pos); 1735 break; 1736 } 1737 expectedPos = breakPositions->elementAti(expectedI); 1738 if (pos < expectedPos) { 1739 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1740 testFileName, lineNumber, pos); 1741 break; 1742 } 1743 if (pos > expectedPos) { 1744 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1745 testFileName, lineNumber, expectedPos); 1746 break; 1747 } 1748 pos = bi->next(); 1749 expectedI++; 1750 } 1751 1752 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) { 1753 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1754 testFileName, lineNumber, breakPositions->elementAti(expectedI)); 1755 } 1756} 1757 1758 1759 1760#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1761//--------------------------------------------------------------------------------------- 1762// 1763// classs RBBIMonkeyKind 1764// 1765// Monkey Test for Break Iteration 1766// Abstract interface class. Concrete derived classes independently 1767// implement the break rules for different iterator types. 1768// 1769// The Monkey Test itself uses doesn't know which type of break iterator it is 1770// testing, but works purely in terms of the interface defined here. 1771// 1772//--------------------------------------------------------------------------------------- 1773class RBBIMonkeyKind { 1774public: 1775 // Return a UVector of UnicodeSets, representing the character classes used 1776 // for this type of iterator. 
1777 virtual UVector *charClasses() = 0; 1778 1779 // Set the test text on which subsequent calls to next() will operate 1780 virtual void setText(const UnicodeString &s) = 0; 1781 1782 // Find the next break postion, starting from the prev break position, or from zero. 1783 // Return -1 after reaching end of string. 1784 virtual int32_t next(int32_t i) = 0; 1785 1786 virtual ~RBBIMonkeyKind(); 1787 UErrorCode deferredStatus; 1788 1789 1790protected: 1791 RBBIMonkeyKind(); 1792 1793private: 1794}; 1795 1796RBBIMonkeyKind::RBBIMonkeyKind() { 1797 deferredStatus = U_ZERO_ERROR; 1798} 1799 1800RBBIMonkeyKind::~RBBIMonkeyKind() { 1801} 1802 1803 1804//---------------------------------------------------------------------------------------- 1805// 1806// Random Numbers. Similar to standard lib rand() and srand() 1807// Not using library to 1808// 1. Get same results on all platforms. 1809// 2. Get access to current seed, to more easily reproduce failures. 1810// 1811//--------------------------------------------------------------------------------------- 1812static uint32_t m_seed = 1; 1813 1814static uint32_t m_rand() 1815{ 1816 m_seed = m_seed * 1103515245 + 12345; 1817 return (uint32_t)(m_seed/65536) % 32768; 1818} 1819 1820 1821//------------------------------------------------------------------------------------------ 1822// 1823// class RBBICharMonkey Character (Grapheme Cluster) specific implementation 1824// of RBBIMonkeyKind. 
//
//------------------------------------------------------------------------------------------
class RBBICharMonkey: public RBBIMonkeyKind {
public:
    RBBICharMonkey();
    virtual          ~RBBICharMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual  int32_t  next(int32_t i);
private:
    UVector   *fSets;           // The character classes returned by charClasses().

    // One set per character class used by the rules in next().
    UnicodeSet  *fCRLFSet;
    UnicodeSet  *fControlSet;
    UnicodeSet  *fExtendSet;
    UnicodeSet  *fRegionalIndicatorSet;
    UnicodeSet  *fPrependSet;
    UnicodeSet  *fSpacingSet;
    UnicodeSet  *fLSet;
    UnicodeSet  *fVSet;
    UnicodeSet  *fTSet;
    UnicodeSet  *fLVSet;
    UnicodeSet  *fLVTSet;
    UnicodeSet  *fHangulSet;    // Union of the L, V, T, LV and LVT sets.
    UnicodeSet  *fAnySet;       // All code points, 0 .. 0x10ffff.

    const UnicodeString *fText; // The text being tested.  Set by setText(); not owned.
};


// Build the character class sets.  Any ICU failure is saved in deferredStatus,
// which next() checks before doing any work.
RBBICharMonkey::RBBICharMonkey() {
    UErrorCode  status = U_ZERO_ERROR;

    fText = NULL;

    fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
    fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
    fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
    fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
    fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
    fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
    fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
    fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
    fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
    fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
    fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
    fHangulSet  = new UnicodeSet();
    fHangulSet->addAll(*fLSet);
    fHangulSet->addAll(*fVSet);
    fHangulSet->addAll(*fTSet);
    fHangulSet->addAll(*fLVSet);
    fHangulSet->addAll(*fLVTSet);
    fAnySet     = new UnicodeSet(0, 0x10ffff);

    // The individual Hangul sets are not added to fSets; only their union is.
    fSets       = new UVector(status);
    fSets->addElement(fCRLFSet,    status);
    fSets->addElement(fControlSet, status);
    fSets->addElement(fExtendSet,  status);
    fSets->addElement(fRegionalIndicatorSet, status);
    // The Prepend class is only offered when it has members.
    if (!fPrependSet->isEmpty()) {
        fSets->addElement(fPrependSet, status);
    }
    fSets->addElement(fSpacingSet, status);
    fSets->addElement(fHangulSet,  status);
    fSets->addElement(fAnySet,     status);
    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}


// Remember the text to test.  Only the address is kept, so the string must
// stay alive for as long as next() is called.
void RBBICharMonkey::setText(const UnicodeString &s) {
    fText = &s;
}



// Find the next grapheme cluster boundary after prevPos, applying the rules
// below in order.  Returns the boundary position, or -1 if prevPos is at or
// past the end of the text or if construction of this object failed.
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;
    (void)p0;   // suppress set but not used warning.
    (void)c0;

    // Loop runs once per "significant" character position in the input text.
    //   "continue" means no break before p2, keep scanning;
    //   "break" means a boundary was found before p2.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one codepoint
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  GB3   CR x LF
        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            continue;
        }

        // Rule (GB4).   ( Control | CR | LF ) <break>
        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
            break;
        }

        // Rule (GB5)    <break>  ( Control | CR | LF )
        //
        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            break;
        }


        // Rule (GB6)  L x ( L | V | LV | LVT )
        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            continue;
        }

        // Rule (GB7)    ( LV | V )  x  ( V | T )
        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            continue;
        }

        // Rule (GB8)    ( LVT | T)  x T
        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            continue;
        }

        // Rule (GB8a)    Regional_Indicator x Regional_Indicator
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            continue;
        }

        // Rule (GB9)    x Extend
        if (fExtendSet->contains(c2))  {
            continue;
        }

        // Rule (GB9a)   x  SpacingMark
        if (fSpacingSet->contains(c2)) {
            continue;
        }

        // Rule (GB9b)   Prepend x
        if (fPrependSet->contains(c1)) {
            continue;
        }

        // Rule (GB10)  Any  <break>  Any
        break;
    }

    breakPos = p2;
    return breakPos;
}



// Return the character classes built by the constructor.  The vector is
// deleted by the destructor of this object.
UVector  *RBBICharMonkey::charClasses() {
    return fSets;
}


// Delete the vector and every set allocated by the constructor.
RBBICharMonkey::~RBBICharMonkey() {
    delete fSets;
    delete fCRLFSet;
    delete fControlSet;
    delete fExtendSet;
    delete fRegionalIndicatorSet;
    delete fPrependSet;
    delete fSpacingSet;
    delete fLSet;
    delete fVSet;
    delete fTSet;
    delete fLVSet;
    delete fLVTSet;
    delete fHangulSet;
    delete fAnySet;
}

//------------------------------------------------------------------------------------------
//
//   class RBBIWordMonkey      Word Break specific implementation
//                             of RBBIMonkeyKind.
//
//------------------------------------------------------------------------------------------
class RBBIWordMonkey: public RBBIMonkeyKind {
public:
    RBBIWordMonkey();
    virtual          ~RBBIWordMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual int32_t   next(int32_t i);
private:
    UVector      *fSets;

    // One set per character class.  Most are named after a Word_Break property value.
    UnicodeSet  *fCRSet;
    UnicodeSet  *fLFSet;
    UnicodeSet  *fNewlineSet;
    UnicodeSet  *fRegionalIndicatorSet;
    UnicodeSet  *fKatakanaSet;
    UnicodeSet  *fHebrew_LetterSet;
    UnicodeSet  *fALetterSet;
    // TODO(jungshik): Do we still need this change?
    // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
    UnicodeSet  *fSingle_QuoteSet;
    UnicodeSet  *fDouble_QuoteSet;
    UnicodeSet  *fMidNumLetSet;
    UnicodeSet  *fMidLetterSet;
    UnicodeSet  *fMidNumSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fOtherSet;
    UnicodeSet  *fExtendSet;
    UnicodeSet  *fExtendNumLetSet;
    UnicodeSet  *fDictionaryCjkSet;

    const UnicodeString  *fText;
};


RBBIWordMonkey::RBBIWordMonkey()
{
    UErrorCode  status = U_ZERO_ERROR;

    fSets            = new UVector(status);

    fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
    fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
    fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
    fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
    // Exclude Hangul syllables from ALetterSet during testing.
    // Leave CJK dictionary characters out from the monkey tests!
2095#if 0 2096 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" 2097 "[\\p{Line_Break = Complex_Context}" 2098 "-\\p{Grapheme_Cluster_Break = Extend}" 2099 "-\\p{Grapheme_Cluster_Break = Control}" 2100 "]]", 2101 status); 2102#endif 2103 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status); 2104 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); 2105 fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status); 2106 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 2107 fALetterSet->removeAll(*fDictionaryCjkSet); 2108 fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status); 2109 fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status); 2110 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); 2111 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); 2112 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); 2113 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test 2114 // we should figure out why 2115 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); 2116 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); 2117 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); 2118 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); 2119 2120 fOtherSet = new UnicodeSet(); 2121 if(U_FAILURE(status)) { 2122 deferredStatus = status; 2123 return; 2124 } 2125 2126 fOtherSet->complement(); 2127 fOtherSet->removeAll(*fCRSet); 2128 fOtherSet->removeAll(*fLFSet); 2129 
fOtherSet->removeAll(*fNewlineSet); 2130 fOtherSet->removeAll(*fKatakanaSet); 2131 fOtherSet->removeAll(*fHebrew_LetterSet); 2132 fOtherSet->removeAll(*fALetterSet); 2133 fOtherSet->removeAll(*fSingle_QuoteSet); 2134 fOtherSet->removeAll(*fDouble_QuoteSet); 2135 fOtherSet->removeAll(*fMidLetterSet); 2136 fOtherSet->removeAll(*fMidNumSet); 2137 fOtherSet->removeAll(*fNumericSet); 2138 fOtherSet->removeAll(*fExtendNumLetSet); 2139 fOtherSet->removeAll(*fFormatSet); 2140 fOtherSet->removeAll(*fExtendSet); 2141 fOtherSet->removeAll(*fRegionalIndicatorSet); 2142 // Inhibit dictionary characters from being tested at all. 2143 fOtherSet->removeAll(*fDictionaryCjkSet); 2144 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); 2145 2146 fSets->addElement(fCRSet, status); 2147 fSets->addElement(fLFSet, status); 2148 fSets->addElement(fNewlineSet, status); 2149 fSets->addElement(fRegionalIndicatorSet, status); 2150 fSets->addElement(fHebrew_LetterSet, status); 2151 fSets->addElement(fALetterSet, status); 2152 fSets->addElement(fSingle_QuoteSet, status); 2153 fSets->addElement(fDouble_QuoteSet, status); 2154 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana 2155 fSets->addElement(fMidLetterSet, status); 2156 fSets->addElement(fMidNumLetSet, status); 2157 fSets->addElement(fMidNumSet, status); 2158 fSets->addElement(fNumericSet, status); 2159 fSets->addElement(fFormatSet, status); 2160 fSets->addElement(fExtendSet, status); 2161 fSets->addElement(fOtherSet, status); 2162 fSets->addElement(fExtendNumLetSet, status); 2163 2164 if (U_FAILURE(status)) { 2165 deferredStatus = status; 2166 } 2167} 2168 2169void RBBIWordMonkey::setText(const UnicodeString &s) { 2170 fText = &s; 2171} 2172 2173 2174int32_t RBBIWordMonkey::next(int32_t prevPos) { 2175 int p0, p1, p2, p3; // Indices of the significant code points around the 2176 // break position being tested. 
The candidate break 2177 // location is before p2. 2178 2179 int breakPos = -1; 2180 2181 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2182 2183 if (U_FAILURE(deferredStatus)) { 2184 return -1; 2185 } 2186 2187 // Prev break at end of string. return DONE. 2188 if (prevPos >= fText->length()) { 2189 return -1; 2190 } 2191 p0 = p1 = p2 = p3 = prevPos; 2192 c3 = fText->char32At(prevPos); 2193 c0 = c1 = c2 = 0; 2194 (void)p0; // Suppress set but not used warning. 2195 2196 // Loop runs once per "significant" character position in the input text. 2197 for (;;) { 2198 // Move all of the positions forward in the input string. 2199 p0 = p1; c0 = c1; 2200 p1 = p2; c1 = c2; 2201 p2 = p3; c2 = c3; 2202 2203 // Advancd p3 by X(Extend | Format)* Rule 4 2204 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change) 2205 do { 2206 p3 = fText->moveIndex32(p3, 1); 2207 c3 = fText->char32At(p3); 2208 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2209 break; 2210 }; 2211 } 2212 while (fFormatSet->contains(c3) || fExtendSet->contains(c3)); 2213 2214 2215 if (p1 == p2) { 2216 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2217 continue; 2218 } 2219 if (p2 == fText->length()) { 2220 // Reached end of string. Always a break position. 2221 break; 2222 } 2223 2224 // Rule (3) CR x LF 2225 // No Extend or Format characters may appear between the CR and LF, 2226 // which requires the additional check for p2 immediately following p1. 2227 // 2228 if (c1==0x0D && c2==0x0A) { 2229 continue; 2230 } 2231 2232 // Rule (3a) Break before and after newlines (including CR and LF) 2233 // 2234 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { 2235 break; 2236 }; 2237 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2238 break; 2239 }; 2240 2241 // Rule (5). 
(ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter) 2242 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2243 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2244 continue; 2245 } 2246 2247 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter) 2248 // 2249 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2250 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 2251 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) { 2252 continue; 2253 } 2254 2255 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter) 2256 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) && 2257 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 2258 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2259 continue; 2260 } 2261 2262 // Rule (7a) Hebrew_Letter x Single_Quote 2263 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) { 2264 continue; 2265 } 2266 2267 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter 2268 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) { 2269 continue; 2270 } 2271 2272 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter 2273 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) { 2274 continue; 2275 } 2276 2277 // Rule (8) Numeric x Numeric 2278 if (fNumericSet->contains(c1) && 2279 fNumericSet->contains(c2)) { 2280 continue; 2281 } 2282 2283 // Rule (9) (ALetter | Hebrew_Letter) x Numeric 2284 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2285 fNumericSet->contains(c2)) { 2286 continue; 2287 } 2288 2289 // Rule (10) Numeric x (ALetter | Hebrew_Letter) 2290 if (fNumericSet->contains(c1) && 2291 
(fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2292 continue; 2293 } 2294 2295 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric 2296 if (fNumericSet->contains(c0) && 2297 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 2298 fNumericSet->contains(c2)) { 2299 continue; 2300 } 2301 2302 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric 2303 if (fNumericSet->contains(c1) && 2304 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 2305 fNumericSet->contains(c3)) { 2306 continue; 2307 } 2308 2309 // Rule (13) Katakana x Katakana 2310 if (fKatakanaSet->contains(c1) && 2311 fKatakanaSet->contains(c2)) { 2312 continue; 2313 } 2314 2315 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet 2316 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) || 2317 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && 2318 fExtendNumLetSet->contains(c2)) { 2319 continue; 2320 } 2321 2322 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana) 2323 if (fExtendNumLetSet->contains(c1) && 2324 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) || 2325 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) { 2326 continue; 2327 } 2328 2329 // Rule 13c 2330 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { 2331 continue; 2332 } 2333 2334 // Rule 14. Break found here. 
break;
    }

    breakPos = p2;
    return breakPos;
}


// charClasses()  Return the vector of character class sets built by the constructor.
//                The returned vector is the object's own fSets, not a copy.
UVector  *RBBIWordMonkey::charClasses() {
    return fSets;
}


// Destructor.  Deletes the class vector and every UnicodeSet allocated by the
//              constructor, including the sets that were never added to fSets
//              (fKatakanaSet, fDictionaryCjkSet).
RBBIWordMonkey::~RBBIWordMonkey() {
    delete fSets;
    delete fCRSet;
    delete fLFSet;
    delete fNewlineSet;
    delete fKatakanaSet;
    delete fHebrew_LetterSet;
    delete fALetterSet;
    delete fSingle_QuoteSet;
    delete fDouble_QuoteSet;
    delete fMidNumLetSet;
    delete fMidLetterSet;
    delete fMidNumSet;
    delete fNumericSet;
    delete fFormatSet;
    delete fExtendSet;
    delete fExtendNumLetSet;
    delete fRegionalIndicatorSet;
    delete fDictionaryCjkSet;
    delete fOtherSet;
}




//------------------------------------------------------------------------------------------
//
//   class RBBISentMonkey      Sentence Break specific implementation
//                             of RBBIMonkeyKind.
//
//------------------------------------------------------------------------------------------
class RBBISentMonkey: public RBBIMonkeyKind {
public:
    RBBISentMonkey();
    virtual          ~RBBISentMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual int32_t   next(int32_t i);
private:
    // Helpers for walking fText by "significant" character, i.e. skipping
    // over Extend and Format characters.
    int               moveBack(int posFrom);
    int               moveForward(int posFrom);
    UChar32           cAt(int pos);    // Bounds-checked character access; -1 when out of range.

    UVector      *fSets;               // The character classes handed out by charClasses().

    UnicodeSet  *fSepSet;
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fSpSet;
    UnicodeSet  *fLowerSet;
    UnicodeSet  *fUpperSet;
    UnicodeSet  *fOLetterSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fATermSet;
    UnicodeSet  *fSContinueSet;
    UnicodeSet  *fSTermSet;
    UnicodeSet  *fCloseSet;
    UnicodeSet  *fOtherSet;
    UnicodeSet  *fExtendSet;

    const UnicodeString  *fText;       // Text being scanned; set by setText(), not owned.

};

// Constructor.  Builds one UnicodeSet per Sentence_Break class, derives the
//               "Other" set as the complement of all of them, and registers
//               every set in fSets.
//               Any failure is recorded in deferredStatus rather than reported here.
//               If building the property sets fails, the constructor returns early
//               with fSets and fOtherSet left empty.
RBBISentMonkey::RBBISentMonkey()
{
    UErrorCode  status = U_ZERO_ERROR;

    fSets            = new UVector(status);

    //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
    //                       set and made into character classes of their own.  For the monkey impl,
    //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
    fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
    fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
    fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
    fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
    fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
    fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
    fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
    fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
    fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
    fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
    fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
    fOtherSet        = new UnicodeSet();

    if(U_FAILURE(status)) {
      deferredStatus = status;
      return;
    }

    // "Other" is everything that does not belong to one of the explicit classes.
    fOtherSet->complement();
    fOtherSet->removeAll(*fSepSet);
    fOtherSet->removeAll(*fFormatSet);
    fOtherSet->removeAll(*fSpSet);
    fOtherSet->removeAll(*fLowerSet);
    fOtherSet->removeAll(*fUpperSet);
    fOtherSet->removeAll(*fOLetterSet);
    fOtherSet->removeAll(*fNumericSet);
    fOtherSet->removeAll(*fATermSet);
    fOtherSet->removeAll(*fSContinueSet);
    fOtherSet->removeAll(*fSTermSet);
    fOtherSet->removeAll(*fCloseSet);
fOtherSet->removeAll(*fExtendSet);

    fSets->addElement(fSepSet,       status);
    fSets->addElement(fFormatSet,    status);
    fSets->addElement(fSpSet,        status);
    fSets->addElement(fLowerSet,     status);
    fSets->addElement(fUpperSet,     status);
    fSets->addElement(fOLetterSet,   status);
    fSets->addElement(fNumericSet,   status);
    fSets->addElement(fATermSet,     status);
    fSets->addElement(fSContinueSet, status);
    fSets->addElement(fSTermSet,     status);
    fSets->addElement(fCloseSet,     status);
    fSets->addElement(fOtherSet,     status);
    fSets->addElement(fExtendSet,    status);

    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}



// setText()  Remember the text to be scanned by next().
//            Only the address of s is stored, no copy is made.
void RBBISentMonkey::setText(const UnicodeString &s) {
    fText       = &s;
}

// charClasses()  Return the vector of character class sets built by the constructor.
//                The returned vector is the object's own fSets, not a copy.
UVector  *RBBISentMonkey::charClasses() {
    return fSets;
}


//  moveBack()   Find the "significant" code point preceding the index i.
//               Skips over ($Extend | $Format)* .
//               Returns -1 when i is already at (or before) the start of the text.
//               The backwards scan stops at index 0 even if the character there
//               is an Extend or Format character.
//
int RBBISentMonkey::moveBack(int i) {
    if (i <= 0) {
        return -1;
    }
    UChar32   c;
    int32_t   j = i;
    do {
        j = fText->moveIndex32(j, -1);
        c = fText->char32At(j);
    }
    while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
    return j;

 }


//  moveForward()   Find the "significant" code point following the index i.
//                  Skips over ($Extend | $Format)* .
//                  Returns the text length when i is at or beyond the end of the
//                  text.  The scan stops on its own at the end of the text,
//                  because cAt() yields -1 there and -1 is in neither set.
//
int RBBISentMonkey::moveForward(int i) {
    if (i>=fText->length()) {
        return fText->length();
    }
    UChar32   c;
    int32_t   j = i;
    do {
        j = fText->moveIndex32(j, 1);
        c = cAt(j);
    }
    while (fFormatSet->contains(c) || fExtendSet->contains(c));
    return j;
}

//  cAt()   Bounds-checked access to the code point at index pos.
//          Returns -1 for any index outside [0, length).
UChar32 RBBISentMonkey::cAt(int pos) {
    if (pos<0 || pos>=fText->length()) {
        return -1;
    } else {
        return fText->char32At(pos);
    }
}

// next()  Reference implementation of the sentence break rules.
//         Starting from the previous boundary prevPos, applies the rules below,
//         in order, at each successive "significant" position (Extend and Format
//         characters are skipped by moveForward() / moveBack()).
//         Returns the index of the first position at which a rule says "break",
//         or -1 if construction failed (deferredStatus) or prevPos is already
//         at or beyond the end of the text.
int32_t RBBISentMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 c;

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;
    (void)p0;     // Suppress set but not used warning.

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format)*   Rule 4
        p3 = moveForward(p3);
        c3 = cAt(p3);

        // Rule (3)  CR x LF
        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
            continue;
        }

        // Rule (4).   Sep  <break>
        if (fSepSet->contains(c1)) {
            p2 = p1+1;   // Separators don't combine with Extend or Format.
            break;
        }

        if (p2 >= fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        if (p2 == prevPos) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }

        // Rule (6).   ATerm x Numeric
        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (7).  Upper ATerm  x  Upper
        if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
            continue;
        }

        // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
        //           Note:  STerm | ATerm are added to the negated part of the expression by a
        //                  note to the Unicode 5.0 documents.
        //           First scan backwards from p1 over Sp* then Close* looking for the ATerm,
        //           then scan forwards from p2 to the first character of the listed classes
        //           (or the end of text) and check whether it is Lower.
        int p8 = p1;
        while (fSpSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        while (fCloseSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        if (fATermSet->contains(cAt(p8))) {
            p8=p2;
            for (;;) {
                c = cAt(p8);
                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
                    fLowerSet->contains(c) || fSepSet->contains(c) ||
                    fATermSet->contains(c) || fSTermSet->contains(c))  {
                    break;
                }
                p8 = moveForward(p8);
            }
            if (fLowerSet->contains(cAt(p8))) {
                continue;
            }
        }

        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
            p8 = p1;
            while (fSpSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            while (fCloseSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            c = cAt(p8);
            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
                continue;
            }
        }

        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
        int p9 = p1;
        while (fCloseSet->contains(cAt(p9))) {
            p9 = moveBack(p9);
        }
        c = cAt(p9);
        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (10)  (STerm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
        int p10 = p1;
        while (fSpSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        while (fCloseSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
        int p11 = p1;
        if (fSepSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fSpSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fCloseSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
            break;
        }

        //  Rule (12)  Any x Any
        continue;
    }
    breakPos = p2;
    return breakPos;
}

// Destructor.  Deletes the class vector and every UnicodeSet allocated by the constructor.
RBBISentMonkey::~RBBISentMonkey() {
    delete fSets;
    delete fSepSet;
    delete fFormatSet;
    delete fSpSet;
    delete fLowerSet;
    delete fUpperSet;
    delete fOLetterSet;
    delete fNumericSet;
    delete fATermSet;
    delete fSContinueSet;
    delete fSTermSet;
    delete fCloseSet;
    delete fOtherSet;
    delete fExtendSet;
}



//-------------------------------------------------------------------------------------------
//
//  RBBILineMonkey
//
//-------------------------------------------------------------------------------------------

class RBBILineMonkey: public RBBIMonkeyKind {
public:
    RBBILineMonkey();
    virtual          ~RBBILineMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual  int32_t  next(int32_t i);
    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
private:
    UVector      *fSets;

    UnicodeSet  *fBK;
    UnicodeSet  *fCR;
    UnicodeSet  *fLF;
    UnicodeSet  *fCM;
    UnicodeSet  *fNL;
    UnicodeSet  *fSG;
    UnicodeSet  *fWJ;
    UnicodeSet  *fZW;
    UnicodeSet  *fGL;
    UnicodeSet  *fCB;
    UnicodeSet  *fSP;
    UnicodeSet  *fB2;
    UnicodeSet  *fBA;
    UnicodeSet  *fBB;
    UnicodeSet  *fHY;
    UnicodeSet  *fH2;
    UnicodeSet  *fH3;
    UnicodeSet  *fCL;
    UnicodeSet  *fCP;
    UnicodeSet  *fEX;
    UnicodeSet  *fIN;
    UnicodeSet  *fJL;
    UnicodeSet  *fJV;
    UnicodeSet  *fJT;
UnicodeSet *fNS; 2739 UnicodeSet *fOP; 2740 UnicodeSet *fQU; 2741 UnicodeSet *fIS; 2742 UnicodeSet *fNU; 2743 UnicodeSet *fPO; 2744 UnicodeSet *fPR; 2745 UnicodeSet *fSY; 2746 UnicodeSet *fAI; 2747 UnicodeSet *fAL; 2748 UnicodeSet *fCJ; 2749 UnicodeSet *fHL; 2750 UnicodeSet *fID; 2751 UnicodeSet *fRI; 2752 UnicodeSet *fSA; 2753 UnicodeSet *fXX; 2754 2755 BreakIterator *fCharBI; 2756 const UnicodeString *fText; 2757 RegexMatcher *fNumberMatcher; 2758}; 2759 2760 2761RBBILineMonkey::RBBILineMonkey() 2762{ 2763 UErrorCode status = U_ZERO_ERROR; 2764 2765 fSets = new UVector(status); 2766 2767 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); 2768 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); 2769 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); 2770 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); 2771 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); 2772 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); 2773 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); 2774 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); 2775 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); 2776 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); 2777 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); 2778 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); 2779 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); 2780 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); 2781 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); 2782 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); 2783 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status); 2784 fCP = 
new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status); 2785 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 2786 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 2787 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 2788 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 2789 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 2790 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 2791 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 2792 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 2793 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 2794 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 2795 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 2796 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 2797 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 2798 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 2799 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 2800 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status); 2801 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status); 2802 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 2803 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status); 2804 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status); 2805 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 2806 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 2807 2808 if (U_FAILURE(status)) { 2809 deferredStatus = status; 2810 fCharBI = NULL; 2811 fNumberMatcher = NULL; 2812 return; 
2813 } 2814 2815 fAL->addAll(*fXX); // Default behavior for XX is identical to AL 2816 fAL->addAll(*fAI); // Default behavior for AI is identical to AL 2817 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL 2818 fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 2819 2820 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS. 2821 2822 fSets->addElement(fBK, status); 2823 fSets->addElement(fCR, status); 2824 fSets->addElement(fLF, status); 2825 fSets->addElement(fCM, status); 2826 fSets->addElement(fNL, status); 2827 fSets->addElement(fWJ, status); 2828 fSets->addElement(fZW, status); 2829 fSets->addElement(fGL, status); 2830 fSets->addElement(fCB, status); 2831 fSets->addElement(fSP, status); 2832 fSets->addElement(fB2, status); 2833 fSets->addElement(fBA, status); 2834 fSets->addElement(fBB, status); 2835 fSets->addElement(fHY, status); 2836 fSets->addElement(fH2, status); 2837 fSets->addElement(fH3, status); 2838 fSets->addElement(fCL, status); 2839 fSets->addElement(fCP, status); 2840 fSets->addElement(fEX, status); 2841 fSets->addElement(fIN, status); 2842 fSets->addElement(fJL, status); 2843 fSets->addElement(fJT, status); 2844 fSets->addElement(fJV, status); 2845 fSets->addElement(fNS, status); 2846 fSets->addElement(fOP, status); 2847 fSets->addElement(fQU, status); 2848 fSets->addElement(fIS, status); 2849 fSets->addElement(fNU, status); 2850 fSets->addElement(fPO, status); 2851 fSets->addElement(fPR, status); 2852 fSets->addElement(fSY, status); 2853 fSets->addElement(fAI, status); 2854 fSets->addElement(fAL, status); 2855 fSets->addElement(fHL, status); 2856 fSets->addElement(fID, status); 2857 fSets->addElement(fWJ, status); 2858 fSets->addElement(fRI, status); 2859 fSets->addElement(fSA, status); 2860 fSets->addElement(fSG, status); 2861 2862 const char *rules = 2863 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" 2864 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" 
2865 "\\p{Line_Break=NU}\\p{Line_Break=CM}*" 2866 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*" 2867 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?" 2868 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"; 2869 2870 fNumberMatcher = new RegexMatcher( 2871 UnicodeString(rules, -1, US_INV), 0, status); 2872 2873 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 2874 2875 if (U_FAILURE(status)) { 2876 deferredStatus = status; 2877 } 2878} 2879 2880 2881void RBBILineMonkey::setText(const UnicodeString &s) { 2882 fText = &s; 2883 fCharBI->setText(s); 2884 fNumberMatcher->reset(s); 2885} 2886 2887// 2888// rule9Adjust 2889// Line Break TR rules 9 and 10 implementation. 2890// This deals with combining marks and other sequences that 2891// that must be treated as if they were something other than what they actually are. 2892// 2893// This is factored out into a separate function because it must be applied twice for 2894// each potential break, once to the chars before the position being checked, then 2895// again to the text following the possible break. 2896// 2897void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) { 2898 if (pos == -1) { 2899 // Invalid initial position. Happens during the warmup iteration of the 2900 // main loop in next(). 2901 return; 2902 } 2903 2904 int32_t nPos = *nextPos; 2905 2906 // LB 9 Keep combining sequences together. 2907 // advance over any CM class chars. Note that Line Break CM is different 2908 // from the normal Grapheme Extend property. 
2909 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d || 2910 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) { 2911 for (;;) { 2912 *nextChar = fText->char32At(nPos); 2913 if (!fCM->contains(*nextChar)) { 2914 break; 2915 } 2916 nPos = fText->moveIndex32(nPos, 1); 2917 } 2918 } 2919 2920 2921 // LB 9 Treat X CM* as if it were x. 2922 // No explicit action required. 2923 2924 // LB 10 Treat any remaining combining mark as AL 2925 if (fCM->contains(*posChar)) { 2926 *posChar = 0x41; // thisChar = 'A'; 2927 } 2928 2929 // Push the updated nextPos and nextChar back to our caller. 2930 // This only makes a difference if posChar got bigger by consuming a 2931 // combining sequence. 2932 *nextPos = nPos; 2933 *nextChar = fText->char32At(nPos); 2934} 2935 2936 2937 2938int32_t RBBILineMonkey::next(int32_t startPos) { 2939 UErrorCode status = U_ZERO_ERROR; 2940 int32_t pos; // Index of the char following a potential break position 2941 UChar32 thisChar; // Character at above position "pos" 2942 2943 int32_t prevPos; // Index of the char preceding a potential break position 2944 UChar32 prevChar; // Character at above position. Note that prevChar 2945 // and thisChar may not be adjacent because combining 2946 // characters between them will be ignored. 2947 2948 int32_t prevPosX2; // Second previous character. Wider context for LB21a. 2949 UChar32 prevCharX2; 2950 2951 int32_t nextPos; // Index of the next character following pos. 2952 // Usually skips over combining marks. 2953 int32_t nextCPPos; // Index of the code point following "pos." 2954 // May point to a combining mark. 2955 int32_t tPos; // temp value. 2956 UChar32 c; 2957 2958 if (U_FAILURE(deferredStatus)) { 2959 return -1; 2960 } 2961 2962 if (startPos >= fText->length()) { 2963 return -1; 2964 } 2965 2966 2967 // Initial values for loop. 
Loop will run the first time without finding breaks, 2968 // while the invalid values shift out and the "this" and 2969 // "prev" positions are filled in with good values. 2970 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration. 2971 thisChar = prevChar = prevCharX2 = 0; 2972 nextPos = nextCPPos = startPos; 2973 2974 2975 // Loop runs once per position in the test text, until a break position 2976 // is found. 2977 for (;;) { 2978 prevPosX2 = prevPos; 2979 prevCharX2 = prevChar; 2980 2981 prevPos = pos; 2982 prevChar = thisChar; 2983 2984 pos = nextPos; 2985 thisChar = fText->char32At(pos); 2986 2987 nextCPPos = fText->moveIndex32(pos, 1); 2988 nextPos = nextCPPos; 2989 2990 // Rule LB2 - Break at end of text. 2991 if (pos >= fText->length()) { 2992 break; 2993 } 2994 2995 // Rule LB 9 - adjust for combining sequences. 2996 // We do this one out-of-order because the adjustment does not change anything 2997 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to 2998 // be applied. 2999 rule9Adjust(prevPos, &prevChar, &pos, &thisChar); 3000 nextCPPos = nextPos = fText->moveIndex32(pos, 1); 3001 c = fText->char32At(nextPos); 3002 rule9Adjust(pos, &thisChar, &nextPos, &c); 3003 3004 // If the loop is still warming up - if we haven't shifted the initial 3005 // -1 positions out of prevPos yet - loop back to advance the 3006 // position in the input without any further looking for breaks. 
3007 if (prevPos == -1) { 3008 continue; 3009 } 3010 3011 // LB 4 Always break after hard line breaks, 3012 if (fBK->contains(prevChar)) { 3013 break; 3014 } 3015 3016 // LB 5 Break after CR, LF, NL, but not inside CR LF 3017 if (prevChar == 0x0d && thisChar == 0x0a) { 3018 continue; 3019 } 3020 if (prevChar == 0x0d || 3021 prevChar == 0x0a || 3022 prevChar == 0x85) { 3023 break; 3024 } 3025 3026 // LB 6 Don't break before hard line breaks 3027 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || 3028 fBK->contains(thisChar)) { 3029 continue; 3030 } 3031 3032 3033 // LB 7 Don't break before spaces or zero-width space. 3034 if (fSP->contains(thisChar)) { 3035 continue; 3036 } 3037 3038 if (fZW->contains(thisChar)) { 3039 continue; 3040 } 3041 3042 // LB 8 Break after zero width space 3043 if (fZW->contains(prevChar)) { 3044 break; 3045 } 3046 3047 // LB 9, 10 Already done, at top of loop. 3048 // 3049 3050 3051 // LB 11 Do not break before or after WORD JOINER and related characters. 3052 // x WJ 3053 // WJ x 3054 // 3055 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) { 3056 continue; 3057 } 3058 3059 // LB 12 3060 // GL x 3061 if (fGL->contains(prevChar)) { 3062 continue; 3063 } 3064 3065 // LB 12a 3066 // [^SP BA HY] x GL 3067 if (!(fSP->contains(prevChar) || 3068 fBA->contains(prevChar) || 3069 fHY->contains(prevChar) ) && fGL->contains(thisChar)) { 3070 continue; 3071 } 3072 3073 3074 3075 // LB 13 Don't break before closings. 3076 // NU x CL, NU x CP and NU x IS are not matched here so that they will 3077 // fall into LB 17 and the more general number regular expression. 
3078 // 3079 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) || 3080 (!fNU->contains(prevChar) && fCP->contains(thisChar)) || 3081 fEX->contains(thisChar) || 3082 (!fNU->contains(prevChar) && fIS->contains(thisChar)) || 3083 (!fNU->contains(prevChar) && fSY->contains(thisChar))) { 3084 continue; 3085 } 3086 3087 // LB 14 Don't break after OP SP* 3088 // Scan backwards, checking for this sequence. 3089 // The OP char could include combining marks, so we actually check for 3090 // OP CM* SP* 3091 // Another Twist: The Rule 67 fixes may have changed a SP CM 3092 // sequence into a ID char, so before scanning back through spaces, 3093 // verify that prevChar is indeed a space. The prevChar variable 3094 // may differ from fText[prevPos] 3095 tPos = prevPos; 3096 if (fSP->contains(prevChar)) { 3097 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3098 tPos=fText->moveIndex32(tPos, -1); 3099 } 3100 } 3101 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3102 tPos=fText->moveIndex32(tPos, -1); 3103 } 3104 if (fOP->contains(fText->char32At(tPos))) { 3105 continue; 3106 } 3107 3108 3109 // LB 15 QU SP* x OP 3110 if (fOP->contains(thisChar)) { 3111 // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 3112 int tPos = prevPos; 3113 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3114 tPos = fText->moveIndex32(tPos, -1); 3115 } 3116 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3117 tPos = fText->moveIndex32(tPos, -1); 3118 } 3119 if (fQU->contains(fText->char32At(tPos))) { 3120 continue; 3121 } 3122 } 3123 3124 3125 3126 // LB 16 (CL | CP) SP* x NS 3127 // Scan backwards for SP* CM* (CL | CP) 3128 if (fNS->contains(thisChar)) { 3129 int tPos = prevPos; 3130 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3131 tPos = fText->moveIndex32(tPos, -1); 3132 } 3133 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3134 tPos = fText->moveIndex32(tPos, -1); 3135 } 3136 if 
(fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) { 3137 continue; 3138 } 3139 } 3140 3141 3142 // LB 17 B2 SP* x B2 3143 if (fB2->contains(thisChar)) { 3144 // Scan backwards, checking for the B2 CM* SP* sequence. 3145 tPos = prevPos; 3146 if (fSP->contains(prevChar)) { 3147 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3148 tPos=fText->moveIndex32(tPos, -1); 3149 } 3150 } 3151 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3152 tPos=fText->moveIndex32(tPos, -1); 3153 } 3154 if (fB2->contains(fText->char32At(tPos))) { 3155 continue; 3156 } 3157 } 3158 3159 3160 // LB 18 break after space 3161 if (fSP->contains(prevChar)) { 3162 break; 3163 } 3164 3165 // LB 19 3166 // x QU 3167 // QU x 3168 if (fQU->contains(thisChar) || fQU->contains(prevChar)) { 3169 continue; 3170 } 3171 3172 // LB 20 Break around a CB 3173 if (fCB->contains(thisChar) || fCB->contains(prevChar)) { 3174 break; 3175 } 3176 3177 // LB 21 3178 if (fBA->contains(thisChar) || 3179 fHY->contains(thisChar) || 3180 fNS->contains(thisChar) || 3181 fBB->contains(prevChar) ) { 3182 continue; 3183 } 3184 3185 // LB 21a 3186 // HL (HY | BA) x 3187 if (fHL->contains(prevCharX2) && 3188 (fHY->contains(prevChar) || fBA->contains(prevChar))) { 3189 continue; 3190 } 3191 3192 // LB 21b 3193 // SY x HL 3194 if (fSY->contains(prevChar) && fHL->contains(thisChar)) { 3195 continue; 3196 } 3197 3198 // LB 22 3199 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) || 3200 (fHL->contains(prevChar) && fIN->contains(thisChar)) || 3201 (fID->contains(prevChar) && fIN->contains(thisChar)) || 3202 (fIN->contains(prevChar) && fIN->contains(thisChar)) || 3203 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) { 3204 continue; 3205 } 3206 3207 3208 // LB 23 ID x PO 3209 // AL x NU 3210 // HL x NU 3211 // NU x AL 3212 if ((fID->contains(prevChar) && fPO->contains(thisChar)) || 3213 (fAL->contains(prevChar) && fNU->contains(thisChar)) || 3214 (fHL->contains(prevChar) 
&& fNU->contains(thisChar)) || 3215 (fNU->contains(prevChar) && fAL->contains(thisChar)) || 3216 (fNU->contains(prevChar) && fHL->contains(thisChar)) ) { 3217 continue; 3218 } 3219 3220 // LB 24 Do not break between prefix and letters or ideographs. 3221 // PR x ID 3222 // PR x (AL | HL) 3223 // PO x (AL | HL) 3224 if ((fPR->contains(prevChar) && fID->contains(thisChar)) || 3225 (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) || 3226 (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar)))) { 3227 continue; 3228 } 3229 3230 3231 3232 // LB 25 Numbers 3233 if (fNumberMatcher->lookingAt(prevPos, status)) { 3234 if (U_FAILURE(status)) { 3235 break; 3236 } 3237 // Matched a number. But could have been just a single digit, which would 3238 // not represent a "no break here" between prevChar and thisChar 3239 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num 3240 if (numEndIdx > pos) { 3241 // Number match includes at least our two chars being checked 3242 if (numEndIdx > nextPos) { 3243 // Number match includes additional chars. Update pos and nextPos 3244 // so that next loop iteration will continue at the end of the number, 3245 // checking for breaks between last char in number & whatever follows. 3246 pos = nextPos = numEndIdx; 3247 do { 3248 pos = fText->moveIndex32(pos, -1); 3249 thisChar = fText->char32At(pos); 3250 } while (fCM->contains(thisChar)); 3251 } 3252 continue; 3253 } 3254 } 3255 3256 3257 // LB 26 Do not break a Korean syllable. 
3258 if (fJL->contains(prevChar) && (fJL->contains(thisChar) || 3259 fJV->contains(thisChar) || 3260 fH2->contains(thisChar) || 3261 fH3->contains(thisChar))) { 3262 continue; 3263 } 3264 3265 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) && 3266 (fJV->contains(thisChar) || fJT->contains(thisChar))) { 3267 continue; 3268 } 3269 3270 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) && 3271 fJT->contains(thisChar)) { 3272 continue; 3273 } 3274 3275 // LB 27 Treat a Korean Syllable Block the same as ID. 3276 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3277 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3278 fIN->contains(thisChar)) { 3279 continue; 3280 } 3281 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3282 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3283 fPO->contains(thisChar)) { 3284 continue; 3285 } 3286 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) || 3287 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) { 3288 continue; 3289 } 3290 3291 3292 3293 // LB 28 Do not break between alphabetics ("at"). 3294 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3295 continue; 3296 } 3297 3298 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g."). 3299 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3300 continue; 3301 } 3302 3303 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. 
3304 // (AL | NU) x OP 3305 // CP x (AL | NU) 3306 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) { 3307 continue; 3308 } 3309 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) { 3310 continue; 3311 } 3312 3313 // LB30a Do not break between regional indicators. 3314 // RI x RI 3315 if (fRI->contains(prevChar) && fRI->contains(thisChar)) { 3316 continue; 3317 } 3318 3319 // LB 31 Break everywhere else 3320 break; 3321 3322 } 3323 3324 return pos; 3325} 3326 3327 3328UVector *RBBILineMonkey::charClasses() { 3329 return fSets; 3330} 3331 3332 3333RBBILineMonkey::~RBBILineMonkey() { 3334 delete fSets; 3335 3336 delete fBK; 3337 delete fCR; 3338 delete fLF; 3339 delete fCM; 3340 delete fNL; 3341 delete fWJ; 3342 delete fZW; 3343 delete fGL; 3344 delete fCB; 3345 delete fSP; 3346 delete fB2; 3347 delete fBA; 3348 delete fBB; 3349 delete fHY; 3350 delete fH2; 3351 delete fH3; 3352 delete fCL; 3353 delete fCP; 3354 delete fEX; 3355 delete fIN; 3356 delete fJL; 3357 delete fJV; 3358 delete fJT; 3359 delete fNS; 3360 delete fOP; 3361 delete fQU; 3362 delete fIS; 3363 delete fNU; 3364 delete fPO; 3365 delete fPR; 3366 delete fSY; 3367 delete fAI; 3368 delete fAL; 3369 delete fCJ; 3370 delete fHL; 3371 delete fID; 3372 delete fRI; 3373 delete fSA; 3374 delete fSG; 3375 delete fXX; 3376 3377 delete fCharBI; 3378 delete fNumberMatcher; 3379} 3380 3381 3382//------------------------------------------------------------------------------------------- 3383// 3384// TestMonkey 3385// 3386// params 3387// seed=nnnnn Random number starting seed. 3388// Setting the seed allows errors to be reproduced. 3389// loop=nnn Looping count. Controls running time. 3390// -1: run forever. 3391// 0 or greater: run length. 
3392// 3393// type = char | word | line | sent | title 3394// 3395//------------------------------------------------------------------------------------------- 3396 3397static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 3398 int32_t val = defaultVal; 3399 name.append(" *= *(-?\\d+)"); 3400 UErrorCode status = U_ZERO_ERROR; 3401 RegexMatcher m(name, params, 0, status); 3402 if (m.find()) { 3403 // The param exists. Convert the string to an int. 3404 char valString[100]; 3405 int32_t paramLength = m.end(1, status) - m.start(1, status); 3406 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 3407 paramLength = (int32_t)(sizeof(valString)-2); 3408 } 3409 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 3410 val = strtol(valString, NULL, 10); 3411 3412 // Delete this parameter from the params string. 3413 m.reset(); 3414 params = m.replaceFirst("", status); 3415 } 3416 U_ASSERT(U_SUCCESS(status)); 3417 return val; 3418} 3419#endif 3420 3421#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3422static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 3423 BreakIterator *bi, 3424 int expected[], 3425 int expectedcount) 3426{ 3427 int count = 0; 3428 int i = 0; 3429 int forward[50]; 3430 bi->setText(ustr); 3431 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3432 forward[count] = i; 3433 if (count < expectedcount && expected[count] != i) { 3434 test->errln("break forward test failed: expected %d but got %d", 3435 expected[count], i); 3436 break; 3437 } 3438 count ++; 3439 } 3440 if (count != expectedcount) { 3441 printStringBreaks(ustr, expected, expectedcount); 3442 test->errln("break forward test failed: missed %d match", 3443 expectedcount - count); 3444 return; 3445 } 3446 // testing boundaries 3447 for (i = 1; i < expectedcount; i ++) { 3448 int j = expected[i - 1]; 3449 if (!bi->isBoundary(j)) { 3450 printStringBreaks(ustr, expected, expectedcount); 3451 test->errln("isBoundary() 
failed. Expected boundary at position %d", j); 3452 return; 3453 } 3454 for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 3455 if (bi->isBoundary(j)) { 3456 printStringBreaks(ustr, expected, expectedcount); 3457 test->errln("isBoundary() failed. Not expecting boundary at position %d", j); 3458 return; 3459 } 3460 } 3461 } 3462 3463 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3464 count --; 3465 if (forward[count] != i) { 3466 printStringBreaks(ustr, expected, expectedcount); 3467 test->errln("happy break test previous() failed: expected %d but got %d", 3468 forward[count], i); 3469 break; 3470 } 3471 } 3472 if (count != 0) { 3473 printStringBreaks(ustr, expected, expectedcount); 3474 test->errln("break test previous() failed: missed a match"); 3475 return; 3476 } 3477 3478 // testing preceding 3479 for (i = 0; i < expectedcount - 1; i ++) { 3480 // int j = expected[i] + 1; 3481 int j = ustr.moveIndex32(expected[i], 1); 3482 for (; j <= expected[i + 1]; j ++) { 3483 if (bi->preceding(j) != expected[i]) { 3484 printStringBreaks(ustr, expected, expectedcount); 3485 test->errln("preceding(): Not expecting boundary at position %d", j); 3486 return; 3487 } 3488 } 3489 } 3490} 3491#endif 3492 3493void RBBITest::TestWordBreaks(void) 3494{ 3495#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3496 3497 Locale locale("en"); 3498 UErrorCode status = U_ZERO_ERROR; 3499 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3500 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3501 // Replaced any C+J characters in a row with a random sequence of characters 3502 // of the same length to make our C+J segmentation not get in the way. 
3503 static const char *strlist[] = 3504 { 3505 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 3506 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", 3507 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 3508 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 3509 "\\uac00\\u3588\\u009c\\u0953\\u194b", 3510 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3511 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 3512 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", 3513 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3514 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3515 "\\u2027\\U000e0067\\u0a47\\u00b7", 3516 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3517 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3518 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3519 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", 3520 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3521 "\\u0027\\u11af\\U000e0057\\u0602", 3522 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3523 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3524 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3525 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3526 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3527 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 3528 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3529 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3530 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3531 "\\u18f4\\U000e0049\\u20e7\\u2027", 3532 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3533 "\\ua183\\u102d\\u0bec\\u003a", 3534 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3535 "\\u003a\\u0e57\\u0fad\\u002e", 3536 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3537 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3538 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 3539 
"\\u003a\\u0664\\u00b7\\u1fba", 3540 "\\u003b\\u0027\\u00b7\\u47a3", 3541 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", 3542 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 3543 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 3544 }; 3545 int loop; 3546 if (U_FAILURE(status)) { 3547 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3548 return; 3549 } 3550 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3551 // printf("looping %d\n", loop); 3552 UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 3553 // RBBICharMonkey monkey; 3554 RBBIWordMonkey monkey; 3555 3556 int expected[50]; 3557 int expectedcount = 0; 3558 3559 monkey.setText(ustr); 3560 int i; 3561 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3562 expected[expectedcount ++] = i; 3563 } 3564 3565 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3566 } 3567 delete bi; 3568#endif 3569} 3570 3571void RBBITest::TestWordBoundary(void) 3572{ 3573 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 3574 Locale locale("en"); 3575 UErrorCode status = U_ZERO_ERROR; 3576 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3577 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3578 UChar str[50]; 3579 static const char *strlist[] = 3580 { 3581 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3582 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3583 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3584 "\\u2027\\U000e0067\\u0a47\\u00b7", 3585 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3586 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3587 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3588 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 3589 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3590 "\\u0027\\u11af\\U000e0057\\u0602", 3591 
"\\U0001d7f2\\U000e007\\u0004\\u0589", 3592 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3593 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3594 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3595 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3596 "\\U000e0065\\u302c\\u09ee\\U000e0068", 3597 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3598 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3599 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3600 "\\u58f4\\U000e0049\\u20e7\\u2027", 3601 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3602 "\\ua183\\u102d\\u0bec\\u003a", 3603 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3604 "\\u003a\\u0e57\\u0fad\\u002e", 3605 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3606 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3607 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 3608 "\\u003a\\u0664\\u00b7\\u1fba", 3609 "\\u003b\\u0027\\u00b7\\u47a3", 3610 }; 3611 int loop; 3612 if (U_FAILURE(status)) { 3613 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3614 return; 3615 } 3616 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3617 // printf("looping %d\n", loop); 3618 u_unescape(strlist[loop], str, 20); 3619 UnicodeString ustr(str); 3620 int forward[50]; 3621 int count = 0; 3622 3623 bi->setText(ustr); 3624 int prev = 0; 3625 int i; 3626 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3627 forward[count ++] = i; 3628 if (i > prev) { 3629 int j; 3630 for (j = prev + 1; j < i; j ++) { 3631 if (bi->isBoundary(j)) { 3632 printStringBreaks(ustr, forward, count); 3633 errln("happy boundary test failed: expected %d not a boundary", 3634 j); 3635 return; 3636 } 3637 } 3638 } 3639 if (!bi->isBoundary(i)) { 3640 printStringBreaks(ustr, forward, count); 3641 errln("happy boundary test failed: expected %d a boundary", 3642 i); 3643 return; 3644 } 3645 prev = i; 3646 } 3647 } 3648 delete 
bi; 3649} 3650 3651void RBBITest::TestLineBreaks(void) 3652{ 3653#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3654 Locale locale("en"); 3655 UErrorCode status = U_ZERO_ERROR; 3656 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 3657 const int32_t STRSIZE = 50; 3658 UChar str[STRSIZE]; 3659 static const char *strlist[] = 3660 { 3661 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 3662 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 3663 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 3664 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 3665 "u2014\\U000e0105\\u118c\\u000a\\u07f8", 3666 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 3667 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3668 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 3669 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3670 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 3671 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", 3672 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 3673 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 3674 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 3675 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 3676 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 3677 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 3678 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 3679 
"\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 3680 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 3681 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 3682 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 3683 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 3684 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 3685 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 3686 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 3687 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", 3688 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 3689 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 3690 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 3691 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 3692 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 3693 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", 3694 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 3695 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 3696 "\\u2014\\u0020\\u000a\\u17c5\\u24fc", 3697 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 3698 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 3699 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 3700 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 3701 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 3702 
"\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 3703 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" 3704 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" 3705 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", 3706 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 3707 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 3708 }; 3709 int loop; 3710 TEST_ASSERT_SUCCESS(status); 3711 if (U_FAILURE(status)) { 3712 return; 3713 } 3714 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3715 // printf("looping %d\n", loop); 3716 int32_t t = u_unescape(strlist[loop], str, STRSIZE); 3717 if (t >= STRSIZE) { 3718 TEST_ASSERT(FALSE); 3719 continue; 3720 } 3721 3722 3723 UnicodeString ustr(str); 3724 RBBILineMonkey monkey; 3725 if (U_FAILURE(monkey.deferredStatus)) { 3726 continue; 3727 } 3728 3729 const int EXPECTEDSIZE = 50; 3730 int expected[EXPECTEDSIZE]; 3731 int expectedcount = 0; 3732 3733 monkey.setText(ustr); 3734 int i; 3735 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3736 if (expectedcount >= EXPECTEDSIZE) { 3737 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3738 return; 3739 } 3740 expected[expectedcount ++] = i; 3741 } 3742 3743 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3744 } 3745 delete bi; 3746#endif 3747} 3748 3749void RBBITest::TestSentBreaks(void) 3750{ 3751#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3752 Locale locale("en"); 3753 UErrorCode status = U_ZERO_ERROR; 3754 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 3755 UChar str[200]; 3756 static const char *strlist[] = 3757 { 3758 "Now\ris\nthe\r\ntime\n\rfor\r\r", 3759 "This\n", 3760 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", 3761 "\"Sentence ending with a quote.\" Bye.", 3762 " (This is it). Testing the sentence iterator. \"This isn't it.\"", 3763 "Hi! This is a simple sample sentence. 
(This is it.) This is a simple sample sentence. \"This isn't it.\"", 3764 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 3765 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 3766 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 3767 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!", 3768 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 3769 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 3770 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 3771 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 3772 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 3773 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 3774 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 3775 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 3776 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 3777 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 3778 }; 3779 int loop; 3780 if (U_FAILURE(status)) { 3781 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3782 return; 3783 } 3784 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3785 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0]))); 3786 UnicodeString ustr(str); 3787 3788 RBBISentMonkey monkey; 3789 if (U_FAILURE(monkey.deferredStatus)) { 3790 continue; 3791 } 3792 3793 const int EXPECTEDSIZE = 50; 3794 int expected[EXPECTEDSIZE]; 3795 int expectedcount = 0; 3796 3797 monkey.setText(ustr); 3798 int i; 3799 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3800 if (expectedcount >= EXPECTEDSIZE) { 3801 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3802 return; 3803 } 3804 expected[expectedcount ++] 
= i; 3805 } 3806 3807 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3808 } 3809 delete bi; 3810#endif 3811} 3812 3813void RBBITest::TestMonkey(char *params) { 3814#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3815 3816 UErrorCode status = U_ZERO_ERROR; 3817 int32_t loopCount = 500; 3818 int32_t seed = 1; 3819 UnicodeString breakType = "all"; 3820 Locale locale("en"); 3821 UBool useUText = FALSE; 3822 3823 if (quick == FALSE) { 3824 loopCount = 10000; 3825 } 3826 3827 if (params) { 3828 UnicodeString p(params); 3829 loopCount = getIntParam("loop", p, loopCount); 3830 seed = getIntParam("seed", p, seed); 3831 3832 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 3833 if (m.find()) { 3834 breakType = m.group(1, status); 3835 m.reset(); 3836 p = m.replaceFirst("", status); 3837 } 3838 3839 RegexMatcher u(" *utext", p, 0, status); 3840 if (u.find()) { 3841 useUText = TRUE; 3842 u.reset(); 3843 p = u.replaceFirst("", status); 3844 } 3845 3846 3847 // m.reset(p); 3848 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 3849 // Each option is stripped out of the option string as it is processed. 3850 // All options have been checked. The option string should have been completely emptied.. 
3851 char buf[100]; 3852 p.extract(buf, sizeof(buf), NULL, status); 3853 buf[sizeof(buf)-1] = 0; 3854 errln("Unrecognized or extra parameter: %s\n", buf); 3855 return; 3856 } 3857 3858 } 3859 3860 if (breakType == "char" || breakType == "all") { 3861 RBBICharMonkey m; 3862 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3863 if (U_SUCCESS(status)) { 3864 RunMonkey(bi, m, "char", seed, loopCount, useUText); 3865 if (breakType == "all" && useUText==FALSE) { 3866 // Also run a quick test with UText when "all" is specified 3867 RunMonkey(bi, m, "char", seed, loopCount, TRUE); 3868 } 3869 } 3870 else { 3871 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 3872 } 3873 delete bi; 3874 } 3875 3876 if (breakType == "word" || breakType == "all") { 3877 logln("Word Break Monkey Test"); 3878 RBBIWordMonkey m; 3879 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3880 if (U_SUCCESS(status)) { 3881 RunMonkey(bi, m, "word", seed, loopCount, useUText); 3882 } 3883 else { 3884 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status)); 3885 } 3886 delete bi; 3887 } 3888 3889 if (breakType == "line" || breakType == "all") { 3890 logln("Line Break Monkey Test"); 3891 RBBILineMonkey m; 3892 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 3893 if (loopCount >= 10) { 3894 loopCount = loopCount / 5; // Line break runs slower than the others. 
3895 } 3896 if (U_SUCCESS(status)) { 3897 RunMonkey(bi, m, "line", seed, loopCount, useUText); 3898 } 3899 else { 3900 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 3901 } 3902 delete bi; 3903 } 3904 3905 if (breakType == "sent" || breakType == "all" ) { 3906 logln("Sentence Break Monkey Test"); 3907 RBBISentMonkey m; 3908 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 3909 if (loopCount >= 10) { 3910 loopCount = loopCount / 10; // Sentence runs slower than the other break types 3911 } 3912 if (U_SUCCESS(status)) { 3913 RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 3914 } 3915 else { 3916 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 3917 } 3918 delete bi; 3919 } 3920 3921#endif 3922} 3923 3924// 3925// Run a RBBI monkey test. Common routine, for all break iterator types. 3926// Parameters: 3927// bi - the break iterator to use 3928// mk - MonkeyKind, abstraction for obtaining expected results 3929// name - Name of test (char, word, etc.) 
for use in error messages 3930// seed - Seed for starting random number generator (parameter from user) 3931// numIterations 3932// 3933void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 3934 int32_t numIterations, UBool useUText) { 3935 3936#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3937 3938 const int32_t TESTSTRINGLEN = 500; 3939 UnicodeString testText; 3940 int32_t numCharClasses; 3941 UVector *chClasses; 3942 int expected[TESTSTRINGLEN*2 + 1]; 3943 int expectedCount = 0; 3944 char expectedBreaks[TESTSTRINGLEN*2 + 1]; 3945 char forwardBreaks[TESTSTRINGLEN*2 + 1]; 3946 char reverseBreaks[TESTSTRINGLEN*2+1]; 3947 char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 3948 char followingBreaks[TESTSTRINGLEN*2+1]; 3949 char precedingBreaks[TESTSTRINGLEN*2+1]; 3950 int i; 3951 int loopCount = 0; 3952 3953 m_seed = seed; 3954 3955 numCharClasses = mk.charClasses()->size(); 3956 chClasses = mk.charClasses(); 3957 3958 // Check for errors that occured during the construction of the MonkeyKind object. 3959 // Can't report them where they occured because errln() is a method coming from intlTest, 3960 // and is not visible outside of RBBITest :-( 3961 if (U_FAILURE(mk.deferredStatus)) { 3962 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 3963 return; 3964 } 3965 3966 // Verify that the character classes all have at least one member. 3967 for (i=0; i<numCharClasses; i++) { 3968 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 3969 if (s == NULL || s->size() == 0) { 3970 errln("Character Class #%d is null or of zero size.", i); 3971 return; 3972 } 3973 } 3974 3975 while (loopCount < numIterations || numIterations == -1) { 3976 if (numIterations == -1 && loopCount % 10 == 0) { 3977 // If test is running in an infinite loop, display a periodic tic so 3978 // we can tell that it is making progress. 
3979 fprintf(stderr, "."); 3980 } 3981 // Save current random number seed, so that we can recreate the random numbers 3982 // for this loop iteration in event of an error. 3983 seed = m_seed; 3984 3985 // Populate a test string with data. 3986 testText.truncate(0); 3987 for (i=0; i<TESTSTRINGLEN; i++) { 3988 int32_t aClassNum = m_rand() % numCharClasses; 3989 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 3990 int32_t charIdx = m_rand() % classSet->size(); 3991 UChar32 c = classSet->charAt(charIdx); 3992 if (c < 0) { // TODO: deal with sets containing strings. 3993 errln("c < 0"); 3994 break; 3995 } 3996 testText.append(c); 3997 } 3998 3999 // Calculate the expected results for this test string. 4000 mk.setText(testText); 4001 memset(expectedBreaks, 0, sizeof(expectedBreaks)); 4002 expectedBreaks[0] = 1; 4003 int32_t breakPos = 0; 4004 expectedCount = 0; 4005 for (;;) { 4006 breakPos = mk.next(breakPos); 4007 if (breakPos == -1) { 4008 break; 4009 } 4010 if (breakPos > testText.length()) { 4011 errln("breakPos > testText.length()"); 4012 } 4013 expectedBreaks[breakPos] = 1; 4014 U_ASSERT(expectedCount<testText.length()); 4015 expected[expectedCount ++] = breakPos; 4016 (void)expected; // Set but not used warning. 4017 // TODO (andy): check it out. 4018 } 4019 4020 // Find the break positions using forward iteration 4021 memset(forwardBreaks, 0, sizeof(forwardBreaks)); 4022 if (useUText) { 4023 UErrorCode status = U_ZERO_ERROR; 4024 UText *testUText = utext_openReplaceable(NULL, &testText, &status); 4025 // testUText = utext_openUnicodeString(testUText, &testText, &status); 4026 bi->setText(testUText, status); 4027 TEST_ASSERT_SUCCESS(status); 4028 utext_close(testUText); // The break iterator does a shallow clone of the UText 4029 // This UText can be closed immediately, so long as the 4030 // testText string continues to exist. 
4031 } else { 4032 bi->setText(testText); 4033 } 4034 4035 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 4036 if (i < 0 || i > testText.length()) { 4037 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4038 break; 4039 } 4040 forwardBreaks[i] = 1; 4041 } 4042 4043 // Find the break positions using reverse iteration 4044 memset(reverseBreaks, 0, sizeof(reverseBreaks)); 4045 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 4046 if (i < 0 || i > testText.length()) { 4047 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4048 break; 4049 } 4050 reverseBreaks[i] = 1; 4051 } 4052 4053 // Find the break positions using isBoundary() tests. 4054 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 4055 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 4056 for (i=0; i<=testText.length(); i++) { 4057 isBoundaryBreaks[i] = bi->isBoundary(i); 4058 } 4059 4060 4061 // Find the break positions using the following() function. 4062 // printf("."); 4063 memset(followingBreaks, 0, sizeof(followingBreaks)); 4064 int32_t lastBreakPos = 0; 4065 followingBreaks[0] = 1; 4066 for (i=0; i<testText.length(); i++) { 4067 breakPos = bi->following(i); 4068 if (breakPos <= i || 4069 breakPos < lastBreakPos || 4070 breakPos > testText.length() || 4071 (breakPos > lastBreakPos && lastBreakPos > i)) { 4072 errln("%s break monkey test: " 4073 "Out of range value returned by BreakIterator::following().\n" 4074 "Random seed=%d index=%d; following returned %d; lastbreak=%d", 4075 name, seed, i, breakPos, lastBreakPos); 4076 break; 4077 } 4078 followingBreaks[breakPos] = 1; 4079 lastBreakPos = breakPos; 4080 } 4081 4082 // Find the break positions using the preceding() function. 
4083 memset(precedingBreaks, 0, sizeof(precedingBreaks)); 4084 lastBreakPos = testText.length(); 4085 precedingBreaks[testText.length()] = 1; 4086 for (i=testText.length(); i>0; i--) { 4087 breakPos = bi->preceding(i); 4088 if (breakPos >= i || 4089 breakPos > lastBreakPos || 4090 (breakPos < 0 && testText.getChar32Start(i)>0) || 4091 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) { 4092 errln("%s break monkey test: " 4093 "Out of range value returned by BreakIterator::preceding().\n" 4094 "index=%d; prev returned %d; lastBreak=%d" , 4095 name, i, breakPos, lastBreakPos); 4096 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 4097 precedingBreaks[i] = 2; // Forces an error. 4098 } 4099 } else { 4100 if (breakPos >= 0) { 4101 precedingBreaks[breakPos] = 1; 4102 } 4103 lastBreakPos = breakPos; 4104 } 4105 } 4106 4107 // Compare the expected and actual results. 4108 for (i=0; i<=testText.length(); i++) { 4109 const char *errorType = NULL; 4110 if (forwardBreaks[i] != expectedBreaks[i]) { 4111 errorType = "next()"; 4112 } else if (reverseBreaks[i] != forwardBreaks[i]) { 4113 errorType = "previous()"; 4114 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 4115 errorType = "isBoundary()"; 4116 } else if (followingBreaks[i] != expectedBreaks[i]) { 4117 errorType = "following()"; 4118 } else if (precedingBreaks[i] != expectedBreaks[i]) { 4119 errorType = "preceding()"; 4120 } 4121 4122 4123 if (errorType != NULL) { 4124 // Format a range of the test text that includes the failure as 4125 // a data item that can be included in the rbbi test data file. 4126 4127 // Start of the range is the last point where expected and actual results 4128 // both agreed that there was a break position. 
4129 int startContext = i; 4130 int32_t count = 0; 4131 for (;;) { 4132 if (startContext==0) { break; } 4133 startContext --; 4134 if (expectedBreaks[startContext] != 0) { 4135 if (count == 2) break; 4136 count ++; 4137 } 4138 } 4139 4140 // End of range is two expected breaks past the start position. 4141 int endContext = i + 1; 4142 int ci; 4143 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 4144 for (;;) { 4145 if (endContext >= testText.length()) {break;} 4146 if (expectedBreaks[endContext-1] != 0) { 4147 if (count == 0) break; 4148 count --; 4149 } 4150 endContext ++; 4151 } 4152 } 4153 4154 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 4155 UnicodeString errorText = "<data>"; 4156 /***if (strcmp(errorType, "next()") == 0) { 4157 startContext = 0; 4158 endContext = testText.length(); 4159 4160 printStringBreaks(testText, expected, expectedCount); 4161 }***/ 4162 4163 for (ci=startContext; ci<endContext;) { 4164 UnicodeString hexChars("0123456789abcdef"); 4165 UChar32 c; 4166 int bn; 4167 c = testText.char32At(ci); 4168 if (ci == i) { 4169 // This is the location of the error. 4170 errorText.append("<?>"); 4171 } else if (expectedBreaks[ci] != 0) { 4172 // This a non-error expected break position. 
4173 errorText.append("\\"); 4174 } 4175 if (c < 0x10000) { 4176 errorText.append("\\u"); 4177 for (bn=12; bn>=0; bn-=4) { 4178 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4179 } 4180 } else { 4181 errorText.append("\\U"); 4182 for (bn=28; bn>=0; bn-=4) { 4183 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4184 } 4185 } 4186 ci = testText.moveIndex32(ci, 1); 4187 } 4188 errorText.append("\\"); 4189 errorText.append("</data>\n"); 4190 4191 // Output the error 4192 char charErrorTxt[500]; 4193 UErrorCode status = U_ZERO_ERROR; 4194 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 4195 charErrorTxt[sizeof(charErrorTxt)-1] = 0; 4196 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status); 4197 4198 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s", 4199 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), 4200 errorType, seed, i, charErrorTxt); 4201 break; 4202 } 4203 } 4204 4205 loopCount++; 4206 } 4207#endif 4208} 4209 4210 4211// Bug 5532. UTF-8 based UText fails in dictionary code. 4212// This test checks the initial patch, 4213// which is to just keep it from crashing. Correct word boundaries 4214// await a proper fix to the dictionary code. 4215// 4216void RBBITest::TestBug5532(void) { 4217 // Text includes a mixture of Thai and Latin. 
4218 const unsigned char utf8Data[] = { 4219 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u, 4220 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, 4221 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u, 4222 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 4223 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u, 4224 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, 4225 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, 4226 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, 4227 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 4228 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, 4229 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00}; 4230 4231 UErrorCode status = U_ZERO_ERROR; 4232 UText utext=UTEXT_INITIALIZER; 4233 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status); 4234 TEST_ASSERT_SUCCESS(status); 4235 4236 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status); 4237 TEST_ASSERT_SUCCESS(status); 4238 if (U_SUCCESS(status)) { 4239 bi->setText(&utext, status); 4240 TEST_ASSERT_SUCCESS(status); 4241 4242 int32_t breakCount = 0; 4243 int32_t previousBreak = -1; 4244 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) { 4245 // For now, just make sure that the break iterator doesn't hang. 
4246 TEST_ASSERT(previousBreak < bi->current()); 4247 previousBreak = bi->current(); 4248 } 4249 TEST_ASSERT(breakCount > 0); 4250 } 4251 delete bi; 4252 utext_close(&utext); 4253} 4254 4255 4256void RBBITest::TestBug9983(void) { 4257 UnicodeString text = UnicodeString("\\u002A" // * Other 4258 "\\uFF65" // Other 4259 "\\u309C" // Katakana 4260 "\\uFF9F" // Extend 4261 "\\uFF65" // Other 4262 "\\u0020" // Other 4263 "\\u0000").unescape(); 4264 4265 UErrorCode status = U_ZERO_ERROR; 4266 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>( 4267 BreakIterator::createWordInstance(Locale::getRoot(), status))); 4268 TEST_ASSERT_SUCCESS(status); 4269 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>( 4270 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status))); 4271 TEST_ASSERT_SUCCESS(status); 4272 if (U_FAILURE(status)) { 4273 return; 4274 } 4275 int32_t offset, rstatus, iterationCount; 4276 4277 brkiter->setText(text); 4278 brkiter->last(); 4279 iterationCount = 0; 4280 while ( (offset = brkiter->previous()) != UBRK_DONE ) { 4281 iterationCount++; 4282 rstatus = brkiter->getRuleStatus(); 4283 (void)rstatus; // Suppress set but not used warning. 4284 if (iterationCount >= 10) { 4285 break; 4286 } 4287 } 4288 TEST_ASSERT(iterationCount == 6); 4289 4290 brkiterPOSIX->setText(text); 4291 brkiterPOSIX->last(); 4292 iterationCount = 0; 4293 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) { 4294 iterationCount++; 4295 rstatus = brkiterPOSIX->getRuleStatus(); 4296 (void)rstatus; // Suppress set but not used warning. 4297 if (iterationCount >= 10) { 4298 break; 4299 } 4300 } 4301 TEST_ASSERT(iterationCount == 6); 4302} 4303 4304 4305// 4306// TestDebug - A place-holder test for debugging purposes. 4307// For putting in fragments of other tests that can be invoked 4308// for tracing without a lot of unwanted extra stuff happening. 
4309// 4310void RBBITest::TestDebug(void) { 4311#if 0 4312 UErrorCode status = U_ZERO_ERROR; 4313 int pos = 0; 4314 int ruleStatus = 0; 4315 4316 RuleBasedBreakIterator* bi = 4317 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 4318 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status); 4319 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 4320 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e"); 4321 // UnicodeString s("Aaa. Bcd"); 4322 s = s.unescape(); 4323 bi->setText(s); 4324 UBool r = bi->isBoundary(8); 4325 printf("%s", r?"true":"false"); 4326 return; 4327 pos = bi->last(); 4328 do { 4329 // ruleStatus = bi->getRuleStatus(); 4330 printf("%d\t%d\n", pos, ruleStatus); 4331 pos = bi->previous(); 4332 } while (pos != BreakIterator::DONE); 4333#endif 4334} 4335 4336void RBBITest::TestProperties() { 4337 UErrorCode errorCode = U_ZERO_ERROR; 4338 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode); 4339 if (!prependSet.isEmpty()) { 4340 errln( 4341 "[:GCB=Prepend:] is not empty any more. " 4342 "Uncomment relevant lines in source/data/brkitr/char.txt and " 4343 "change this test to the opposite condition."); 4344 } 4345} 4346 4347#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 4348