1/* 2******************************************************************************** 3* Copyright (C) 1999-2014 International Business Machines Corporation and 4* others. All Rights Reserved. 5******************************************************************************** 6* Date Name Description 7* 10/20/99 alan Creation. 8* 03/22/2000 Madhu Added additional tests 9******************************************************************************** 10*/ 11 12#include <stdio.h> 13 14#include <string.h> 15#include "unicode/utypes.h" 16#include "usettest.h" 17#include "unicode/ucnv.h" 18#include "unicode/uniset.h" 19#include "unicode/uchar.h" 20#include "unicode/usetiter.h" 21#include "unicode/ustring.h" 22#include "unicode/parsepos.h" 23#include "unicode/symtable.h" 24#include "unicode/uversion.h" 25#include "hash.h" 26 27#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \ 28 dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \ 29 u_errorName(status));}} 30 31#define TEST_ASSERT(expr) {if (!(expr)) { \ 32 dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }} 33 34UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) { 35 UnicodeString pat; 36 set.toPattern(pat); 37 return left + UnicodeSetTest::escape(pat); 38} 39 40#define CASE(id,test) case id: \ 41 name = #test; \ 42 if (exec) { \ 43 logln(#test "---"); \ 44 logln(); \ 45 test(); \ 46 } \ 47 break 48 49UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) { 50} 51 52UConverter *UnicodeSetTest::openUTF8Converter() { 53 if(utf8Cnv==NULL) { 54 UErrorCode errorCode=U_ZERO_ERROR; 55 utf8Cnv=ucnv_open("UTF-8", &errorCode); 56 } 57 return utf8Cnv; 58} 59 60UnicodeSetTest::~UnicodeSetTest() { 61 ucnv_close(utf8Cnv); 62} 63 64void 65UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, 66 const char* &name, char* /*par*/) { 67 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest"); 68 switch (index) { 69 CASE(0,TestPatterns); 70 CASE(1,TestAddRemove); 71 
CASE(2,TestCategories); 72 CASE(3,TestCloneEqualHash); 73 CASE(4,TestMinimalRep); 74 CASE(5,TestAPI); 75 CASE(6,TestScriptSet); 76 CASE(7,TestPropertySet); 77 CASE(8,TestClone); 78 CASE(9,TestExhaustive); 79 CASE(10,TestToPattern); 80 CASE(11,TestIndexOf); 81 CASE(12,TestStrings); 82 CASE(13,Testj2268); 83 CASE(14,TestCloseOver); 84 CASE(15,TestEscapePattern); 85 CASE(16,TestInvalidCodePoint); 86 CASE(17,TestSymbolTable); 87 CASE(18,TestSurrogate); 88 CASE(19,TestPosixClasses); 89 CASE(20,TestIteration); 90 CASE(21,TestFreezable); 91 CASE(22,TestSpan); 92 CASE(23,TestStringSpan); 93 default: name = ""; break; 94 } 95} 96 97static const char NOT[] = "%%%%"; 98 99/** 100 * UVector was improperly copying contents 101 * This code will crash this is still true 102 */ 103void UnicodeSetTest::Testj2268() { 104 UnicodeSet t; 105 t.add(UnicodeString("abc")); 106 UnicodeSet test(t); 107 UnicodeString ustrPat; 108 test.toPattern(ustrPat, TRUE); 109} 110 111/** 112 * Test toPattern(). 113 */ 114void UnicodeSetTest::TestToPattern() { 115 UErrorCode ec = U_ZERO_ERROR; 116 117 // Test that toPattern() round trips with syntax characters and 118 // whitespace. 119 { 120 static const char* OTHER_TOPATTERN_TESTS[] = { 121 "[[:latin:]&[:greek:]]", 122 "[[:latin:]-[:greek:]]", 123 "[:nonspacing mark:]", 124 NULL 125 }; 126 127 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) { 128 ec = U_ZERO_ERROR; 129 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec); 130 if (U_FAILURE(ec)) { 131 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec))); 132 continue; 133 } 134 checkPat(OTHER_TOPATTERN_TESTS[j], s); 135 } 136 137 for (UChar32 i = 0; i <= 0x10FFFF; ++i) { 138 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) { 139 140 // check various combinations to make sure they all work. 
141 if (i != 0 && !toPatternAux(i, i)){ 142 continue; 143 } 144 if (!toPatternAux(0, i)){ 145 continue; 146 } 147 if (!toPatternAux(i, 0xFFFF)){ 148 continue; 149 } 150 } 151 } 152 } 153 154 // Test pattern behavior of multicharacter strings. 155 { 156 ec = U_ZERO_ERROR; 157 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec); 158 159 // This loop isn't a loop. It's here to make the compiler happy. 160 // If you're curious, try removing it and changing the 'break' 161 // statements (except for the last) to goto's. 162 for (;;) { 163 if (U_FAILURE(ec)) break; 164 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL}; 165 expectToPattern(*s, "[a-z{aa}{ab}]", exp1); 166 167 s->add("ac"); 168 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL}; 169 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2); 170 171 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec); 172 if (U_FAILURE(ec)) break; 173 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL}; 174 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3); 175 176 s->add("[]"); 177 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL}; 178 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4); 179 180 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec); 181 if (U_FAILURE(ec)) break; 182 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL}; 183 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5); 184 185 // j2189 186 s->clear(); 187 s->add(UnicodeString("abc", "")); 188 s->add(UnicodeString("abc", "")); 189 const char* exp6[] = {"abc", NOT, "ab", NULL}; 190 expectToPattern(*s, "[{abc}]", exp6); 191 192 break; 193 } 194 195 if (U_FAILURE(ec)) errln("FAIL: pattern parse error"); 196 delete s; 197 } 198 199 // JB#3400: For 2 character ranges prefer [ab] to [a-b] 200 UnicodeSet s; 201 s.add((UChar)97, (UChar)98); // 'a', 'b' 202 expectToPattern(s, "[ab]", NULL); 203} 204 205UBool 
UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) { 206 207 // use Integer.toString because Utility.hex doesn't handle ints 208 UnicodeString pat = ""; 209 // TODO do these in hex 210 //String source = "0x" + Integer.toString(start,16).toUpperCase(); 211 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase(); 212 UnicodeString source; 213 source = source + (uint32_t)start; 214 if (start != end) 215 source = source + ".." + (uint32_t)end; 216 UnicodeSet testSet; 217 testSet.add(start, end); 218 return checkPat(source, testSet); 219} 220 221UBool UnicodeSetTest::checkPat(const UnicodeString& source, 222 const UnicodeSet& testSet) { 223 // What we want to make sure of is that a pattern generated 224 // by toPattern(), with or without escaped unprintables, can 225 // be passed back into the UnicodeSet constructor. 226 UnicodeString pat0; 227 228 testSet.toPattern(pat0, TRUE); 229 230 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE; 231 232 //String pat1 = unescapeLeniently(pat0); 233 //if (!checkPat(source + " (in code)", testSet, pat1)) return false; 234 235 UnicodeString pat2; 236 testSet.toPattern(pat2, FALSE); 237 if (!checkPat(source, testSet, pat2)) return FALSE; 238 239 //String pat3 = unescapeLeniently(pat2); 240 // if (!checkPat(source + " (in code)", testSet, pat3)) return false; 241 242 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3); 243 logln((UnicodeString)source + " => " + pat0 + ", " + pat2); 244 return TRUE; 245} 246 247UBool UnicodeSetTest::checkPat(const UnicodeString& source, 248 const UnicodeSet& testSet, 249 const UnicodeString& pat) { 250 UErrorCode ec = U_ZERO_ERROR; 251 UnicodeSet testSet2(pat, ec); 252 if (testSet2 != testSet) { 253 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat); 254 return FALSE; 255 } 256 return TRUE; 257} 258 259void 260UnicodeSetTest::TestPatterns(void) { 261 UnicodeSet set; 262 expectPattern(set, 
UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km"); 263 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz"); 264 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz"); 265 expectPattern(set, UnicodeString("[-az]", ""), "--aazz"); 266 expectPattern(set, UnicodeString("[az-]", ""), "--aazz"); 267 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz"); 268 269 // Throw in a test of complement 270 set.complement(); 271 UnicodeString exp; 272 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF); 273 expectPairs(set, exp); 274} 275 276void 277UnicodeSetTest::TestCategories(void) { 278 UErrorCode status = U_ZERO_ERROR; 279 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:] 280 UnicodeSet set(pat, status); 281 if (U_FAILURE(status)) { 282 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status))); 283 return; 284 } else { 285 expectContainment(set, pat, "ABC", "abc"); 286 } 287 288 UChar32 i; 289 int32_t failures = 0; 290 // Make sure generation of L doesn't pollute cached Lu set 291 // First generate L, then Lu 292 set.applyPattern("[:L:]", status); 293 if (U_FAILURE(status)) { errln("FAIL"); return; } 294 for (i=0; i<0x200; ++i) { 295 UBool l = u_isalpha((UChar)i); 296 if (l != set.contains(i)) { 297 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " + 298 set.contains(i)); 299 if (++failures == 10) break; 300 } 301 } 302 303 set.applyPattern("[:Lu:]", status); 304 if (U_FAILURE(status)) { errln("FAIL"); return; } 305 for (i=0; i<0x200; ++i) { 306 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER); 307 if (lu != set.contains(i)) { 308 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " + 309 set.contains(i)); 310 if (++failures == 20) break; 311 } 312 } 313} 314void 315UnicodeSetTest::TestCloneEqualHash(void) { 316 UErrorCode status = U_ZERO_ERROR; 317 // set1 and set2 used to be built with the 
obsolete constructor taking 318 // UCharCategory values; replaced with pattern constructors 319 // markus 20030502 320 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase 321 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase 322 if (U_FAILURE(status)){ 323 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status))); 324 return; 325 } 326 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit 327 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit 328 if (U_FAILURE(status)){ 329 errln((UnicodeString)"FAIL: Can't construct set with category->Nd"); 330 return; 331 } 332 333 if (*set1 != *set1a) { 334 errln("FAIL: category constructor for Ll broken"); 335 } 336 if (*set2 != *set2a) { 337 errln("FAIL: category constructor for Nd broken"); 338 } 339 delete set1a; 340 delete set2a; 341 342 logln("Testing copy construction"); 343 UnicodeSet *set1copy=new UnicodeSet(*set1); 344 if(*set1 != *set1copy || *set1 == *set2 || 345 getPairs(*set1) != getPairs(*set1copy) || 346 set1->hashCode() != set1copy->hashCode()){ 347 errln("FAIL : Error in copy construction"); 348 return; 349 } 350 351 logln("Testing =operator"); 352 UnicodeSet set1equal=*set1; 353 UnicodeSet set2equal=*set2; 354 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 || 355 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){ 356 errln("FAIL: Error in =operator"); 357 } 358 359 logln("Testing clone()"); 360 UnicodeSet *set1clone=(UnicodeSet*)set1->clone(); 361 UnicodeSet *set2clone=(UnicodeSet*)set2->clone(); 362 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal || 363 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal || 364 *set2clone == *set1 || *set2clone == 
set1equal || *set2clone == *set1clone){ 365 errln("FAIL: Error in clone"); 366 } 367 368 logln("Testing hashcode"); 369 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() || 370 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() || 371 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() || 372 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() || 373 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){ 374 errln("FAIL: Error in hashCode()"); 375 } 376 377 delete set1; 378 delete set1copy; 379 delete set2; 380 delete set1clone; 381 delete set2clone; 382 383 384} 385void 386UnicodeSetTest::TestAddRemove(void) { 387 UnicodeSet set; // Construct empty set 388 doAssert(set.isEmpty() == TRUE, "set should be empty"); 389 doAssert(set.size() == 0, "size should be 0"); 390 set.complement(); 391 doAssert(set.size() == 0x110000, "size should be 0x110000"); 392 set.clear(); 393 set.add(0x0061, 0x007a); 394 expectPairs(set, "az"); 395 doAssert(set.isEmpty() == FALSE, "set should not be empty"); 396 doAssert(set.size() != 0, "size should not be equal to 0"); 397 doAssert(set.size() == 26, "size should be equal to 26"); 398 set.remove(0x006d, 0x0070); 399 expectPairs(set, "alqz"); 400 doAssert(set.size() == 22, "size should be equal to 22"); 401 set.remove(0x0065, 0x0067); 402 expectPairs(set, "adhlqz"); 403 doAssert(set.size() == 19, "size should be equal to 19"); 404 set.remove(0x0064, 0x0069); 405 expectPairs(set, "acjlqz"); 406 doAssert(set.size() == 16, "size should be equal to 16"); 407 set.remove(0x0063, 0x0072); 408 expectPairs(set, "absz"); 409 doAssert(set.size() == 10, "size should be equal to 10"); 410 set.add(0x0066, 0x0071); 411 expectPairs(set, "abfqsz"); 412 doAssert(set.size() == 22, "size should be equal to 22"); 413 set.remove(0x0061, 0x0067); 414 expectPairs(set, "hqsz"); 415 
set.remove(0x0061, 0x007a); 416 expectPairs(set, ""); 417 doAssert(set.isEmpty() == TRUE, "set should be empty"); 418 doAssert(set.size() == 0, "size should be 0"); 419 set.add(0x0061); 420 doAssert(set.isEmpty() == FALSE, "set should not be empty"); 421 doAssert(set.size() == 1, "size should not be equal to 1"); 422 set.add(0x0062); 423 set.add(0x0063); 424 expectPairs(set, "ac"); 425 doAssert(set.size() == 3, "size should not be equal to 3"); 426 set.add(0x0070); 427 set.add(0x0071); 428 expectPairs(set, "acpq"); 429 doAssert(set.size() == 5, "size should not be equal to 5"); 430 set.clear(); 431 expectPairs(set, ""); 432 doAssert(set.isEmpty() == TRUE, "set should be empty"); 433 doAssert(set.size() == 0, "size should be 0"); 434 435 // Try removing an entire set from another set 436 expectPattern(set, "[c-x]", "cx"); 437 UnicodeSet set2; 438 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz"); 439 set.removeAll(set2); 440 expectPairs(set, "deluxx"); 441 442 // Try adding an entire set to another set 443 expectPattern(set, "[jackiemclean]", "aacceein"); 444 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort"); 445 set.addAll(set2); 446 expectPairs(set, "aacehort"); 447 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2"); 448 449 // Try retaining an set of elements contained in another set (intersection) 450 UnicodeSet set3; 451 expectPattern(set3, "[a-c]", "ac"); 452 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3"); 453 set3.remove(0x0062); 454 expectPairs(set3, "aacc"); 455 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3"); 456 set.retainAll(set3); 457 expectPairs(set, "aacc"); 458 doAssert(set.size() == set3.size(), "set.size() should be set3.size()"); 459 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3"); 460 set.clear(); 461 doAssert(set.size() != set3.size(), "set.size() != set3.size()"); 462 463 
// Test commutativity 464 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort"); 465 expectPattern(set2, "[jackiemclean]", "aacceein"); 466 set.addAll(set2); 467 expectPairs(set, "aacehort"); 468 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2"); 469 470 471 472 473} 474 475/** 476 * Make sure minimal representation is maintained. 477 */ 478void UnicodeSetTest::TestMinimalRep() { 479 UErrorCode status = U_ZERO_ERROR; 480 // This is pretty thoroughly tested by checkCanonicalRep() 481 // run against the exhaustive operation results. Use the code 482 // here for debugging specific spot problems. 483 484 // 1 overlap against 2 485 UnicodeSet set("[h-km-q]", status); 486 if (U_FAILURE(status)) { errln("FAIL"); return; } 487 UnicodeSet set2("[i-o]", status); 488 if (U_FAILURE(status)) { errln("FAIL"); return; } 489 set.addAll(set2); 490 expectPairs(set, "hq"); 491 // right 492 set.applyPattern("[a-m]", status); 493 if (U_FAILURE(status)) { errln("FAIL"); return; } 494 set2.applyPattern("[e-o]", status); 495 if (U_FAILURE(status)) { errln("FAIL"); return; } 496 set.addAll(set2); 497 expectPairs(set, "ao"); 498 // left 499 set.applyPattern("[e-o]", status); 500 if (U_FAILURE(status)) { errln("FAIL"); return; } 501 set2.applyPattern("[a-m]", status); 502 if (U_FAILURE(status)) { errln("FAIL"); return; } 503 set.addAll(set2); 504 expectPairs(set, "ao"); 505 // 1 overlap against 3 506 set.applyPattern("[a-eg-mo-w]", status); 507 if (U_FAILURE(status)) { errln("FAIL"); return; } 508 set2.applyPattern("[d-q]", status); 509 if (U_FAILURE(status)) { errln("FAIL"); return; } 510 set.addAll(set2); 511 expectPairs(set, "aw"); 512} 513 514void UnicodeSetTest::TestAPI() { 515 UErrorCode status = U_ZERO_ERROR; 516 // default ct 517 UnicodeSet set; 518 if (!set.isEmpty() || set.getRangeCount() != 0) { 519 errln((UnicodeString)"FAIL, set should be empty but isn't: " + 520 set); 521 } 522 523 // clear(), isEmpty() 524 set.add(0x0061); 525 
if (set.isEmpty()) { 526 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " + 527 set); 528 } 529 set.clear(); 530 if (!set.isEmpty()) { 531 errln((UnicodeString)"FAIL, set should be empty but isn't: " + 532 set); 533 } 534 535 // size() 536 set.clear(); 537 if (set.size() != 0) { 538 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() + 539 ": " + set); 540 } 541 set.add(0x0061); 542 if (set.size() != 1) { 543 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() + 544 ": " + set); 545 } 546 set.add(0x0031, 0x0039); 547 if (set.size() != 10) { 548 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() + 549 ": " + set); 550 } 551 552 // contains(first, last) 553 set.clear(); 554 set.applyPattern("[A-Y 1-8 b-d l-y]", status); 555 if (U_FAILURE(status)) { errln("FAIL"); return; } 556 for (int32_t i = 0; i<set.getRangeCount(); ++i) { 557 UChar32 a = set.getRangeStart(i); 558 UChar32 b = set.getRangeEnd(i); 559 if (!set.contains(a, b)) { 560 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b + 561 " but doesn't: " + set); 562 } 563 if (set.contains((UChar32)(a-1), b)) { 564 errln((UnicodeString)"FAIL, shouldn't contain " + 565 (unsigned short)(a-1) + '-' + (unsigned short)b + 566 " but does: " + set); 567 } 568 if (set.contains(a, (UChar32)(b+1))) { 569 errln((UnicodeString)"FAIL, shouldn't contain " + 570 (unsigned short)a + '-' + (unsigned short)(b+1) + 571 " but does: " + set); 572 } 573 } 574 575 // Ported InversionList test. 
576 UnicodeSet a((UChar32)3,(UChar32)10); 577 UnicodeSet b((UChar32)7,(UChar32)15); 578 UnicodeSet c; 579 580 logln((UnicodeString)"a [3-10]: " + a); 581 logln((UnicodeString)"b [7-15]: " + b); 582 c = a; 583 c.addAll(b); 584 UnicodeSet exp((UChar32)3,(UChar32)15); 585 if (c == exp) { 586 logln((UnicodeString)"c.set(a).add(b): " + c); 587 } else { 588 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp); 589 } 590 c.complement(); 591 exp.set((UChar32)0, (UChar32)2); 592 exp.add((UChar32)16, UnicodeSet::MAX_VALUE); 593 if (c == exp) { 594 logln((UnicodeString)"c.complement(): " + c); 595 } else { 596 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp); 597 } 598 c.complement(); 599 exp.set((UChar32)3, (UChar32)15); 600 if (c == exp) { 601 logln((UnicodeString)"c.complement(): " + c); 602 } else { 603 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp); 604 } 605 c = a; 606 c.complementAll(b); 607 exp.set((UChar32)3,(UChar32)6); 608 exp.add((UChar32)11,(UChar32) 15); 609 if (c == exp) { 610 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c); 611 } else { 612 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp); 613 } 614 615 exp = c; 616 bitsToSet(setToBits(c), c); 617 if (c == exp) { 618 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c); 619 } else { 620 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp); 621 } 622 623 // Additional tests for coverage JB#2118 624 //UnicodeSet::complement(class UnicodeString const &) 625 //UnicodeSet::complementAll(class UnicodeString const &) 626 //UnicodeSet::containsNone(class UnicodeSet const &) 627 //UnicodeSet::containsNone(long,long) 628 //UnicodeSet::containsSome(class UnicodeSet const &) 629 //UnicodeSet::containsSome(long,long) 630 //UnicodeSet::removeAll(class UnicodeString const &) 631 //UnicodeSet::retain(long) 632 //UnicodeSet::retainAll(class UnicodeString const &) 633 
//UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &) 634 //UnicodeSetIterator::getString(void) 635 set.clear(); 636 set.complement("ab"); 637 exp.applyPattern("[{ab}]", status); 638 if (U_FAILURE(status)) { errln("FAIL"); return; } 639 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; } 640 641 UnicodeSetIterator iset(set); 642 if (!iset.next() || !iset.isString()) { 643 errln("FAIL: UnicodeSetIterator::next/isString"); 644 } else if (iset.getString() != "ab") { 645 errln("FAIL: UnicodeSetIterator::getString"); 646 } 647 648 set.add((UChar32)0x61, (UChar32)0x7A); 649 set.complementAll("alan"); 650 exp.applyPattern("[{ab}b-kmo-z]", status); 651 if (U_FAILURE(status)) { errln("FAIL"); return; } 652 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; } 653 654 exp.applyPattern("[a-z]", status); 655 if (U_FAILURE(status)) { errln("FAIL"); return; } 656 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 657 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 658 exp.applyPattern("[aln]", status); 659 if (U_FAILURE(status)) { errln("FAIL"); return; } 660 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 661 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 662 663 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) { 664 errln("FAIL: containsNone(UChar32, UChar32)"); 665 } 666 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) { 667 errln("FAIL: containsSome(UChar32, UChar32)"); 668 } 669 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) { 670 errln("FAIL: containsNone(UChar32, UChar32)"); 671 } 672 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) { 673 errln("FAIL: containsSome(UChar32, UChar32)"); 674 } 675 676 set.removeAll("liu"); 677 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status); 678 if (U_FAILURE(status)) { errln("FAIL"); return; } 679 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; } 680 681 set.retainAll("star"); 682 
exp.applyPattern("[rst]", status); 683 if (U_FAILURE(status)) { errln("FAIL"); return; } 684 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; } 685 686 set.retain((UChar32)0x73); 687 exp.applyPattern("[s]", status); 688 if (U_FAILURE(status)) { errln("FAIL"); return; } 689 if (set != exp) { errln("FAIL: retain('s')"); return; } 690 691 uint16_t buf[32]; 692 int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status); 693 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; } 694 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) { 695 errln("FAIL: serialize"); 696 return; 697 } 698 699 // Conversions to and from USet 700 UnicodeSet *uniset = &set; 701 USet *uset = uniset->toUSet(); 702 TEST_ASSERT((void *)uset == (void *)uniset); 703 UnicodeSet *setx = UnicodeSet::fromUSet(uset); 704 TEST_ASSERT((void *)setx == (void *)uset); 705 const UnicodeSet *constSet = uniset; 706 const USet *constUSet = constSet->toUSet(); 707 TEST_ASSERT((void *)constUSet == (void *)constSet); 708 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet); 709 TEST_ASSERT((void *)constSetx == (void *)constUSet); 710 711 // span(UnicodeString) and spanBack(UnicodeString) convenience methods 712 UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc"); 713 UnicodeSet ac(0x61, 0x63); 714 ac.remove(0x62).freeze(); 715 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 || 716 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 || 717 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 || 718 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 || 719 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 || 720 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 || 721 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 || 722 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 || 723 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 || 724 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30 725 ) { 726 errln("UnicodeSet.span(UnicodeString, ...) 
returns incorrect end indexes"); 727 } 728 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 || 729 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 || 730 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 || 731 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 || 732 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 || 733 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 || 734 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 || 735 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 || 736 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 || 737 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20 738 ) { 739 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes"); 740 } 741} 742 743void UnicodeSetTest::TestIteration() { 744 UErrorCode ec = U_ZERO_ERROR; 745 int i = 0; 746 int outerLoop; 747 748 // 6 code points, 3 ranges, 2 strings, 8 total elements 749 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2" 750 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec); 751 TEST_ASSERT_SUCCESS(ec); 752 UnicodeSetIterator it(set); 753 754 for (outerLoop=0; outerLoop<3; outerLoop++) { 755 // Run the test multiple times, to check that iterator.reset() is working. 
756 for (i=0; i<10; i++) { 757 UBool nextv = it.next(); 758 UBool isString = it.isString(); 759 int32_t codePoint = it.getCodepoint(); 760 //int32_t codePointEnd = it.getCodepointEnd(); 761 UnicodeString s = it.getString(); 762 switch (i) { 763 case 0: 764 TEST_ASSERT(nextv == TRUE); 765 TEST_ASSERT(isString == FALSE); 766 TEST_ASSERT(codePoint==0x61); 767 TEST_ASSERT(s == "a"); 768 break; 769 case 1: 770 TEST_ASSERT(nextv == TRUE); 771 TEST_ASSERT(isString == FALSE); 772 TEST_ASSERT(codePoint==0x62); 773 TEST_ASSERT(s == "b"); 774 break; 775 case 2: 776 TEST_ASSERT(nextv == TRUE); 777 TEST_ASSERT(isString == FALSE); 778 TEST_ASSERT(codePoint==0x63); 779 TEST_ASSERT(s == "c"); 780 break; 781 case 3: 782 TEST_ASSERT(nextv == TRUE); 783 TEST_ASSERT(isString == FALSE); 784 TEST_ASSERT(codePoint==0x79); 785 TEST_ASSERT(s == "y"); 786 break; 787 case 4: 788 TEST_ASSERT(nextv == TRUE); 789 TEST_ASSERT(isString == FALSE); 790 TEST_ASSERT(codePoint==0x7a); 791 TEST_ASSERT(s == "z"); 792 break; 793 case 5: 794 TEST_ASSERT(nextv == TRUE); 795 TEST_ASSERT(isString == FALSE); 796 TEST_ASSERT(codePoint==0x1abcd); 797 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd)); 798 break; 799 case 6: 800 TEST_ASSERT(nextv == TRUE); 801 TEST_ASSERT(isString == TRUE); 802 TEST_ASSERT(s == "str1"); 803 break; 804 case 7: 805 TEST_ASSERT(nextv == TRUE); 806 TEST_ASSERT(isString == TRUE); 807 TEST_ASSERT(s == "str2"); 808 break; 809 case 8: 810 TEST_ASSERT(nextv == FALSE); 811 break; 812 case 9: 813 TEST_ASSERT(nextv == FALSE); 814 break; 815 } 816 } 817 it.reset(); // prepare to run the iteration again. 
818 } 819} 820 821 822 823 824void UnicodeSetTest::TestStrings() { 825 UErrorCode ec = U_ZERO_ERROR; 826 827 UnicodeSet* testList[] = { 828 UnicodeSet::createFromAll("abc"), 829 new UnicodeSet("[a-c]", ec), 830 831 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")), 832 new UnicodeSet("[{ll}{ch}a-z]", ec), 833 834 UnicodeSet::createFrom("ab}c"), 835 new UnicodeSet("[{ab\\}c}]", ec), 836 837 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')), 838 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec), 839 840 NULL 841 }; 842 843 if (U_FAILURE(ec)) { 844 errln("FAIL: couldn't construct test sets"); 845 } 846 847 for (int32_t i = 0; testList[i] != NULL; i+=2) { 848 if (U_SUCCESS(ec)) { 849 UnicodeString pat0, pat1; 850 testList[i]->toPattern(pat0, TRUE); 851 testList[i+1]->toPattern(pat1, TRUE); 852 if (*testList[i] == *testList[i+1]) { 853 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1); 854 } else { 855 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1); 856 } 857 } 858 delete testList[i]; 859 delete testList[i+1]; 860 } 861} 862 863/** 864 * Test the [:Latin:] syntax. 865 */ 866void UnicodeSetTest::TestScriptSet() { 867 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1")); 868 869 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA"); 870 871 /* Jitterbug 1423 */ 872 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA"); 873 874} 875 876/** 877 * Test the [:Latin:] syntax. 
878 */ 879void UnicodeSetTest::TestPropertySet() { 880 static const char* const DATA[] = { 881 // Pattern, Chars IN, Chars NOT in 882 883 "[:Latin:]", 884 "aA", 885 "\\u0391\\u03B1", 886 887 "[\\p{Greek}]", 888 "\\u0391\\u03B1", 889 "aA", 890 891 "\\P{ GENERAL Category = upper case letter }", 892 "abc", 893 "ABC", 894 895#if !UCONFIG_NO_NORMALIZATION 896 // Combining class: @since ICU 2.2 897 // Check both symbolic and numeric 898 "\\p{ccc=Nukta}", 899 "\\u0ABC", 900 "abc", 901 902 "\\p{Canonical Combining Class = 11}", 903 "\\u05B1", 904 "\\u05B2", 905 906 "[:c c c = iota subscript :]", 907 "\\u0345", 908 "xyz", 909#endif 910 911 // Bidi class: @since ICU 2.2 912 "\\p{bidiclass=lefttoright}", 913 "abc", 914 "\\u0671\\u0672", 915 916 // Binary properties: @since ICU 2.2 917 "\\p{ideographic}", 918 "\\u4E0A", 919 "x", 920 921 "[:math=false:]", 922 "q)*(", 923 // weiv: )(and * were removed from math in Unicode 4.0.1 924 //"(*+)", 925 "+<>^", 926 927 // JB#1767 \N{}, \p{ASCII} 928 "[:Ascii:]", 929 "abc\\u0000\\u007F", 930 "\\u0080\\u4E00", 931 932 "[\\N{ latin small letter a }[:name= latin small letter z:]]", 933 "az", 934 "qrs", 935 936 // JB#2015 937 "[:any:]", 938 "a\\U0010FFFF", 939 "", 940 941 "[:nv=0.5:]", 942 "\\u00BD\\u0F2A", 943 "\\u00BC", 944 945 // JB#2653: Age 946 "[:Age=1.1:]", 947 "\\u03D6", // 1.1 948 "\\u03D8\\u03D9", // 3.2 949 950 "[:Age=3.1:]", 951 "\\u1800\\u3400\\U0002f800", 952 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000", 953 954 // JB#2350: Case_Sensitive 955 "[:Case Sensitive:]", 956 "A\\u1FFC\\U00010410", 957 ";\\u00B4\\U00010500", 958 959 // JB#2832: C99-compatibility props 960 "[:blank:]", 961 " \\u0009", 962 "1-9A-Z", 963 964 "[:graph:]", 965 "19AZ", 966 " \\u0003\\u0007\\u0009\\u000A\\u000D", 967 968 "[:punct:]", 969 "!@#%&*()[]{}-_\\/;:,.?'\"", 970 "09azAZ", 971 972 "[:xdigit:]", 973 "09afAF", 974 "gG!", 975 976 // Regex compatibility test 977 "[-b]", // leading '-' is literal 978 "-b", 979 "ac", 980 981 "[^-b]", // 
leading '-' is literal 982 "ac", 983 "-b", 984 985 "[b-]", // trailing '-' is literal 986 "-b", 987 "ac", 988 989 "[^b-]", // trailing '-' is literal 990 "ac", 991 "-b", 992 993 "[a-b-]", // trailing '-' is literal 994 "ab-", 995 "c=", 996 997 "[[a-q]&[p-z]-]", // trailing '-' is literal 998 "pq-", 999 "or=", 1000 1001 "[\\s|\\)|:|$|\\>]", // from regex tests 1002 "s|):$>", 1003 "abc", 1004 1005 "[\\uDC00cd]", // JB#2906: isolated trail at start 1006 "cd\\uDC00", 1007 "ab\\uD800\\U00010000", 1008 1009 "[ab\\uD800]", // JB#2906: isolated trail at start 1010 "ab\\uD800", 1011 "cd\\uDC00\\U00010000", 1012 1013 "[ab\\uD800cd]", // JB#2906: isolated lead in middle 1014 "abcd\\uD800", 1015 "ef\\uDC00\\U00010000", 1016 1017 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle 1018 "abcd\\uDC00", 1019 "ef\\uD800\\U00010000", 1020 1021#if !UCONFIG_NO_NORMALIZATION 1022 "[:^lccc=0:]", // Lead canonical class 1023 "\\u0300\\u0301", 1024 "abcd\\u00c0\\u00c5", 1025 1026 "[:^tccc=0:]", // Trail canonical class 1027 "\\u0300\\u0301\\u00c0\\u00c5", 1028 "abcd", 1029 1030 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class 1031 "\\u0300\\u0301\\u00c0\\u00c5", 1032 "abcd", 1033 1034 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now) 1035 "", 1036 "abcd\\u0300\\u0301\\u00c0\\u00c5", 1037 1038 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. 
Complete canonical class is zero, but both lead and trail are not 1039 "\\u0F73\\u0F75\\u0F81", 1040 "abcd\\u0300\\u0301\\u00c0\\u00c5", 1041#endif /* !UCONFIG_NO_NORMALIZATION */ 1042 1043 "[:Assigned:]", 1044 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD", 1045 "\\u0888\\uFDD3\\uFFFE\\U00050005", 1046 1047 // Script_Extensions, new in Unicode 6.0 1048 "[:scx=Arab:]", 1049 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3", 1050 "\\u061D\\uFDEF\\uFDFE", 1051 1052 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions, 1053 // so scx-sc is missing U+FDF2. 1054 "[[:Script_Extensions=Arabic:]-[:Arab:]]", 1055 "\\u0640\\u064B\\u0650\\u0655", 1056 "\\uFDF2" 1057 }; 1058 1059 static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]); 1060 1061 for (int32_t i=0; i<DATA_LEN; i+=3) { 1062 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]), 1063 CharsToUnicodeString(DATA[i+2])); 1064 } 1065} 1066 1067/** 1068 * Test that Posix style character classes [:digit:], etc. 1069 * have the Unicode definitions from TR 18. 
1070 */ 1071void UnicodeSetTest::TestPosixClasses() { 1072 { 1073 UErrorCode status = U_ZERO_ERROR; 1074 UnicodeSet s1("[:alpha:]", status); 1075 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status); 1076 TEST_ASSERT_SUCCESS(status); 1077 TEST_ASSERT(s1==s2); 1078 } 1079 { 1080 UErrorCode status = U_ZERO_ERROR; 1081 UnicodeSet s1("[:lower:]", status); 1082 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status); 1083 TEST_ASSERT_SUCCESS(status); 1084 TEST_ASSERT(s1==s2); 1085 } 1086 { 1087 UErrorCode status = U_ZERO_ERROR; 1088 UnicodeSet s1("[:upper:]", status); 1089 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status); 1090 TEST_ASSERT_SUCCESS(status); 1091 TEST_ASSERT(s1==s2); 1092 } 1093 { 1094 UErrorCode status = U_ZERO_ERROR; 1095 UnicodeSet s1("[:punct:]", status); 1096 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status); 1097 TEST_ASSERT_SUCCESS(status); 1098 TEST_ASSERT(s1==s2); 1099 } 1100 { 1101 UErrorCode status = U_ZERO_ERROR; 1102 UnicodeSet s1("[:digit:]", status); 1103 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status); 1104 TEST_ASSERT_SUCCESS(status); 1105 TEST_ASSERT(s1==s2); 1106 } 1107 { 1108 UErrorCode status = U_ZERO_ERROR; 1109 UnicodeSet s1("[:xdigit:]", status); 1110 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status); 1111 TEST_ASSERT_SUCCESS(status); 1112 TEST_ASSERT(s1==s2); 1113 } 1114 { 1115 UErrorCode status = U_ZERO_ERROR; 1116 UnicodeSet s1("[:alnum:]", status); 1117 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status); 1118 TEST_ASSERT_SUCCESS(status); 1119 TEST_ASSERT(s1==s2); 1120 } 1121 { 1122 UErrorCode status = U_ZERO_ERROR; 1123 UnicodeSet s1("[:space:]", status); 1124 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status); 1125 TEST_ASSERT_SUCCESS(status); 1126 TEST_ASSERT(s1==s2); 1127 } 1128 { 1129 UErrorCode status = U_ZERO_ERROR; 1130 UnicodeSet s1("[:blank:]", status); 1131 
TEST_ASSERT_SUCCESS(status); 1132 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"), 1133 status); 1134 TEST_ASSERT_SUCCESS(status); 1135 TEST_ASSERT(s1==s2); 1136 } 1137 { 1138 UErrorCode status = U_ZERO_ERROR; 1139 UnicodeSet s1("[:cntrl:]", status); 1140 TEST_ASSERT_SUCCESS(status); 1141 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status); 1142 TEST_ASSERT_SUCCESS(status); 1143 TEST_ASSERT(s1==s2); 1144 } 1145 { 1146 UErrorCode status = U_ZERO_ERROR; 1147 UnicodeSet s1("[:graph:]", status); 1148 TEST_ASSERT_SUCCESS(status); 1149 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status); 1150 TEST_ASSERT_SUCCESS(status); 1151 TEST_ASSERT(s1==s2); 1152 } 1153 { 1154 UErrorCode status = U_ZERO_ERROR; 1155 UnicodeSet s1("[:print:]", status); 1156 TEST_ASSERT_SUCCESS(status); 1157 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status); 1158 TEST_ASSERT_SUCCESS(status); 1159 TEST_ASSERT(s1==s2); 1160 } 1161} 1162/** 1163 * Test cloning of UnicodeSet. For C++, we test the copy constructor. 1164 */ 1165void UnicodeSetTest::TestClone() { 1166 UErrorCode ec = U_ZERO_ERROR; 1167 UnicodeSet s("[abcxyz]", ec); 1168 UnicodeSet t(s); 1169 expectContainment(t, "abc", "def"); 1170} 1171 1172/** 1173 * Test the indexOf() and charAt() methods. 
1174 */ 1175void UnicodeSetTest::TestIndexOf() { 1176 UErrorCode ec = U_ZERO_ERROR; 1177 UnicodeSet set("[a-cx-y3578]", ec); 1178 if (U_FAILURE(ec)) { 1179 errln("FAIL: UnicodeSet constructor"); 1180 return; 1181 } 1182 for (int32_t i=0; i<set.size(); ++i) { 1183 UChar32 c = set.charAt(i); 1184 if (set.indexOf(c) != i) { 1185 errln("FAIL: charAt(%d) = %X => indexOf() => %d", 1186 i, c, set.indexOf(c)); 1187 } 1188 } 1189 UChar32 c = set.charAt(set.size()); 1190 if (c != -1) { 1191 errln("FAIL: charAt(<out of range>) = %X", c); 1192 } 1193 int32_t j = set.indexOf((UChar32)0x71/*'q'*/); 1194 if (j != -1) { 1195 errln((UnicodeString)"FAIL: indexOf('q') = " + j); 1196 } 1197} 1198 1199/** 1200 * Test closure API. 1201 */ 1202void UnicodeSetTest::TestCloseOver() { 1203 UErrorCode ec = U_ZERO_ERROR; 1204 1205 char CASE[] = {(char)USET_CASE_INSENSITIVE}; 1206 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS}; 1207 const char* DATA[] = { 1208 // selector, input, output 1209 CASE, 1210 "[aq\\u00DF{Bc}{bC}{Fi}]", 1211 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1 1212 1213 CASE, 1214 "[\\u01F1]", // 'DZ' 1215 "[\\u01F1\\u01F2\\u01F3]", 1216 1217 CASE, 1218 "[\\u1FB4]", 1219 "[\\u1FB4{\\u03AC\\u03B9}]", 1220 1221 CASE, 1222 "[{F\\uFB01}]", 1223 "[\\uFB03{ffi}]", 1224 1225 CASE, // make sure binary search finds limits 1226 "[a\\uFF3A]", 1227 "[aA\\uFF3A\\uFF5A]", 1228 1229 CASE, 1230 "[a-z]","[A-Za-z\\u017F\\u212A]", 1231 CASE, 1232 "[abc]","[A-Ca-c]", 1233 CASE, 1234 "[ABC]","[A-Ca-c]", 1235 1236 CASE, "[i]", "[iI]", 1237 1238 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I 1239 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot 1240 1241 CASE, "[\\u0131]", "[\\u0131]", // dotless i 1242 1243 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]", 1244 1245 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas 1246 1247 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas 1248 1249 CASE, 
"[\\u03f7]", "[\\u03f7\\u03f8]", 1250 1251 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]", 1252 1253 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]", 1254 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]", 1255 1256 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]", 1257 1258 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table 1259 1260 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table 1261 1262#if !UCONFIG_NO_FILE_IO 1263 CASE_MAPPINGS, 1264 "[aq\\u00DF{Bc}{bC}{Fi}]", 1265 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]", 1266#endif 1267 1268 CASE_MAPPINGS, 1269 "[\\u01F1]", // 'DZ' 1270 "[\\u01F1\\u01F2\\u01F3]", 1271 1272 CASE_MAPPINGS, 1273 "[a-z]", 1274 "[A-Za-z]", 1275 1276 NULL 1277 }; 1278 1279 UnicodeSet s; 1280 UnicodeSet t; 1281 UnicodeString buf; 1282 for (int32_t i=0; DATA[i]!=NULL; i+=3) { 1283 int32_t selector = DATA[i][0]; 1284 UnicodeString pat(DATA[i+1], -1, US_INV); 1285 UnicodeString exp(DATA[i+2], -1, US_INV); 1286 s.applyPattern(pat, ec); 1287 s.closeOver(selector); 1288 t.applyPattern(exp, ec); 1289 if (U_FAILURE(ec)) { 1290 errln("FAIL: applyPattern failed"); 1291 continue; 1292 } 1293 if (s == t) { 1294 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp); 1295 } else { 1296 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " + 1297 s.toPattern(buf, TRUE) + ", expected " + exp); 1298 } 1299 } 1300 1301#if 0 1302 /* 1303 * Unused test code. 1304 * This was used to compare the old implementation (using USET_CASE) 1305 * with the new one (using 0x100 temporarily) 1306 * while transitioning from hardcoded case closure tables in uniset.cpp 1307 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu. 1308 * and using ucase.c functions for closure. 
1309 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file 1310 * 1311 * Note: The old and new implementation never fully matched because 1312 * the old implementation turned out to not map U+0130 and U+0131 correctly 1313 * (dotted I and dotless i) and because the old implementation's data tables 1314 * were outdated compared to Unicode 4.0.1 at the time of the change to the 1315 * new implementation. (So sigmas and some other characters were not handled 1316 * according to the newer Unicode version.) 1317 */ 1318 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2; 1319 UnicodeSetIterator si(sens); 1320 UnicodeString str, buf2; 1321 const UnicodeString *pStr; 1322 UChar32 c; 1323 while(si.next()) { 1324 if(!si.isString()) { 1325 c=si.getCodepoint(); 1326 s.clear(); 1327 s.add(c); 1328 1329 str.setTo(c); 1330 str.foldCase(); 1331 sens2.add(str); 1332 1333 t=s; 1334 s.closeOver(USET_CASE); 1335 t.closeOver(0x100); 1336 if(s!=t) { 1337 errln("FAIL: closeOver(U+%04x) differs: ", c); 1338 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE)); 1339 } 1340 } 1341 } 1342 // remove all code points 1343 // should contain all full case folding mapping strings 1344 sens2.remove(0, 0x10ffff); 1345 si.reset(sens2); 1346 while(si.next()) { 1347 if(si.isString()) { 1348 pStr=&si.getString(); 1349 s.clear(); 1350 s.add(*pStr); 1351 t=s2=s; 1352 s.closeOver(USET_CASE); 1353 t.closeOver(0x100); 1354 if(s!=t) { 1355 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: "); 1356 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE)); 1357 } 1358 } 1359 } 1360#endif 1361 1362 // Test the pattern API 1363 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec); 1364 if (U_FAILURE(ec)) { 1365 errln("FAIL: applyPattern failed"); 1366 } else { 1367 expectContainment(s, "abcABC", "defDEF"); 1368 } 1369 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec); 1370 if (U_FAILURE(ec)) { 1371 
errln("FAIL: constructor failed"); 1372 } else { 1373 expectContainment(v, "defDEF", "abcABC"); 1374 } 1375 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec); 1376 if (U_FAILURE(ec)) { 1377 errln("FAIL: construct w/case mappings failed"); 1378 } else { 1379 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A")); 1380 } 1381} 1382 1383void UnicodeSetTest::TestEscapePattern() { 1384 const char pattern[] = 1385 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]"; 1386 const char exp[] = 1387 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]"; 1388 // We test this with two passes; in the second pass we 1389 // pre-unescape the pattern. Since U+200E is Pattern_White_Space, 1390 // this fails -- which is what we expect. 1391 for (int32_t pass=1; pass<=2; ++pass) { 1392 UErrorCode ec = U_ZERO_ERROR; 1393 UnicodeString pat(pattern, -1, US_INV); 1394 if (pass==2) { 1395 pat = pat.unescape(); 1396 } 1397 // Pattern is only good for pass 1 1398 UBool isPatternValid = (pass==1); 1399 1400 UnicodeSet set(pat, ec); 1401 if (U_SUCCESS(ec) != isPatternValid){ 1402 errln((UnicodeString)"FAIL: applyPattern(" + 1403 escape(pat) + ") => " + 1404 u_errorName(ec)); 1405 continue; 1406 } 1407 if (U_FAILURE(ec)) { 1408 continue; 1409 } 1410 if (set.contains((UChar)0x0644)){ 1411 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)"); 1412 } 1413 1414 UnicodeString newpat; 1415 set.toPattern(newpat, TRUE); 1416 if (newpat == UnicodeString(exp, -1, US_INV)) { 1417 logln(escape(pat) + " => " + newpat); 1418 } else { 1419 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat); 1420 } 1421 1422 for (int32_t i=0; i<set.getRangeCount(); ++i) { 1423 UnicodeString str("Range "); 1424 str.append((UChar)(0x30 + i)) 1425 .append(": ") 1426 .append((UChar32)set.getRangeStart(i)) 1427 .append(" - ") 1428 .append((UChar32)set.getRangeEnd(i)); 1429 str = str + " (" + set.getRangeStart(i) + " - " + 1430 
set.getRangeEnd(i) + ")"; 1431 if (set.getRangeStart(i) < 0) { 1432 errln((UnicodeString)"FAIL: " + escape(str)); 1433 } else { 1434 logln(escape(str)); 1435 } 1436 } 1437 } 1438} 1439 1440void UnicodeSetTest::expectRange(const UnicodeString& label, 1441 const UnicodeSet& set, 1442 UChar32 start, UChar32 end) { 1443 UnicodeSet exp(start, end); 1444 UnicodeString pat; 1445 if (set == exp) { 1446 logln(label + " => " + set.toPattern(pat, TRUE)); 1447 } else { 1448 UnicodeString xpat; 1449 errln((UnicodeString)"FAIL: " + label + " => " + 1450 set.toPattern(pat, TRUE) + 1451 ", expected " + exp.toPattern(xpat, TRUE)); 1452 } 1453} 1454 1455void UnicodeSetTest::TestInvalidCodePoint() { 1456 1457 const UChar32 DATA[] = { 1458 // Test range Expected range 1459 0, 0x10FFFF, 0, 0x10FFFF, 1460 (UChar32)-1, 8, 0, 8, 1461 8, 0x110000, 8, 0x10FFFF 1462 }; 1463 const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]); 1464 1465 UnicodeString pat; 1466 int32_t i; 1467 1468 for (i=0; i<DATA_LENGTH; i+=4) { 1469 UChar32 start = DATA[i]; 1470 UChar32 end = DATA[i+1]; 1471 UChar32 xstart = DATA[i+2]; 1472 UChar32 xend = DATA[i+3]; 1473 1474 // Try various API using the test code points 1475 1476 UnicodeSet set(start, end); 1477 expectRange((UnicodeString)"ct(" + start + "," + end + ")", 1478 set, xstart, xend); 1479 1480 set.clear(); 1481 set.set(start, end); 1482 expectRange((UnicodeString)"set(" + start + "," + end + ")", 1483 set, xstart, xend); 1484 1485 UBool b = set.contains(start); 1486 b = set.contains(start, end); 1487 b = set.containsNone(start, end); 1488 b = set.containsSome(start, end); 1489 (void)b; // Suppress set but not used warning. 
1490 1491 /*int32_t index = set.indexOf(start);*/ 1492 1493 set.clear(); 1494 set.add(start); 1495 set.add(start, end); 1496 expectRange((UnicodeString)"add(" + start + "," + end + ")", 1497 set, xstart, xend); 1498 1499 set.set(0, 0x10FFFF); 1500 set.retain(start, end); 1501 expectRange((UnicodeString)"retain(" + start + "," + end + ")", 1502 set, xstart, xend); 1503 set.retain(start); 1504 1505 set.set(0, 0x10FFFF); 1506 set.remove(start); 1507 set.remove(start, end); 1508 set.complement(); 1509 expectRange((UnicodeString)"!remove(" + start + "," + end + ")", 1510 set, xstart, xend); 1511 1512 set.set(0, 0x10FFFF); 1513 set.complement(start, end); 1514 set.complement(); 1515 expectRange((UnicodeString)"!complement(" + start + "," + end + ")", 1516 set, xstart, xend); 1517 set.complement(start); 1518 } 1519 1520 const UChar32 DATA2[] = { 1521 0, 1522 0x10FFFF, 1523 (UChar32)-1, 1524 0x110000 1525 }; 1526 const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]); 1527 1528 for (i=0; i<DATA2_LENGTH; ++i) { 1529 UChar32 c = DATA2[i], end = 0x10FFFF; 1530 UBool valid = (c >= 0 && c <= 0x10FFFF); 1531 1532 UnicodeSet set(0, 0x10FFFF); 1533 1534 // For single-codepoint contains, invalid codepoints are NOT contained 1535 UBool b = set.contains(c); 1536 if (b == valid) { 1537 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c + 1538 ") = " + b); 1539 } else { 1540 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c + 1541 ") = " + b); 1542 } 1543 1544 // For codepoint range contains, containsNone, and containsSome, 1545 // invalid or empty (start > end) ranges have UNDEFINED behavior. 
1546 b = set.contains(c, end); 1547 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c + 1548 "," + end + ") = " + b); 1549 1550 b = set.containsNone(c, end); 1551 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c + 1552 "," + end + ") = " + b); 1553 1554 b = set.containsSome(c, end); 1555 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c + 1556 "," + end + ") = " + b); 1557 1558 int32_t index = set.indexOf(c); 1559 if ((index >= 0) == valid) { 1560 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c + 1561 ") = " + index); 1562 } else { 1563 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c + 1564 ") = " + index); 1565 } 1566 } 1567} 1568 1569// Used by TestSymbolTable 1570class TokenSymbolTable : public SymbolTable { 1571public: 1572 Hashtable contents; 1573 1574 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) { 1575 contents.setValueDeleter(uprv_deleteUObject); 1576 } 1577 1578 ~TokenSymbolTable() {} 1579 1580 /** 1581 * (Non-SymbolTable API) Add the given variable and value to 1582 * the table. Variable should NOT contain leading '$'. 
1583 */ 1584 void add(const UnicodeString& var, const UnicodeString& value, 1585 UErrorCode& ec) { 1586 if (U_SUCCESS(ec)) { 1587 contents.put(var, new UnicodeString(value), ec); 1588 } 1589 } 1590 1591 /** 1592 * SymbolTable API 1593 */ 1594 virtual const UnicodeString* lookup(const UnicodeString& s) const { 1595 return (const UnicodeString*) contents.get(s); 1596 } 1597 1598 /** 1599 * SymbolTable API 1600 */ 1601 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const { 1602 return NULL; 1603 } 1604 1605 /** 1606 * SymbolTable API 1607 */ 1608 virtual UnicodeString parseReference(const UnicodeString& text, 1609 ParsePosition& pos, int32_t limit) const { 1610 int32_t start = pos.getIndex(); 1611 int32_t i = start; 1612 UnicodeString result; 1613 while (i < limit) { 1614 UChar c = text.charAt(i); 1615 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { 1616 break; 1617 } 1618 ++i; 1619 } 1620 if (i == start) { // No valid name chars 1621 return result; // Indicate failure with empty string 1622 } 1623 pos.setIndex(i); 1624 text.extractBetween(start, i, result); 1625 return result; 1626 } 1627}; 1628 1629void UnicodeSetTest::TestSymbolTable() { 1630 // Multiple test cases can be set up here. Each test case 1631 // is terminated by null: 1632 // var, value, var, value,..., input pat., exp. 
output pat., null 1633 const char* DATA[] = { 1634 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL, 1635 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL, 1636 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL, 1637 NULL 1638 }; 1639 1640 for (int32_t i=0; DATA[i]!=NULL; ++i) { 1641 UErrorCode ec = U_ZERO_ERROR; 1642 TokenSymbolTable sym(ec); 1643 if (U_FAILURE(ec)) { 1644 errln("FAIL: couldn't construct TokenSymbolTable"); 1645 continue; 1646 } 1647 1648 // Set up variables 1649 while (DATA[i+2] != NULL) { 1650 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec); 1651 if (U_FAILURE(ec)) { 1652 errln("FAIL: couldn't add to TokenSymbolTable"); 1653 continue; 1654 } 1655 i += 2; 1656 } 1657 1658 // Input pattern and expected output pattern 1659 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV); 1660 i += 2; 1661 1662 ParsePosition pos(0); 1663 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec); 1664 if (U_FAILURE(ec)) { 1665 errln("FAIL: couldn't construct UnicodeSet"); 1666 continue; 1667 } 1668 1669 // results 1670 if (pos.getIndex() != inpat.length()) { 1671 errln((UnicodeString)"Failed to read to end of string \"" 1672 + inpat + "\": read to " 1673 + pos.getIndex() + ", length is " 1674 + inpat.length()); 1675 } 1676 1677 UnicodeSet us2(exppat, ec); 1678 if (U_FAILURE(ec)) { 1679 errln("FAIL: couldn't construct expected UnicodeSet"); 1680 continue; 1681 } 1682 1683 UnicodeString a, b; 1684 if (us != us2) { 1685 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) + 1686 ", expected " + us2.toPattern(b, TRUE)); 1687 } else { 1688 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE)); 1689 } 1690 } 1691} 1692 1693void UnicodeSetTest::TestSurrogate() { 1694 const char* DATA[] = { 1695 // These should all behave identically 1696 "[abc\\uD800\\uDC00]", 1697 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java 1698 "[abc\\U00010000]", 1699 0 1700 }; 1701 for (int 
i=0; DATA[i] != 0; ++i) { 1702 UErrorCode ec = U_ZERO_ERROR; 1703 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV)); 1704 UnicodeString str = UnicodeString(DATA[i], -1, US_INV); 1705 UnicodeSet set(str, ec); 1706 if (U_FAILURE(ec)) { 1707 errln("FAIL: UnicodeSet constructor"); 1708 continue; 1709 } 1710 expectContainment(set, 1711 CharsToUnicodeString("abc\\U00010000"), 1712 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair 1713 if (set.size() != 4) { 1714 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " + 1715 set.size() + ", expected 4"); 1716 } 1717 } 1718} 1719 1720void UnicodeSetTest::TestExhaustive() { 1721 // exhaustive tests. Simulate UnicodeSets with integers. 1722 // That gives us very solid tests (except for large memory tests). 1723 1724 int32_t limit = 128; 1725 1726 UnicodeSet x, y, z, aa; 1727 1728 for (int32_t i = 0; i < limit; ++i) { 1729 bitsToSet(i, x); 1730 logln((UnicodeString)"Testing " + i + ", " + x); 1731 _testComplement(i, x, y); 1732 1733 // AS LONG AS WE ARE HERE, check roundtrip 1734 checkRoundTrip(bitsToSet(i, aa)); 1735 1736 for (int32_t j = 0; j < limit; ++j) { 1737 _testAdd(i,j, x,y,z); 1738 _testXor(i,j, x,y,z); 1739 _testRetain(i,j, x,y,z); 1740 _testRemove(i,j, x,y,z); 1741 } 1742 } 1743} 1744 1745void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) { 1746 bitsToSet(a, x); 1747 z = x; 1748 z.complement(); 1749 int32_t c = setToBits(z); 1750 if (c != (~a)) { 1751 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z); 1752 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c); 1753 } 1754 checkCanonicalRep(z, (UnicodeString)"complement " + a); 1755} 1756 1757void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1758 bitsToSet(a, x); 1759 bitsToSet(b, y); 1760 z = x; 1761 z.addAll(y); 1762 int32_t c = setToBits(z); 1763 if (c != (a | b)) { 1764 
errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z); 1765 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c); 1766 } 1767 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b); 1768} 1769 1770void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1771 bitsToSet(a, x); 1772 bitsToSet(b, y); 1773 z = x; 1774 z.retainAll(y); 1775 int32_t c = setToBits(z); 1776 if (c != (a & b)) { 1777 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z); 1778 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c); 1779 } 1780 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b); 1781} 1782 1783void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1784 bitsToSet(a, x); 1785 bitsToSet(b, y); 1786 z = x; 1787 z.removeAll(y); 1788 int32_t c = setToBits(z); 1789 if (c != (a &~ b)) { 1790 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z); 1791 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c); 1792 } 1793 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b); 1794} 1795 1796void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1797 bitsToSet(a, x); 1798 bitsToSet(b, y); 1799 z = x; 1800 z.complementAll(y); 1801 int32_t c = setToBits(z); 1802 if (c != (a ^ b)) { 1803 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z); 1804 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c); 1805 } 1806 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b); 1807} 1808 1809/** 1810 * Check that ranges are monotonically increasing and non- 1811 * overlapping. 
1812 */ 1813void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) { 1814 int32_t n = set.getRangeCount(); 1815 if (n < 0) { 1816 errln((UnicodeString)"FAIL result of " + msg + 1817 ": range count should be >= 0 but is " + 1818 n /*+ " for " + set.toPattern())*/); 1819 return; 1820 } 1821 UChar32 last = 0; 1822 for (int32_t i=0; i<n; ++i) { 1823 UChar32 start = set.getRangeStart(i); 1824 UChar32 end = set.getRangeEnd(i); 1825 if (start > end) { 1826 errln((UnicodeString)"FAIL result of " + msg + 1827 ": range " + (i+1) + 1828 " start > end: " + (int)start + ", " + (int)end + 1829 " for " + set); 1830 } 1831 if (i > 0 && start <= last) { 1832 errln((UnicodeString)"FAIL result of " + msg + 1833 ": range " + (i+1) + 1834 " overlaps previous range: " + (int)start + ", " + (int)end + 1835 " for " + set); 1836 } 1837 last = end; 1838 } 1839} 1840 1841/** 1842 * Convert a bitmask to a UnicodeSet. 1843 */ 1844UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) { 1845 result.clear(); 1846 for (UChar32 i = 0; i < 32; ++i) { 1847 if ((a & (1<<i)) != 0) { 1848 result.add(i); 1849 } 1850 } 1851 return result; 1852} 1853 1854/** 1855 * Convert a UnicodeSet to a bitmask. Only the characters 1856 * U+0000 to U+0020 are represented in the bitmask. 1857 */ 1858int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) { 1859 int32_t result = 0; 1860 for (int32_t i = 0; i < 32; ++i) { 1861 if (x.contains((UChar32)i)) { 1862 result |= (1<<i); 1863 } 1864 } 1865 return result; 1866} 1867 1868/** 1869 * Return the representation of an inversion list based UnicodeSet 1870 * as a pairs list. Ranges are listed in ascending Unicode order. 1871 * For example, the set [a-zA-M3] is represented as "33AMaz". 
1872 */ 1873UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) { 1874 UnicodeString pairs; 1875 for (int32_t i=0; i<set.getRangeCount(); ++i) { 1876 UChar32 start = set.getRangeStart(i); 1877 UChar32 end = set.getRangeEnd(i); 1878 if (end > 0xFFFF) { 1879 end = 0xFFFF; 1880 i = set.getRangeCount(); // Should be unnecessary 1881 } 1882 pairs.append((UChar)start).append((UChar)end); 1883 } 1884 return pairs; 1885} 1886 1887/** 1888 * Basic consistency check for a few items. 1889 * That the iterator works, and that we can create a pattern and 1890 * get the same thing back 1891 */ 1892void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) { 1893 UErrorCode ec = U_ZERO_ERROR; 1894 1895 UnicodeSet t(s); 1896 checkEqual(s, t, "copy ct"); 1897 1898 t = s; 1899 checkEqual(s, t, "operator="); 1900 1901 copyWithIterator(t, s, FALSE); 1902 checkEqual(s, t, "iterator roundtrip"); 1903 1904 copyWithIterator(t, s, TRUE); // try range 1905 checkEqual(s, t, "iterator roundtrip"); 1906 1907 UnicodeString pat; s.toPattern(pat, FALSE); 1908 t.applyPattern(pat, ec); 1909 if (U_FAILURE(ec)) { 1910 errln("FAIL: applyPattern"); 1911 return; 1912 } else { 1913 checkEqual(s, t, "toPattern(false)"); 1914 } 1915 1916 s.toPattern(pat, TRUE); 1917 t.applyPattern(pat, ec); 1918 if (U_FAILURE(ec)) { 1919 errln("FAIL: applyPattern"); 1920 return; 1921 } else { 1922 checkEqual(s, t, "toPattern(true)"); 1923 } 1924} 1925 1926void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) { 1927 t.clear(); 1928 UnicodeSetIterator it(s); 1929 if (withRange) { 1930 while (it.nextRange()) { 1931 if (it.isString()) { 1932 t.add(it.getString()); 1933 } else { 1934 t.add(it.getCodepoint(), it.getCodepointEnd()); 1935 } 1936 } 1937 } else { 1938 while (it.next()) { 1939 if (it.isString()) { 1940 t.add(it.getString()); 1941 } else { 1942 t.add(it.getCodepoint()); 1943 } 1944 } 1945 } 1946} 1947 1948UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const 
UnicodeSet& t, const char* message) { 1949 UnicodeString source; s.toPattern(source, TRUE); 1950 UnicodeString result; t.toPattern(result, TRUE); 1951 if (s != t) { 1952 errln((UnicodeString)"FAIL: " + message 1953 + "; source = " + source 1954 + "; result = " + result 1955 ); 1956 return FALSE; 1957 } else { 1958 logln((UnicodeString)"Ok: " + message 1959 + "; source = " + source 1960 + "; result = " + result 1961 ); 1962 } 1963 return TRUE; 1964} 1965 1966void 1967UnicodeSetTest::expectContainment(const UnicodeString& pat, 1968 const UnicodeString& charsIn, 1969 const UnicodeString& charsOut) { 1970 UErrorCode ec = U_ZERO_ERROR; 1971 UnicodeSet set(pat, ec); 1972 if (U_FAILURE(ec)) { 1973 dataerrln((UnicodeString)"FAIL: pattern \"" + 1974 pat + "\" => " + u_errorName(ec)); 1975 return; 1976 } 1977 expectContainment(set, pat, charsIn, charsOut); 1978} 1979 1980void 1981UnicodeSetTest::expectContainment(const UnicodeSet& set, 1982 const UnicodeString& charsIn, 1983 const UnicodeString& charsOut) { 1984 UnicodeString pat; 1985 set.toPattern(pat); 1986 expectContainment(set, pat, charsIn, charsOut); 1987} 1988 1989void 1990UnicodeSetTest::expectContainment(const UnicodeSet& set, 1991 const UnicodeString& setName, 1992 const UnicodeString& charsIn, 1993 const UnicodeString& charsOut) { 1994 UnicodeString bad; 1995 UChar32 c; 1996 int32_t i; 1997 1998 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) { 1999 c = charsIn.char32At(i); 2000 if (!set.contains(c)) { 2001 bad.append(c); 2002 } 2003 } 2004 if (bad.length() > 0) { 2005 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) + 2006 ", expected containment of " + prettify(charsIn)); 2007 } else { 2008 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn)); 2009 } 2010 2011 bad.truncate(0); 2012 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) { 2013 c = charsOut.char32At(i); 2014 if (set.contains(c)) { 2015 bad.append(c); 2016 } 2017 } 2018 if (bad.length() > 0) { 
2019 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) + 2020 ", expected non-containment of " + prettify(charsOut)); 2021 } else { 2022 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut)); 2023 } 2024} 2025 2026void 2027UnicodeSetTest::expectPattern(UnicodeSet& set, 2028 const UnicodeString& pattern, 2029 const UnicodeString& expectedPairs){ 2030 UErrorCode status = U_ZERO_ERROR; 2031 set.applyPattern(pattern, status); 2032 if (U_FAILURE(status)) { 2033 errln(UnicodeString("FAIL: applyPattern(\"") + pattern + 2034 "\") failed"); 2035 return; 2036 } else { 2037 if (getPairs(set) != expectedPairs ) { 2038 errln(UnicodeString("FAIL: applyPattern(\"") + pattern + 2039 "\") => pairs \"" + 2040 escape(getPairs(set)) + "\", expected \"" + 2041 escape(expectedPairs) + "\""); 2042 } else { 2043 logln(UnicodeString("Ok: applyPattern(\"") + pattern + 2044 "\") => pairs \"" + 2045 escape(getPairs(set)) + "\""); 2046 } 2047 } 2048 // the result of calling set.toPattern(), which is the string representation of 2049 // this set(set), is passed to a UnicodeSet constructor, and tested that it 2050 // will produce another set that is equal to this one. 
2051 UnicodeString temppattern; 2052 set.toPattern(temppattern); 2053 UnicodeSet *tempset=new UnicodeSet(temppattern, status); 2054 if (U_FAILURE(status)) { 2055 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern")); 2056 return; 2057 } 2058 if(*tempset != set || getPairs(*tempset) != getPairs(set)){ 2059 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" + 2060 escape(getPairs(set)) + "\"")); 2061 } else{ 2062 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\"")); 2063 } 2064 2065 delete tempset; 2066 2067} 2068 2069void 2070UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) { 2071 if (getPairs(set) != expectedPairs) { 2072 errln(UnicodeString("FAIL: Expected pair list \"") + 2073 escape(expectedPairs) + "\", got \"" + 2074 escape(getPairs(set)) + "\""); 2075 } 2076} 2077 2078void UnicodeSetTest::expectToPattern(const UnicodeSet& set, 2079 const UnicodeString& expPat, 2080 const char** expStrings) { 2081 UnicodeString pat; 2082 set.toPattern(pat, TRUE); 2083 if (pat == expPat) { 2084 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\""); 2085 } else { 2086 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\""); 2087 return; 2088 } 2089 if (expStrings == NULL) { 2090 return; 2091 } 2092 UBool in = TRUE; 2093 for (int32_t i=0; expStrings[i] != NULL; ++i) { 2094 if (expStrings[i] == NOT) { // sic; pointer comparison 2095 in = FALSE; 2096 continue; 2097 } 2098 UnicodeString s = CharsToUnicodeString(expStrings[i]); 2099 UBool contained = set.contains(s); 2100 if (contained == in) { 2101 logln((UnicodeString)"Ok: " + expPat + 2102 (contained ? 
" contains {" : " does not contain {") +
                  escape(expStrings[i]) + "}");
        } else {
            errln((UnicodeString)"FAIL: " + expPat +
                  (contained ? " contains {" : " does not contain {") +
                  escape(expStrings[i]) + "}");
        }
    }
}

// Maps a value 0..15 to a single uppercase hexadecimal digit ('0'..'9', 'A'..'F').
// Despite the name it returns one UChar, not a string.
static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }

// Reports an error with the given message when condition is false.
void
UnicodeSetTest::doAssert(UBool condition, const char *message)
{
    if (!condition) {
        errln(UnicodeString("ERROR : ") + message);
    }
}

/**
 * Returns a copy of s in which every code point outside U+0020..U+007F is
 * replaced by a backslash escape: \uXXXX for code points up to U+FFFF and
 * \UXXXXXXXX for larger ones. Code points inside that range are copied as is.
 */
UnicodeString
UnicodeSetTest::escape(const UnicodeString& s) {
    UnicodeString buf;
    for (int32_t i=0; i<s.length(); )
    {
        UChar32 c = s.char32At(i);
        if (0x0020 <= c && c <= 0x007F) {
            buf += c;
        } else {
            if (c <= 0xFFFF) {
                // Backslash + 'u'; the four low hex digits follow below.
                buf += (UChar)0x5c; buf += (UChar)0x75;
            } else {
                // Backslash + 'U' + the four high hex digits.
                buf += (UChar)0x5c; buf += (UChar)0x55;
                buf += toHexString((c & 0xF0000000) >> 28);
                buf += toHexString((c & 0x0F000000) >> 24);
                buf += toHexString((c & 0x00F00000) >> 20);
                buf += toHexString((c & 0x000F0000) >> 16);
            }
            // The four low hex digits, shared by both escape forms.
            buf += toHexString((c & 0xF000) >> 12);
            buf += toHexString((c & 0x0F00) >> 8);
            buf += toHexString((c & 0x00F0) >> 4);
            buf += toHexString(c & 0x000F);
        }
        // Advance by one or two UTF-16 code units.
        i += U16_LENGTH(c);
    }
    return buf;
}

/**
 * Tests freeze(), isFrozen(), clone() and cloneAsThawed().
 *
 * A frozen copy of idSet is created and then passed through every family of
 * modifying calls; after each family the frozen set must still compare equal
 * to idSet.
 */
void UnicodeSetTest::TestFreezable() {
    UErrorCode errorCode=U_ZERO_ERROR;
    UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
    UnicodeSet idSet(idPattern, errorCode);
    if(U_FAILURE(errorCode)) {
        dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
        return;
    }

    UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
    UnicodeSet wsSet(wsPattern, errorCode);
    if(U_FAILURE(errorCode)) {
        dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
        return;
    }

    // Add the pattern text itself to idSet, so the set under test is more than
    // the plain property set. (Presumably stored as a string element; see
    // UnicodeSet::add(const UnicodeString &).)
    idSet.add(idPattern);
    UnicodeSet frozen(idSet);
    frozen.freeze();

    if(idSet.isFrozen() || !frozen.isFrozen()) {
        errln("FAIL: isFrozen() is wrong");
    }
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: a copy-constructed frozen set differs from its original");
    }

    // Assignment to a frozen set must leave it unchanged.
    frozen=wsSet;
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: a frozen set was modified by operator=");
    }

    UnicodeSet frozen2(frozen);
    if(frozen2!=frozen || frozen2!=idSet) {
        errln("FAIL: a copied frozen set differs from its frozen original");
    }
    if(!frozen2.isFrozen()) {
        errln("FAIL: copy-constructing a frozen set results in a thawed one");
    }
    UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
    if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
        errln("FAIL: UnicodeSet(5, 55) failed");
    }
    frozen3=frozen;
    if(!frozen3.isFrozen()) {
        errln("FAIL: copying a frozen set results in a thawed one");
    }

    // clone() of a frozen set is expected to be frozen as well.
    UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
    if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
        errln("FAIL: clone() failed");
    }
    cloned->add(0xd802, 0xd805);
    // NOTE(review): this reports a failure when the add() took effect, i.e. the
    // frozen clone is expected to stay unchanged; the message wording suggests
    // the opposite.
    if(cloned->containsSome(0xd802, 0xd805)) {
        errln("FAIL: unable to modify clone");
    }
    delete cloned;

    // cloneAsThawed() must return an equal, modifiable set.
    UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
    if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
        errln("FAIL: cloneAsThawed() failed");
    }
    thawed->add(0xd802, 0xd805);
    if(!thawed->contains(0xd802, 0xd805)) {
        errln("FAIL: unable to modify thawed clone");
    }
    delete thawed;

    // From here on: each modifying call on the frozen set must be a no-op.
    frozen.set(5, 55);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::set() modified a frozen set");
    }

    frozen.clear();
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::clear() modified a frozen set");
    }

    frozen.closeOver(USET_CASE_INSENSITIVE);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
    }

    frozen.compact();
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::compact() modified a frozen set");
    }

    ParsePosition pos;
    frozen.
        applyPattern(wsPattern, errorCode).
        applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
        applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
        applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
        applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
    }

    frozen.
        add(0xd800).
        add(0xd802, 0xd805).
        add(wsPattern).
        addAll(idPattern).
        addAll(wsSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
    }

    frozen.
        retain(0x62).
        retain(0x64, 0x69).
        retainAll(wsPattern).
        retainAll(wsSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
    }

    frozen.
        remove(0x62).
        remove(0x64, 0x69).
        remove(idPattern).
        removeAll(idPattern).
        removeAll(idSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
    }

    frozen.
        complement().
        complement(0x62).
        complement(0x64, 0x69).
        complement(idPattern).
        complementAll(idPattern).
        complementAll(idSet);
    if(frozen!=idSet || !(frozen==idSet)) {
        errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
    }
}

// Test span() etc.
-------------------------------------------------------- ***

// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
// Returns 0 when the conversion fails.
static int32_t
appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
    UErrorCode errorCode=U_ZERO_ERROR;
    int32_t length8=0;
    u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
    if(U_SUCCESS(errorCode)) {
        return length8;
    } else {
        // The string contains an unpaired surrogate.
        // Ignore this string.
        // NOTE(review): every u_strToUTF8() failure takes this branch, so an
        // insufficient capacity would presumably be reported the same way.
        return 0;
    }
}

class UnicodeSetWithStringsIterator;

// Make the strings in a UnicodeSet easily accessible.
// Keeps a reference to the original set (the set must outlive this object),
// pointers to at most UPRV_LENGTHOF(strings) of its string elements, and
// their UTF-8 forms stored back to back in utf8[].
class UnicodeSetWithStrings {
public:
    UnicodeSetWithStrings(const UnicodeSet &normalSet) :
            set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
        int32_t size=set.size();
        if(size>0 && set.charAt(size-1)<0) {
            // If a set's last element is not a code point, then it must contain strings.
            // Iterate over the set, skip all code point ranges, and cache the strings.
            // Convert them to UTF-8 for spanUTF8().
            UnicodeSetIterator iter(set);
            const UnicodeString *s;
            char *s8=utf8;
            int32_t length8, utf8Count=0;
            while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
                if(iter.isString()) {
                    // Store the pointer to the set's string element
                    // which we happen to know is a stable pointer.
                    strings[stringsLength]=s=&iter.getString();
                    // Chained assignment: the converted length is stored in
                    // length8 and utf8Lengths[], then added to utf8Count.
                    utf8Count+=
                        utf8Lengths[stringsLength]=length8=
                        appendUTF8(s->getBuffer(), s->length(),
                                   s8, (int32_t)(sizeof(utf8)-utf8Count));
                    if(length8==0) {
                        hasSurrogates=TRUE;  // Contains unpaired surrogates.
                    }
                    s8+=length8;
                    ++stringsLength;
                }
            }
        }
    }

    // The wrapped set, including its code points.
    const UnicodeSet &getSet() const {
        return set;
    }

    // TRUE if at least one string element was cached.
    UBool hasStrings() const {
        return (UBool)(stringsLength>0);
    }

    // TRUE if at least one cached string could not be converted to UTF-8.
    UBool hasStringsWithSurrogates() const {
        return hasSurrogates;
    }

private:
    friend class UnicodeSetWithStringsIterator;

    const UnicodeSet &set;

    const UnicodeString *strings[20];
    int32_t stringsLength;
    UBool hasSurrogates;

    // UTF-8 forms of strings[], concatenated; utf8Lengths[i] is the byte
    // length of the i-th one (0 if its conversion failed).
    char utf8[1024];
    int32_t utf8Lengths[20];
};

// Iterates over the strings cached in a UnicodeSetWithStrings,
// either as UnicodeString pointers or as UTF-8 byte sequences.
class UnicodeSetWithStringsIterator {
public:
    UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
            fSet(set), nextStringIndex(0), nextUTF8Start(0) {
    }

    // Restart the iteration from the first string.
    void reset() {
        nextStringIndex=nextUTF8Start=0;
    }

    // Returns the next cached string, or NULL when there are no more.
    const UnicodeString *nextString() {
        if(nextStringIndex<fSet.stringsLength) {
            return fSet.strings[nextStringIndex++];
        } else {
            return NULL;
        }
    }

    // Do not mix with calls to nextString().
    // Returns the next UTF-8 string (not NUL-terminated) and sets length to
    // its byte length; returns NULL with length=0 when there are no more.
    const char *nextUTF8(int32_t &length) {
        if(nextStringIndex<fSet.stringsLength) {
            const char *s8=fSet.utf8+nextUTF8Start;
            nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
            return s8;
        } else {
            length=0;
            return NULL;
        }
    }

private:
    const UnicodeSetWithStrings &fSet;
    int32_t nextStringIndex;
    int32_t nextUTF8Start;
};

// Compare 16-bit Unicode strings (which may be malformed UTF-16)
// at code point boundaries.
// That is, each edge of a match must not be in the middle of a surrogate pair.
// Returns TRUE if t occurs in s at offset start and neither edge of the
// occurrence falls between a lead and a trail surrogate.
// limit is the end offset of s; it bounds the look-ahead past the match.
// The caller must have checked that t fits between start and limit.
static inline UBool
matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
    s+=start;
    limit-=start;
    int32_t length=t.length();
    return 0==t.compare(s, length) &&
           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
}

// Implement span() with contains() for comparison.
// Returns the length of the span at the start of s[0..length) for the given
// span condition. Three cases:
// - set without strings: plain code point loop;
// - USET_SPAN_NOT_CONTAINED: stop at the first contained code point or at
//   the first position where a set string matches;
// - USET_SPAN_CONTAINED / USET_SPAN_SIMPLE: follow code points and string
//   matches; CONTAINED recurses over alternative matches, SIMPLE takes the
//   longest match at each position.
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev trails start by one code point, so that the break below
        // leaves prev at the first code point that does not fit.
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit: the furthest limit reached by any recursive alternative.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Result: the better of the iterated span and the recursive alternatives.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}

// Backward counterpart of containsSpanUTF16(): returns the start offset of
// the span that ends at s+length.
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
}

        UChar32 c;
        int32_t prev=length;
        // length walks backward one code point at a time; prev follows it
        // only after the code point was accepted.
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // length0 keeps the original text length for matches16CPB().
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                // Does a set string end exactly at prev?
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart: the earliest start reached by any recursive alternative.
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Result: the better of the iterated span and the recursive alternatives.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}

// UTF-8 counterpart of containsSpanUTF16(): same algorithm, but the text is
// decoded with U8_NEXT_OR_FFFD and the set strings are compared bytewise
// against their cached UTF-8 forms. Strings with a cached length of 0
// (failed conversion) are skipped.
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        while((prev=start)<length) {
            U8_NEXT_OR_FFFD(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Result: the better of the iterated span and the recursive alternatives.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}

// Backward UTF-8 reference implementation: returns the start offset of the
// span that ends at s+length. Same structure as the UTF-16 backward version,
// using U8_PREV_OR_FFFD and bytewise comparison of the cached UTF-8 strings.
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                // Does a set string end exactly at prev?
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Result: the better of the iterated span and the recursive alternatives.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}

// spans to be performed and compared
// Bit flags for whichSpans; each group's last entry is the mask of that group.
enum {
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =3,

    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =0xc,

    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =0x30,

    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =0x300,

    SPAN_ALL            =0x33f
};

// Toggles between USET_SPAN_NOT_CONTAINED and the given "contained" flavor
// (USET_SPAN_CONTAINED or USET_SPAN_SIMPLE).
static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
    return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
}

// Length of a NUL-terminated string: in UChars if isUTF16, otherwise in bytes.
static inline int32_t slen(const void *s, UBool isUTF16) {
    return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
}

/*
 * Count spans on a string with the method according to type and set the span limits.
 * The set may be the complement of the original.
 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
 * according to the expected number of spans.
 * Sets typeName to an empty string if there is no such type.
 * Returns -1 if the span option is filtered out.
2851 */ 2852static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement, 2853 const void *s, int32_t length, UBool isUTF16, 2854 uint32_t whichSpans, 2855 int type, const char *&typeName, 2856 int32_t limits[], int32_t limitsCapacity, 2857 int32_t expectCount) { 2858 const UnicodeSet &realSet(set.getSet()); 2859 int32_t start, count; 2860 USetSpanCondition spanCondition, firstSpanCondition, contained; 2861 UBool isForward; 2862 2863 if(type<0 || 7<type) { 2864 typeName=""; 2865 return 0; 2866 } 2867 2868 static const char *const typeNames16[]={ 2869 "contains", "contains(LM)", 2870 "span", "span(LM)", 2871 "containsBack", "containsBack(LM)", 2872 "spanBack", "spanBack(LM)" 2873 }; 2874 2875 static const char *const typeNames8[]={ 2876 "containsUTF8", "containsUTF8(LM)", 2877 "spanUTF8", "spanUTF8(LM)", 2878 "containsBackUTF8", "containsBackUTF8(LM)", // not implemented 2879 "spanBackUTF8", "spanBackUTF8(LM)" 2880 }; 2881 2882 typeName= isUTF16 ? typeNames16[type] : typeNames8[type]; 2883 2884 // filter span options 2885 if(type<=3) { 2886 // span forward 2887 if((whichSpans&SPAN_FWD)==0) { 2888 return -1; 2889 } 2890 isForward=TRUE; 2891 } else { 2892 // span backward 2893 if((whichSpans&SPAN_BACK)==0) { 2894 return -1; 2895 } 2896 isForward=FALSE; 2897 } 2898 if((type&1)==0) { 2899 // use USET_SPAN_CONTAINED 2900 if((whichSpans&SPAN_CONTAINED)==0) { 2901 return -1; 2902 } 2903 contained=USET_SPAN_CONTAINED; 2904 } else { 2905 // use USET_SPAN_SIMPLE 2906 if((whichSpans&SPAN_SIMPLE)==0) { 2907 return -1; 2908 } 2909 contained=USET_SPAN_SIMPLE; 2910 } 2911 2912 // Default first span condition for going forward with an uncomplemented set. 2913 spanCondition=USET_SPAN_NOT_CONTAINED; 2914 if(isComplement) { 2915 spanCondition=invertSpanCondition(spanCondition, contained); 2916 } 2917 2918 // First span condition for span(), used to terminate the spanBack() iteration. 
2919 firstSpanCondition=spanCondition; 2920 2921 // spanBack(): Its initial span condition is span()'s last span condition, 2922 // which is the opposite of span()'s first span condition 2923 // if we expect an even number of spans. 2924 // (The loop inverts spanCondition (expectCount-1) times 2925 // before the expectCount'th span() call.) 2926 // If we do not compare forward and backward directions, then we do not have an 2927 // expectCount and just start with firstSpanCondition. 2928 if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) { 2929 spanCondition=invertSpanCondition(spanCondition, contained); 2930 } 2931 2932 count=0; 2933 switch(type) { 2934 case 0: 2935 case 1: 2936 start=0; 2937 if(length<0) { 2938 length=slen(s, isUTF16); 2939 } 2940 for(;;) { 2941 start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) : 2942 containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition); 2943 if(count<limitsCapacity) { 2944 limits[count]=start; 2945 } 2946 ++count; 2947 if(start>=length) { 2948 break; 2949 } 2950 spanCondition=invertSpanCondition(spanCondition, contained); 2951 } 2952 break; 2953 case 2: 2954 case 3: 2955 start=0; 2956 for(;;) { 2957 start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) : 2958 realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition); 2959 if(count<limitsCapacity) { 2960 limits[count]=start; 2961 } 2962 ++count; 2963 if(length>=0 ? start>=length : 2964 isUTF16 ? ((const UChar *)s)[start]==0 : 2965 ((const char *)s)[start]==0 2966 ) { 2967 break; 2968 } 2969 spanCondition=invertSpanCondition(spanCondition, contained); 2970 } 2971 break; 2972 case 4: 2973 case 5: 2974 if(length<0) { 2975 length=slen(s, isUTF16); 2976 } 2977 for(;;) { 2978 ++count; 2979 if(count<=limitsCapacity) { 2980 limits[limitsCapacity-count]=length; 2981 } 2982 length= isUTF16 ? 
containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) : 2983 containsSpanBackUTF8(set, (const char *)s, length, spanCondition); 2984 if(length==0 && spanCondition==firstSpanCondition) { 2985 break; 2986 } 2987 spanCondition=invertSpanCondition(spanCondition, contained); 2988 } 2989 if(count<limitsCapacity) { 2990 memmove(limits, limits+(limitsCapacity-count), count*4); 2991 } 2992 break; 2993 case 6: 2994 case 7: 2995 for(;;) { 2996 ++count; 2997 if(count<=limitsCapacity) { 2998 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16); 2999 } 3000 // Note: Length<0 is tested only for the first spanBack(). 3001 // If we wanted to keep length<0 for all spanBack()s, we would have to 3002 // temporarily modify the string by placing a NUL where the previous spanBack() stopped. 3003 length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) : 3004 realSet.spanBackUTF8((const char *)s, length, spanCondition); 3005 if(length==0 && spanCondition==firstSpanCondition) { 3006 break; 3007 } 3008 spanCondition=invertSpanCondition(spanCondition, contained); 3009 } 3010 if(count<limitsCapacity) { 3011 memmove(limits, limits+(limitsCapacity-count), count*4); 3012 } 3013 break; 3014 default: 3015 typeName=""; 3016 return -1; 3017 } 3018 3019 return count; 3020} 3021 3022// sets to be tested; odd index=isComplement 3023enum { 3024 SLOW, 3025 SLOW_NOT, 3026 FAST, 3027 FAST_NOT, 3028 SET_COUNT 3029}; 3030 3031static const char *const setNames[SET_COUNT]={ 3032 "slow", 3033 "slow.not", 3034 "fast", 3035 "fast.not" 3036}; 3037 3038/* 3039 * Verify that we get the same results whether we look at text with contains(), 3040 * span() or spanBack(), using unfrozen or frozen versions of the set, 3041 * and using the set or its complement (switching the spanConditions accordingly). 3042 * The latter verifies that 3043 * set.span(spanCondition) == set.complement().span(!spanCondition). 
3044 * 3045 * The expectLimits[] are either provided by the caller (with expectCount>=0) 3046 * or returned to the caller (with an input expectCount<0). 3047 */ 3048void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4], 3049 const void *s, int32_t length, UBool isUTF16, 3050 uint32_t whichSpans, 3051 int32_t expectLimits[], int32_t &expectCount, 3052 const char *testName, int32_t index) { 3053 int32_t limits[500]; 3054 int32_t limitsCount; 3055 int i, j; 3056 3057 const char *typeName; 3058 int type; 3059 3060 for(i=0; i<SET_COUNT; ++i) { 3061 if((i&1)==0) { 3062 // Even-numbered sets are original, uncomplemented sets. 3063 if((whichSpans&SPAN_SET)==0) { 3064 continue; 3065 } 3066 } else { 3067 // Odd-numbered sets are complemented. 3068 if((whichSpans&SPAN_COMPLEMENT)==0) { 3069 continue; 3070 } 3071 } 3072 for(type=0;; ++type) { 3073 limitsCount=getSpans(*sets[i], (UBool)(i&1), 3074 s, length, isUTF16, 3075 whichSpans, 3076 type, typeName, 3077 limits, UPRV_LENGTHOF(limits), expectCount); 3078 if(typeName[0]==0) { 3079 break; // All types tried. 3080 } 3081 if(limitsCount<0) { 3082 continue; // Span option filtered out. 
3083 } 3084 if(expectCount<0) { 3085 expectCount=limitsCount; 3086 if(limitsCount>UPRV_LENGTHOF(limits)) { 3087 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans", 3088 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits)); 3089 return; 3090 } 3091 memcpy(expectLimits, limits, limitsCount*4); 3092 } else if(limitsCount!=expectCount) { 3093 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld", 3094 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount); 3095 } else { 3096 for(j=0; j<limitsCount; ++j) { 3097 if(limits[j]!=expectLimits[j]) { 3098 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld", 3099 testName, (long)index, setNames[i], typeName, (long)limitsCount, 3100 j, (long)limits[j], (long)expectLimits[j]); 3101 break; 3102 } 3103 } 3104 } 3105 } 3106 } 3107 3108 // Compare span() with containsAll()/containsNone(), 3109 // but only if we have expectLimits[] from the uncomplemented set. 
3110 if(isUTF16 && (whichSpans&SPAN_SET)!=0) { 3111 const UChar *s16=(const UChar *)s; 3112 UnicodeString string; 3113 int32_t prev=0, limit, length; 3114 for(i=0; i<expectCount; ++i) { 3115 limit=expectLimits[i]; 3116 length=limit-prev; 3117 if(length>0) { 3118 string.setTo(FALSE, s16+prev, length); // read-only alias 3119 if(i&1) { 3120 if(!sets[SLOW]->getSet().containsAll(string)) { 3121 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()", 3122 testName, (long)index, setNames[SLOW], (long)prev, (long)limit); 3123 return; 3124 } 3125 if(!sets[FAST]->getSet().containsAll(string)) { 3126 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()", 3127 testName, (long)index, setNames[FAST], (long)prev, (long)limit); 3128 return; 3129 } 3130 } else { 3131 if(!sets[SLOW]->getSet().containsNone(string)) { 3132 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()", 3133 testName, (long)index, setNames[SLOW], (long)prev, (long)limit); 3134 return; 3135 } 3136 if(!sets[FAST]->getSet().containsNone(string)) { 3137 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()", 3138 testName, (long)index, setNames[FAST], (long)prev, (long)limit); 3139 return; 3140 } 3141 } 3142 } 3143 prev=limit; 3144 } 3145 } 3146} 3147 3148// Specifically test either UTF-16 or UTF-8. 
3149void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4], 3150 const void *s, int32_t length, UBool isUTF16, 3151 uint32_t whichSpans, 3152 const char *testName, int32_t index) { 3153 int32_t expectLimits[500]; 3154 int32_t expectCount=-1; 3155 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index); 3156} 3157 3158UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) { 3159 UChar c, c2; 3160 3161 if(length>=0) { 3162 while(length>0) { 3163 c=*s++; 3164 --length; 3165 if(0xd800<=c && c<0xe000) { 3166 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) { 3167 return TRUE; 3168 } 3169 --length; 3170 } 3171 } 3172 } else { 3173 while((c=*s++)!=0) { 3174 if(0xd800<=c && c<0xe000) { 3175 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) { 3176 return TRUE; 3177 } 3178 } 3179 } 3180 } 3181 return FALSE; 3182} 3183 3184// Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text, 3185// unless either UTF is turned off in whichSpans. 3186// Testing UTF-16 and UTF-8 together requires that surrogate code points 3187// have the same contains(c) value as U+FFFD. 3188void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4], 3189 const UChar *s16, int32_t length16, 3190 uint32_t whichSpans, 3191 const char *testName, int32_t index) { 3192 int32_t expectLimits[500]; 3193 int32_t expectCount; 3194 3195 expectCount=-1; // Get expectLimits[] from testSpan(). 3196 3197 if((whichSpans&SPAN_UTF16)!=0) { 3198 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index); 3199 } 3200 if((whichSpans&SPAN_UTF8)==0) { 3201 return; 3202 } 3203 3204 // Convert s16[] and expectLimits[] to UTF-8. 
3205 uint8_t s8[3000]; 3206 int32_t offsets[3000]; 3207 3208 const UChar *s16Limit=s16+length16; 3209 char *t=(char *)s8; 3210 char *tLimit=t+sizeof(s8); 3211 int32_t *o=offsets; 3212 UErrorCode errorCode=U_ZERO_ERROR; 3213 3214 // Convert with substitution: Turn unpaired surrogates into U+FFFD. 3215 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode); 3216 if(U_FAILURE(errorCode)) { 3217 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s", 3218 testName, (long)index, u_errorName(errorCode)); 3219 ucnv_resetFromUnicode(utf8Cnv); 3220 return; 3221 } 3222 int32_t length8=(int32_t)(t-(char *)s8); 3223 3224 // Convert expectLimits[]. 3225 int32_t i, j, expect; 3226 for(i=j=0; i<expectCount; ++i) { 3227 expect=expectLimits[i]; 3228 if(expect==length16) { 3229 expectLimits[i]=length8; 3230 } else { 3231 while(offsets[j]<expect) { 3232 ++j; 3233 } 3234 expectLimits[i]=j; 3235 } 3236 } 3237 3238 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index); 3239} 3240 3241static UChar32 nextCodePoint(UChar32 c) { 3242 // Skip some large and boring ranges. 3243 switch(c) { 3244 case 0x3441: 3245 return 0x4d7f; 3246 case 0x5100: 3247 return 0x9f00; 3248 case 0xb040: 3249 return 0xd780; 3250 case 0xe041: 3251 return 0xf8fe; 3252 case 0x10100: 3253 return 0x20000; 3254 case 0x20041: 3255 return 0xe0000; 3256 case 0xe0101: 3257 return 0x10fffd; 3258 default: 3259 return c+1; 3260 } 3261} 3262 3263// Verify that all implementations represent the same set. 3264void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3265 // contains(U+FFFD) is inconsistent with contains(some surrogates), 3266 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8: 3267 // Skip the UTF-8 part of the test - if the string contains surrogates - 3268 // because it is likely to produce a different result. 
3269 UBool inconsistentSurrogates= 3270 (!(sets[0]->getSet().contains(0xfffd) ? 3271 sets[0]->getSet().contains(0xd800, 0xdfff) : 3272 sets[0]->getSet().containsNone(0xd800, 0xdfff)) || 3273 sets[0]->hasStringsWithSurrogates()); 3274 3275 UChar s[1000]; 3276 int32_t length=0; 3277 uint32_t localWhichSpans; 3278 3279 UChar32 c, first; 3280 for(first=c=0;; c=nextCodePoint(c)) { 3281 if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) { 3282 localWhichSpans=whichSpans; 3283 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) { 3284 localWhichSpans&=~SPAN_UTF8; 3285 } 3286 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first); 3287 if(c>0x10ffff) { 3288 break; 3289 } 3290 length=0; 3291 first=c; 3292 } 3293 U16_APPEND_UNSAFE(s, length, c); 3294 } 3295} 3296 3297// Test with a particular, interesting string. 3298// Specify length and try NUL-termination. 3299void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3300 static const UChar s[]={ 3301 0x61, 0x62, 0x20, // Latin, space 3302 0x3b1, 0x3b2, 0x3b3, // Greek 3303 0xd900, // lead surrogate 3304 0x3000, 0x30ab, 0x30ad, // wide space, Katakana 3305 0xdc05, // trail surrogate 3306 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul 3307 0xd900, 0xdc05, // unassigned supplementary 3308 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary 3309 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS 3310 0 // NUL 3311 }; 3312 3313 if((whichSpans&SPAN_UTF16)==0) { 3314 return; 3315 } 3316 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0); 3317 testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1); 3318} 3319 3320void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3321 static const char s[]={ 3322 "abc" // Latin 3323 3324 /* trail byte in lead position */ 3325 "\x80" 3326 3327 " " // space 3328 
3329 /* truncated multi-byte sequences */ 3330 "\xd0" 3331 "\xe0" 3332 "\xe1" 3333 "\xed" 3334 "\xee" 3335 "\xf0" 3336 "\xf1" 3337 "\xf4" 3338 "\xf8" 3339 "\xfc" 3340 3341 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek 3342 3343 /* trail byte in lead position */ 3344 "\x80" 3345 3346 "\xe0\x80" 3347 "\xe0\xa0" 3348 "\xe1\x80" 3349 "\xed\x80" 3350 "\xed\xa0" 3351 "\xee\x80" 3352 "\xf0\x80" 3353 "\xf0\x90" 3354 "\xf1\x80" 3355 "\xf4\x80" 3356 "\xf4\x90" 3357 "\xf8\x80" 3358 "\xfc\x80" 3359 3360 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana 3361 3362 /* trail byte in lead position */ 3363 "\x80" 3364 3365 "\xf0\x80\x80" 3366 "\xf0\x90\x80" 3367 "\xf1\x80\x80" 3368 "\xf4\x80\x80" 3369 "\xf4\x90\x80" 3370 "\xf8\x80\x80" 3371 "\xfc\x80\x80" 3372 3373 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul 3374 3375 /* trail byte in lead position */ 3376 "\x80" 3377 3378 "\xf8\x80\x80\x80" 3379 "\xfc\x80\x80\x80" 3380 3381 "\xF1\x90\x80\x85" // unassigned supplementary 3382 3383 /* trail byte in lead position */ 3384 "\x80" 3385 3386 "\xfc\x80\x80\x80\x80" 3387 3388 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary 3389 3390 /* trail byte in lead position */ 3391 "\x80" 3392 3393 /* complete sequences but non-shortest forms or out of range etc. */ 3394 "\xc0\x80" 3395 "\xe0\x80\x80" 3396 "\xed\xa0\x80" 3397 "\xf0\x80\x80\x80" 3398 "\xf4\x90\x80\x80" 3399 "\xf8\x80\x80\x80\x80" 3400 "\xfc\x80\x80\x80\x80\x80" 3401 "\xfe" 3402 "\xff" 3403 3404 /* trail byte in lead position */ 3405 "\x80" 3406 3407 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated 3408 }; 3409 3410 if((whichSpans&SPAN_UTF8)==0) { 3411 return; 3412 } 3413 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0); 3414 testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1); 3415} 3416 3417// Take a set of span options and multiply them so that 3418// each portion only has one of the options a, b and c. 
3419// If b==0, then the set of options is just modified with mask and a. 3420// If b!=0 and c==0, then the set of options is just modified with mask, a and b. 3421static int32_t 3422addAlternative(uint32_t whichSpans[], int32_t whichSpansCount, 3423 uint32_t mask, uint32_t a, uint32_t b, uint32_t c) { 3424 uint32_t s; 3425 int32_t i; 3426 3427 for(i=0; i<whichSpansCount; ++i) { 3428 s=whichSpans[i]&mask; 3429 whichSpans[i]=s|a; 3430 if(b!=0) { 3431 whichSpans[whichSpansCount+i]=s|b; 3432 if(c!=0) { 3433 whichSpans[2*whichSpansCount+i]=s|c; 3434 } 3435 } 3436 } 3437 return b==0 ? whichSpansCount : c==0 ? 2*whichSpansCount : 3*whichSpansCount; 3438} 3439 3440#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 3441#define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 3442#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" 3443#define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" 3444 3445void UnicodeSetTest::TestSpan() { 3446 // "[...]" is a UnicodeSet pattern. 3447 // "*" performs tests on all Unicode code points and on a selection of 3448 // malformed UTF-8/16 strings. 3449 // "-options" limits the scope of testing for the current set. 3450 // By default, the test verifies that equivalent boundaries are found 3451 // for UTF-16 and UTF-8, going forward and backward, 3452 // alternating USET_SPAN_NOT_CONTAINED with 3453 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE. 3454 // Single-character options: 3455 // 8 -- UTF-16 and UTF-8 boundaries may differ. 3456 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates), 3457 // or the set contains strings with unpaired surrogates 3458 // which do not translate to valid UTF-8. 3459 // c -- set.span() and set.complement().span() boundaries may differ. 3460 // Cause: Set strings are not complemented. 3461 // b -- span() and spanBack() boundaries may differ. 
3462 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED) 3463 // and spanBack(USET_SPAN_SIMPLE) are defined to 3464 // match with non-overlapping substrings. 3465 // For example, with a set containing "ab" and "ba", 3466 // span() of "aba" yields boundaries { 0, 2, 3 } 3467 // because the initial "ab" matches from 0 to 2, 3468 // while spanBack() yields boundaries { 0, 1, 3 } 3469 // because the final "ba" matches from 1 to 3. 3470 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ. 3471 // Cause: Strings in the set overlap, and a longer match may 3472 // require a sequence including non-longest substrings. 3473 // For example, with a set containing "ab", "abc" and "cd", 3474 // span(contained) of "abcd" spans the entire string 3475 // but span(longest match) only spans the first 3 characters. 3476 // Each "-options" first resets all options and then applies the specified options. 3477 // A "-" without options resets the options. 3478 // The options are also reset for each new set. 3479 // Other strings will be spanned. 3480 static const char *const testdata[]={ 3481 "[:ID_Continue:]", 3482 "*", 3483 "[:White_Space:]", 3484 "*", 3485 "[]", 3486 "*", 3487 "[\\u0000-\\U0010FFFF]", 3488 "*", 3489 "[\\u0000\\u0080\\u0800\\U00010000]", 3490 "*", 3491 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]", 3492 "*", 3493 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]", 3494 "-c", 3495 "*", 3496 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]", 3497 "-c", 3498 "*", 3499 3500 // Overlapping strings cause overlapping attempts to match. 3501 "[x{xy}{xya}{axy}{ax}]", 3502 "-cl", 3503 3504 // More repetitions of "xya" would take too long with the recursive 3505 // reference implementation. 3506 // containsAll()=FALSE 3507 // test_string 0x14 3508 "xx" 3509 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here. 
3510 "xx" // set.complement().span(contained) will stop between the two 'x'es. 3511 "xyaxyaxyaxya" 3512 "xx" 3513 "xyaxyaxyaxya" // span() ends here. 3514 "aaa", 3515 3516 // containsAll()=TRUE 3517 // test_string 0x15 3518 "xx" 3519 "xyaxyaxyaxya" 3520 "xx" 3521 "xyaxyaxyaxya" 3522 "xx" 3523 "xyaxyaxyaxy", 3524 3525 "-bc", 3526 // test_string 0x17 3527 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 } 3528 "-c", 3529 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 } 3530 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 } 3531 "-", 3532 "byaya", // span() -> { 5 } 3533 "byay", // span() -> { 4 } 3534 "bya", // span() -> { 3 } 3535 3536 // span(longest match) will not span the whole string. 3537 "[a{ab}{bc}]", 3538 "-cl", 3539 // test_string 0x21 3540 "abc", 3541 3542 "[a{ab}{abc}{cd}]", 3543 "-cl", 3544 "acdabcdabccd", 3545 3546 // spanBack(longest match) will not span the whole string. 3547 "[c{ab}{bc}]", 3548 "-cl", 3549 "abc", 3550 3551 "[d{cd}{bcd}{ab}]", 3552 "-cl", 3553 "abbcdabcdabd", 3554 3555 // Test with non-ASCII set strings - test proper handling of surrogate pairs 3556 // and UTF-8 trail bytes. 3557 // Copies of above test sets and strings, but transliterated to have 3558 // different code points with similar trail units. 3559 // Previous: a b c d 3560 // Unicode: 042B 30AB 200AB 204AB 3561 // UTF-16: 042B 30AB D840 DCAB D841 DCAB 3562 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB 3563 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]", 3564 "-cl", 3565 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB", 3566 3567 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]", 3568 "-cl", 3569 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB", 3570 3571 // Stress bookkeeping and recursion. 
3572 // The following strings are barely doable with the recursive 3573 // reference implementation. 3574 // The not-contained character at the end prevents an early exit from the span(). 3575 "[b{bb}]", 3576 "-c", 3577 // test_string 0x33 3578 "bbbbbbbbbbbbbbbbbbbbbbbb-", 3579 // On complement sets, span() and spanBack() get different results 3580 // because b is not in the complement set and there is an odd number of b's 3581 // in the test string. 3582 "-bc", 3583 "bbbbbbbbbbbbbbbbbbbbbbbbb-", 3584 3585 // Test with set strings with an initial or final code point span 3586 // longer than 254. 3587 "[a{" _64_a _64_a _64_a _64_a "b}" 3588 "{a" _64_b _64_b _64_b _64_b "}]", 3589 "-c", 3590 _64_a _64_a _64_a _63_a "b", 3591 _64_a _64_a _64_a _64_a "b", 3592 _64_a _64_a _64_a _64_a "aaaabbbb", 3593 "a" _64_b _64_b _64_b _63_b, 3594 "a" _64_b _64_b _64_b _64_b, 3595 "aaaabbbb" _64_b _64_b _64_b _64_b, 3596 3597 // Test with strings containing unpaired surrogates. 3598 // They are not representable in UTF-8, and a leading trail surrogate 3599 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair. 3600 // U+20001 == \\uD840\\uDC01 3601 // U+20400 == \\uD841\\uDC00 3602 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]", 3603 "-8cl", 3604 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb" 3605 }; 3606 uint32_t whichSpans[96]={ SPAN_ALL }; 3607 int32_t whichSpansCount=1; 3608 3609 UnicodeSet *sets[SET_COUNT]={ NULL }; 3610 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL }; 3611 3612 char testName[1024]; 3613 char *testNameLimit=testName; 3614 3615 int32_t i, j; 3616 for(i=0; i<UPRV_LENGTHOF(testdata); ++i) { 3617 const char *s=testdata[i]; 3618 if(s[0]=='[') { 3619 // Create new test sets from this pattern. 
3620 for(j=0; j<SET_COUNT; ++j) { 3621 delete sets_with_str[j]; 3622 delete sets[j]; 3623 } 3624 UErrorCode errorCode=U_ZERO_ERROR; 3625 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode); 3626 if(U_FAILURE(errorCode)) { 3627 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode)); 3628 break; 3629 } 3630 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]); 3631 sets[SLOW_NOT]->complement(); 3632 // Intermediate set: Test cloning of a frozen set. 3633 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]); 3634 fast->freeze(); 3635 sets[FAST]=(UnicodeSet *)fast->clone(); 3636 delete fast; 3637 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]); 3638 fastNot->freeze(); 3639 sets[FAST_NOT]=(UnicodeSet *)fastNot->clone(); 3640 delete fastNot; 3641 3642 for(j=0; j<SET_COUNT; ++j) { 3643 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]); 3644 } 3645 3646 strcpy(testName, s); 3647 testNameLimit=strchr(testName, 0); 3648 *testNameLimit++=':'; 3649 *testNameLimit=0; 3650 3651 whichSpans[0]=SPAN_ALL; 3652 whichSpansCount=1; 3653 } else if(s[0]=='-') { 3654 whichSpans[0]=SPAN_ALL; 3655 whichSpansCount=1; 3656 3657 while(*++s!=0) { 3658 switch(*s) { 3659 case 'c': 3660 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3661 ~SPAN_POLARITY, 3662 SPAN_SET, 3663 SPAN_COMPLEMENT, 3664 0); 3665 break; 3666 case 'b': 3667 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3668 ~SPAN_DIRS, 3669 SPAN_FWD, 3670 SPAN_BACK, 3671 0); 3672 break; 3673 case 'l': 3674 // test USET_SPAN_CONTAINED FWD & BACK, and separately 3675 // USET_SPAN_SIMPLE only FWD, and separately 3676 // USET_SPAN_SIMPLE only BACK 3677 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3678 ~(SPAN_DIRS|SPAN_CONDITION), 3679 SPAN_DIRS|SPAN_CONTAINED, 3680 SPAN_FWD|SPAN_SIMPLE, 3681 SPAN_BACK|SPAN_SIMPLE); 3682 break; 3683 case '8': 3684 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3685 ~SPAN_UTFS, 3686 SPAN_UTF16, 3687 SPAN_UTF8, 3688 
0); 3689 break; 3690 default: 3691 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]); 3692 break; 3693 } 3694 } 3695 } else if(0==strcmp(s, "*")) { 3696 strcpy(testNameLimit, "bad_string"); 3697 for(j=0; j<whichSpansCount; ++j) { 3698 if(whichSpansCount>1) { 3699 sprintf(testNameLimit+10 /* strlen("bad_string") */, 3700 "%%0x%3x", 3701 whichSpans[j]); 3702 } 3703 testSpanUTF16String(sets_with_str, whichSpans[j], testName); 3704 testSpanUTF8String(sets_with_str, whichSpans[j], testName); 3705 } 3706 3707 strcpy(testNameLimit, "contents"); 3708 for(j=0; j<whichSpansCount; ++j) { 3709 if(whichSpansCount>1) { 3710 sprintf(testNameLimit+8 /* strlen("contents") */, 3711 "%%0x%3x", 3712 whichSpans[j]); 3713 } 3714 testSpanContents(sets_with_str, whichSpans[j], testName); 3715 } 3716 } else { 3717 UnicodeString string=UnicodeString(s, -1, US_INV).unescape(); 3718 strcpy(testNameLimit, "test_string"); 3719 for(j=0; j<whichSpansCount; ++j) { 3720 if(whichSpansCount>1) { 3721 sprintf(testNameLimit+11 /* strlen("test_string") */, 3722 "%%0x%3x", 3723 whichSpans[j]); 3724 } 3725 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i); 3726 } 3727 } 3728 } 3729 for(j=0; j<SET_COUNT; ++j) { 3730 delete sets_with_str[j]; 3731 delete sets[j]; 3732 } 3733} 3734 3735// Test select patterns and strings, and test USET_SPAN_SIMPLE. 
3736void UnicodeSetTest::TestStringSpan() { 3737 static const char *pattern="[x{xy}{xya}{axy}{ax}]"; 3738 static const char *const string= 3739 "xx" 3740 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" 3741 "xx" 3742 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" 3743 "xx" 3744 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy" 3745 "aaaa"; 3746 3747 UErrorCode errorCode=U_ZERO_ERROR; 3748 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV); 3749 UnicodeSet set(pattern16, errorCode); 3750 if(U_FAILURE(errorCode)) { 3751 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3752 return; 3753 } 3754 3755 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape(); 3756 3757 if(set.containsAll(string16)) { 3758 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string); 3759 } 3760 3761 // Remove trailing "aaaa". 3762 string16.truncate(string16.length()-4); 3763 if(!set.containsAll(string16)) { 3764 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string); 3765 } 3766 3767 string16=UNICODE_STRING_SIMPLE("byayaxya"); 3768 const UChar *s16=string16.getBuffer(); 3769 int32_t length16=string16.length(); 3770 (void)length16; // Suppress set but not used warning. 
3771 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 || 3772 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 || 3773 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 || 3774 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 || 3775 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 || 3776 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3 3777 ) { 3778 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern); 3779 } 3780 3781 pattern="[a{ab}{abc}{cd}]"; 3782 pattern16=UnicodeString(pattern, -1, US_INV); 3783 set.applyPattern(pattern16, errorCode); 3784 if(U_FAILURE(errorCode)) { 3785 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3786 return; 3787 } 3788 string16=UNICODE_STRING_SIMPLE("acdabcdabccd"); 3789 s16=string16.getBuffer(); 3790 length16=string16.length(); 3791 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 || 3792 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 || 3793 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5 3794 ) { 3795 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern); 3796 } 3797 3798 pattern="[d{cd}{bcd}{ab}]"; 3799 pattern16=UnicodeString(pattern, -1, US_INV); 3800 set.applyPattern(pattern16, errorCode).freeze(); 3801 if(U_FAILURE(errorCode)) { 3802 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3803 return; 3804 } 3805 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd"); 3806 s16=string16.getBuffer(); 3807 length16=string16.length(); 3808 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 || 3809 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 || 3810 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0 3811 ) { 3812 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern); 3813 } 3814} 3815