1/* 2******************************************************************************** 3* Copyright (C) 1999-2013 International Business Machines Corporation and 4* others. All Rights Reserved. 5******************************************************************************** 6* Date Name Description 7* 10/20/99 alan Creation. 8* 03/22/2000 Madhu Added additional tests 9******************************************************************************** 10*/ 11 12#include <stdio.h> 13 14#include <string.h> 15#include "unicode/utypes.h" 16#include "usettest.h" 17#include "unicode/ucnv.h" 18#include "unicode/uniset.h" 19#include "unicode/uchar.h" 20#include "unicode/usetiter.h" 21#include "unicode/ustring.h" 22#include "unicode/parsepos.h" 23#include "unicode/symtable.h" 24#include "unicode/uversion.h" 25#include "hash.h" 26 27#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 28 29#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \ 30 dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \ 31 u_errorName(status));}} 32 33#define TEST_ASSERT(expr) {if (!(expr)) { \ 34 dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }} 35 36UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) { 37 UnicodeString pat; 38 set.toPattern(pat); 39 return left + UnicodeSetTest::escape(pat); 40} 41 42#define CASE(id,test) case id: \ 43 name = #test; \ 44 if (exec) { \ 45 logln(#test "---"); \ 46 logln(); \ 47 test(); \ 48 } \ 49 break 50 51UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) { 52} 53 54UConverter *UnicodeSetTest::openUTF8Converter() { 55 if(utf8Cnv==NULL) { 56 UErrorCode errorCode=U_ZERO_ERROR; 57 utf8Cnv=ucnv_open("UTF-8", &errorCode); 58 } 59 return utf8Cnv; 60} 61 62UnicodeSetTest::~UnicodeSetTest() { 63 ucnv_close(utf8Cnv); 64} 65 66void 67UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, 68 const char* &name, char* /*par*/) { 69 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest"); 70 
switch (index) { 71 CASE(0,TestPatterns); 72 CASE(1,TestAddRemove); 73 CASE(2,TestCategories); 74 CASE(3,TestCloneEqualHash); 75 CASE(4,TestMinimalRep); 76 CASE(5,TestAPI); 77 CASE(6,TestScriptSet); 78 CASE(7,TestPropertySet); 79 CASE(8,TestClone); 80 CASE(9,TestExhaustive); 81 CASE(10,TestToPattern); 82 CASE(11,TestIndexOf); 83 CASE(12,TestStrings); 84 CASE(13,Testj2268); 85 CASE(14,TestCloseOver); 86 CASE(15,TestEscapePattern); 87 CASE(16,TestInvalidCodePoint); 88 CASE(17,TestSymbolTable); 89 CASE(18,TestSurrogate); 90 CASE(19,TestPosixClasses); 91 CASE(20,TestIteration); 92 CASE(21,TestFreezable); 93 CASE(22,TestSpan); 94 CASE(23,TestStringSpan); 95 default: name = ""; break; 96 } 97} 98 99static const char NOT[] = "%%%%"; 100 101/** 102 * UVector was improperly copying contents 103 * This code will crash this is still true 104 */ 105void UnicodeSetTest::Testj2268() { 106 UnicodeSet t; 107 t.add(UnicodeString("abc")); 108 UnicodeSet test(t); 109 UnicodeString ustrPat; 110 test.toPattern(ustrPat, TRUE); 111} 112 113/** 114 * Test toPattern(). 115 */ 116void UnicodeSetTest::TestToPattern() { 117 UErrorCode ec = U_ZERO_ERROR; 118 119 // Test that toPattern() round trips with syntax characters and 120 // whitespace. 121 { 122 static const char* OTHER_TOPATTERN_TESTS[] = { 123 "[[:latin:]&[:greek:]]", 124 "[[:latin:]-[:greek:]]", 125 "[:nonspacing mark:]", 126 NULL 127 }; 128 129 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) { 130 ec = U_ZERO_ERROR; 131 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec); 132 if (U_FAILURE(ec)) { 133 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec))); 134 continue; 135 } 136 checkPat(OTHER_TOPATTERN_TESTS[j], s); 137 } 138 139 for (UChar32 i = 0; i <= 0x10FFFF; ++i) { 140 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) { 141 142 // check various combinations to make sure they all work. 
143 if (i != 0 && !toPatternAux(i, i)){ 144 continue; 145 } 146 if (!toPatternAux(0, i)){ 147 continue; 148 } 149 if (!toPatternAux(i, 0xFFFF)){ 150 continue; 151 } 152 } 153 } 154 } 155 156 // Test pattern behavior of multicharacter strings. 157 { 158 ec = U_ZERO_ERROR; 159 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec); 160 161 // This loop isn't a loop. It's here to make the compiler happy. 162 // If you're curious, try removing it and changing the 'break' 163 // statements (except for the last) to goto's. 164 for (;;) { 165 if (U_FAILURE(ec)) break; 166 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL}; 167 expectToPattern(*s, "[a-z{aa}{ab}]", exp1); 168 169 s->add("ac"); 170 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL}; 171 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2); 172 173 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec); 174 if (U_FAILURE(ec)) break; 175 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL}; 176 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3); 177 178 s->add("[]"); 179 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL}; 180 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4); 181 182 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec); 183 if (U_FAILURE(ec)) break; 184 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL}; 185 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5); 186 187 // j2189 188 s->clear(); 189 s->add(UnicodeString("abc", "")); 190 s->add(UnicodeString("abc", "")); 191 const char* exp6[] = {"abc", NOT, "ab", NULL}; 192 expectToPattern(*s, "[{abc}]", exp6); 193 194 break; 195 } 196 197 if (U_FAILURE(ec)) errln("FAIL: pattern parse error"); 198 delete s; 199 } 200 201 // JB#3400: For 2 character ranges prefer [ab] to [a-b] 202 UnicodeSet s; 203 s.add((UChar)97, (UChar)98); // 'a', 'b' 204 expectToPattern(s, "[ab]", NULL); 205} 206 207UBool 
UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) { 208 209 // use Integer.toString because Utility.hex doesn't handle ints 210 UnicodeString pat = ""; 211 // TODO do these in hex 212 //String source = "0x" + Integer.toString(start,16).toUpperCase(); 213 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase(); 214 UnicodeString source; 215 source = source + (uint32_t)start; 216 if (start != end) 217 source = source + ".." + (uint32_t)end; 218 UnicodeSet testSet; 219 testSet.add(start, end); 220 return checkPat(source, testSet); 221} 222 223UBool UnicodeSetTest::checkPat(const UnicodeString& source, 224 const UnicodeSet& testSet) { 225 // What we want to make sure of is that a pattern generated 226 // by toPattern(), with or without escaped unprintables, can 227 // be passed back into the UnicodeSet constructor. 228 UnicodeString pat0; 229 230 testSet.toPattern(pat0, TRUE); 231 232 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE; 233 234 //String pat1 = unescapeLeniently(pat0); 235 //if (!checkPat(source + " (in code)", testSet, pat1)) return false; 236 237 UnicodeString pat2; 238 testSet.toPattern(pat2, FALSE); 239 if (!checkPat(source, testSet, pat2)) return FALSE; 240 241 //String pat3 = unescapeLeniently(pat2); 242 // if (!checkPat(source + " (in code)", testSet, pat3)) return false; 243 244 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3); 245 logln((UnicodeString)source + " => " + pat0 + ", " + pat2); 246 return TRUE; 247} 248 249UBool UnicodeSetTest::checkPat(const UnicodeString& source, 250 const UnicodeSet& testSet, 251 const UnicodeString& pat) { 252 UErrorCode ec = U_ZERO_ERROR; 253 UnicodeSet testSet2(pat, ec); 254 if (testSet2 != testSet) { 255 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat); 256 return FALSE; 257 } 258 return TRUE; 259} 260 261void 262UnicodeSetTest::TestPatterns(void) { 263 UnicodeSet set; 264 expectPattern(set, 
UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km"); 265 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz"); 266 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz"); 267 expectPattern(set, UnicodeString("[-az]", ""), "--aazz"); 268 expectPattern(set, UnicodeString("[az-]", ""), "--aazz"); 269 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz"); 270 271 // Throw in a test of complement 272 set.complement(); 273 UnicodeString exp; 274 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF); 275 expectPairs(set, exp); 276} 277 278void 279UnicodeSetTest::TestCategories(void) { 280 UErrorCode status = U_ZERO_ERROR; 281 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:] 282 UnicodeSet set(pat, status); 283 if (U_FAILURE(status)) { 284 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status))); 285 return; 286 } else { 287 expectContainment(set, pat, "ABC", "abc"); 288 } 289 290 UChar32 i; 291 int32_t failures = 0; 292 // Make sure generation of L doesn't pollute cached Lu set 293 // First generate L, then Lu 294 set.applyPattern("[:L:]", status); 295 if (U_FAILURE(status)) { errln("FAIL"); return; } 296 for (i=0; i<0x200; ++i) { 297 UBool l = u_isalpha((UChar)i); 298 if (l != set.contains(i)) { 299 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " + 300 set.contains(i)); 301 if (++failures == 10) break; 302 } 303 } 304 305 set.applyPattern("[:Lu:]", status); 306 if (U_FAILURE(status)) { errln("FAIL"); return; } 307 for (i=0; i<0x200; ++i) { 308 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER); 309 if (lu != set.contains(i)) { 310 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " + 311 set.contains(i)); 312 if (++failures == 20) break; 313 } 314 } 315} 316void 317UnicodeSetTest::TestCloneEqualHash(void) { 318 UErrorCode status = U_ZERO_ERROR; 319 // set1 and set2 used to be built with the 
obsolete constructor taking 320 // UCharCategory values; replaced with pattern constructors 321 // markus 20030502 322 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase 323 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase 324 if (U_FAILURE(status)){ 325 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status))); 326 return; 327 } 328 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit 329 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit 330 if (U_FAILURE(status)){ 331 errln((UnicodeString)"FAIL: Can't construct set with category->Nd"); 332 return; 333 } 334 335 if (*set1 != *set1a) { 336 errln("FAIL: category constructor for Ll broken"); 337 } 338 if (*set2 != *set2a) { 339 errln("FAIL: category constructor for Nd broken"); 340 } 341 delete set1a; 342 delete set2a; 343 344 logln("Testing copy construction"); 345 UnicodeSet *set1copy=new UnicodeSet(*set1); 346 if(*set1 != *set1copy || *set1 == *set2 || 347 getPairs(*set1) != getPairs(*set1copy) || 348 set1->hashCode() != set1copy->hashCode()){ 349 errln("FAIL : Error in copy construction"); 350 return; 351 } 352 353 logln("Testing =operator"); 354 UnicodeSet set1equal=*set1; 355 UnicodeSet set2equal=*set2; 356 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 || 357 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){ 358 errln("FAIL: Error in =operator"); 359 } 360 361 logln("Testing clone()"); 362 UnicodeSet *set1clone=(UnicodeSet*)set1->clone(); 363 UnicodeSet *set2clone=(UnicodeSet*)set2->clone(); 364 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal || 365 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal || 366 *set2clone == *set1 || *set2clone == 
set1equal || *set2clone == *set1clone){ 367 errln("FAIL: Error in clone"); 368 } 369 370 logln("Testing hashcode"); 371 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() || 372 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() || 373 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() || 374 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() || 375 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){ 376 errln("FAIL: Error in hashCode()"); 377 } 378 379 delete set1; 380 delete set1copy; 381 delete set2; 382 delete set1clone; 383 delete set2clone; 384 385 386} 387void 388UnicodeSetTest::TestAddRemove(void) { 389 UnicodeSet set; // Construct empty set 390 doAssert(set.isEmpty() == TRUE, "set should be empty"); 391 doAssert(set.size() == 0, "size should be 0"); 392 set.complement(); 393 doAssert(set.size() == 0x110000, "size should be 0x110000"); 394 set.clear(); 395 set.add(0x0061, 0x007a); 396 expectPairs(set, "az"); 397 doAssert(set.isEmpty() == FALSE, "set should not be empty"); 398 doAssert(set.size() != 0, "size should not be equal to 0"); 399 doAssert(set.size() == 26, "size should be equal to 26"); 400 set.remove(0x006d, 0x0070); 401 expectPairs(set, "alqz"); 402 doAssert(set.size() == 22, "size should be equal to 22"); 403 set.remove(0x0065, 0x0067); 404 expectPairs(set, "adhlqz"); 405 doAssert(set.size() == 19, "size should be equal to 19"); 406 set.remove(0x0064, 0x0069); 407 expectPairs(set, "acjlqz"); 408 doAssert(set.size() == 16, "size should be equal to 16"); 409 set.remove(0x0063, 0x0072); 410 expectPairs(set, "absz"); 411 doAssert(set.size() == 10, "size should be equal to 10"); 412 set.add(0x0066, 0x0071); 413 expectPairs(set, "abfqsz"); 414 doAssert(set.size() == 22, "size should be equal to 22"); 415 set.remove(0x0061, 0x0067); 416 expectPairs(set, "hqsz"); 417 
set.remove(0x0061, 0x007a); 418 expectPairs(set, ""); 419 doAssert(set.isEmpty() == TRUE, "set should be empty"); 420 doAssert(set.size() == 0, "size should be 0"); 421 set.add(0x0061); 422 doAssert(set.isEmpty() == FALSE, "set should not be empty"); 423 doAssert(set.size() == 1, "size should not be equal to 1"); 424 set.add(0x0062); 425 set.add(0x0063); 426 expectPairs(set, "ac"); 427 doAssert(set.size() == 3, "size should not be equal to 3"); 428 set.add(0x0070); 429 set.add(0x0071); 430 expectPairs(set, "acpq"); 431 doAssert(set.size() == 5, "size should not be equal to 5"); 432 set.clear(); 433 expectPairs(set, ""); 434 doAssert(set.isEmpty() == TRUE, "set should be empty"); 435 doAssert(set.size() == 0, "size should be 0"); 436 437 // Try removing an entire set from another set 438 expectPattern(set, "[c-x]", "cx"); 439 UnicodeSet set2; 440 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz"); 441 set.removeAll(set2); 442 expectPairs(set, "deluxx"); 443 444 // Try adding an entire set to another set 445 expectPattern(set, "[jackiemclean]", "aacceein"); 446 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort"); 447 set.addAll(set2); 448 expectPairs(set, "aacehort"); 449 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2"); 450 451 // Try retaining an set of elements contained in another set (intersection) 452 UnicodeSet set3; 453 expectPattern(set3, "[a-c]", "ac"); 454 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3"); 455 set3.remove(0x0062); 456 expectPairs(set3, "aacc"); 457 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3"); 458 set.retainAll(set3); 459 expectPairs(set, "aacc"); 460 doAssert(set.size() == set3.size(), "set.size() should be set3.size()"); 461 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3"); 462 set.clear(); 463 doAssert(set.size() != set3.size(), "set.size() != set3.size()"); 464 465 
// Test commutativity 466 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort"); 467 expectPattern(set2, "[jackiemclean]", "aacceein"); 468 set.addAll(set2); 469 expectPairs(set, "aacehort"); 470 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2"); 471 472 473 474 475} 476 477/** 478 * Make sure minimal representation is maintained. 479 */ 480void UnicodeSetTest::TestMinimalRep() { 481 UErrorCode status = U_ZERO_ERROR; 482 // This is pretty thoroughly tested by checkCanonicalRep() 483 // run against the exhaustive operation results. Use the code 484 // here for debugging specific spot problems. 485 486 // 1 overlap against 2 487 UnicodeSet set("[h-km-q]", status); 488 if (U_FAILURE(status)) { errln("FAIL"); return; } 489 UnicodeSet set2("[i-o]", status); 490 if (U_FAILURE(status)) { errln("FAIL"); return; } 491 set.addAll(set2); 492 expectPairs(set, "hq"); 493 // right 494 set.applyPattern("[a-m]", status); 495 if (U_FAILURE(status)) { errln("FAIL"); return; } 496 set2.applyPattern("[e-o]", status); 497 if (U_FAILURE(status)) { errln("FAIL"); return; } 498 set.addAll(set2); 499 expectPairs(set, "ao"); 500 // left 501 set.applyPattern("[e-o]", status); 502 if (U_FAILURE(status)) { errln("FAIL"); return; } 503 set2.applyPattern("[a-m]", status); 504 if (U_FAILURE(status)) { errln("FAIL"); return; } 505 set.addAll(set2); 506 expectPairs(set, "ao"); 507 // 1 overlap against 3 508 set.applyPattern("[a-eg-mo-w]", status); 509 if (U_FAILURE(status)) { errln("FAIL"); return; } 510 set2.applyPattern("[d-q]", status); 511 if (U_FAILURE(status)) { errln("FAIL"); return; } 512 set.addAll(set2); 513 expectPairs(set, "aw"); 514} 515 516void UnicodeSetTest::TestAPI() { 517 UErrorCode status = U_ZERO_ERROR; 518 // default ct 519 UnicodeSet set; 520 if (!set.isEmpty() || set.getRangeCount() != 0) { 521 errln((UnicodeString)"FAIL, set should be empty but isn't: " + 522 set); 523 } 524 525 // clear(), isEmpty() 526 set.add(0x0061); 527 
if (set.isEmpty()) { 528 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " + 529 set); 530 } 531 set.clear(); 532 if (!set.isEmpty()) { 533 errln((UnicodeString)"FAIL, set should be empty but isn't: " + 534 set); 535 } 536 537 // size() 538 set.clear(); 539 if (set.size() != 0) { 540 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() + 541 ": " + set); 542 } 543 set.add(0x0061); 544 if (set.size() != 1) { 545 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() + 546 ": " + set); 547 } 548 set.add(0x0031, 0x0039); 549 if (set.size() != 10) { 550 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() + 551 ": " + set); 552 } 553 554 // contains(first, last) 555 set.clear(); 556 set.applyPattern("[A-Y 1-8 b-d l-y]", status); 557 if (U_FAILURE(status)) { errln("FAIL"); return; } 558 for (int32_t i = 0; i<set.getRangeCount(); ++i) { 559 UChar32 a = set.getRangeStart(i); 560 UChar32 b = set.getRangeEnd(i); 561 if (!set.contains(a, b)) { 562 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b + 563 " but doesn't: " + set); 564 } 565 if (set.contains((UChar32)(a-1), b)) { 566 errln((UnicodeString)"FAIL, shouldn't contain " + 567 (unsigned short)(a-1) + '-' + (unsigned short)b + 568 " but does: " + set); 569 } 570 if (set.contains(a, (UChar32)(b+1))) { 571 errln((UnicodeString)"FAIL, shouldn't contain " + 572 (unsigned short)a + '-' + (unsigned short)(b+1) + 573 " but does: " + set); 574 } 575 } 576 577 // Ported InversionList test. 
578 UnicodeSet a((UChar32)3,(UChar32)10); 579 UnicodeSet b((UChar32)7,(UChar32)15); 580 UnicodeSet c; 581 582 logln((UnicodeString)"a [3-10]: " + a); 583 logln((UnicodeString)"b [7-15]: " + b); 584 c = a; 585 c.addAll(b); 586 UnicodeSet exp((UChar32)3,(UChar32)15); 587 if (c == exp) { 588 logln((UnicodeString)"c.set(a).add(b): " + c); 589 } else { 590 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp); 591 } 592 c.complement(); 593 exp.set((UChar32)0, (UChar32)2); 594 exp.add((UChar32)16, UnicodeSet::MAX_VALUE); 595 if (c == exp) { 596 logln((UnicodeString)"c.complement(): " + c); 597 } else { 598 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp); 599 } 600 c.complement(); 601 exp.set((UChar32)3, (UChar32)15); 602 if (c == exp) { 603 logln((UnicodeString)"c.complement(): " + c); 604 } else { 605 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp); 606 } 607 c = a; 608 c.complementAll(b); 609 exp.set((UChar32)3,(UChar32)6); 610 exp.add((UChar32)11,(UChar32) 15); 611 if (c == exp) { 612 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c); 613 } else { 614 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp); 615 } 616 617 exp = c; 618 bitsToSet(setToBits(c), c); 619 if (c == exp) { 620 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c); 621 } else { 622 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp); 623 } 624 625 // Additional tests for coverage JB#2118 626 //UnicodeSet::complement(class UnicodeString const &) 627 //UnicodeSet::complementAll(class UnicodeString const &) 628 //UnicodeSet::containsNone(class UnicodeSet const &) 629 //UnicodeSet::containsNone(long,long) 630 //UnicodeSet::containsSome(class UnicodeSet const &) 631 //UnicodeSet::containsSome(long,long) 632 //UnicodeSet::removeAll(class UnicodeString const &) 633 //UnicodeSet::retain(long) 634 //UnicodeSet::retainAll(class UnicodeString const &) 635 
//UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &) 636 //UnicodeSetIterator::getString(void) 637 set.clear(); 638 set.complement("ab"); 639 exp.applyPattern("[{ab}]", status); 640 if (U_FAILURE(status)) { errln("FAIL"); return; } 641 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; } 642 643 UnicodeSetIterator iset(set); 644 if (!iset.next() || !iset.isString()) { 645 errln("FAIL: UnicodeSetIterator::next/isString"); 646 } else if (iset.getString() != "ab") { 647 errln("FAIL: UnicodeSetIterator::getString"); 648 } 649 650 set.add((UChar32)0x61, (UChar32)0x7A); 651 set.complementAll("alan"); 652 exp.applyPattern("[{ab}b-kmo-z]", status); 653 if (U_FAILURE(status)) { errln("FAIL"); return; } 654 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; } 655 656 exp.applyPattern("[a-z]", status); 657 if (U_FAILURE(status)) { errln("FAIL"); return; } 658 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 659 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 660 exp.applyPattern("[aln]", status); 661 if (U_FAILURE(status)) { errln("FAIL"); return; } 662 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 663 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 664 665 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) { 666 errln("FAIL: containsNone(UChar32, UChar32)"); 667 } 668 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) { 669 errln("FAIL: containsSome(UChar32, UChar32)"); 670 } 671 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) { 672 errln("FAIL: containsNone(UChar32, UChar32)"); 673 } 674 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) { 675 errln("FAIL: containsSome(UChar32, UChar32)"); 676 } 677 678 set.removeAll("liu"); 679 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status); 680 if (U_FAILURE(status)) { errln("FAIL"); return; } 681 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; } 682 683 set.retainAll("star"); 684 
exp.applyPattern("[rst]", status); 685 if (U_FAILURE(status)) { errln("FAIL"); return; } 686 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; } 687 688 set.retain((UChar32)0x73); 689 exp.applyPattern("[s]", status); 690 if (U_FAILURE(status)) { errln("FAIL"); return; } 691 if (set != exp) { errln("FAIL: retain('s')"); return; } 692 693 uint16_t buf[32]; 694 int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status); 695 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; } 696 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) { 697 errln("FAIL: serialize"); 698 return; 699 } 700 701 // Conversions to and from USet 702 UnicodeSet *uniset = &set; 703 USet *uset = uniset->toUSet(); 704 TEST_ASSERT((void *)uset == (void *)uniset); 705 UnicodeSet *setx = UnicodeSet::fromUSet(uset); 706 TEST_ASSERT((void *)setx == (void *)uset); 707 const UnicodeSet *constSet = uniset; 708 const USet *constUSet = constSet->toUSet(); 709 TEST_ASSERT((void *)constUSet == (void *)constSet); 710 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet); 711 TEST_ASSERT((void *)constSetx == (void *)constUSet); 712 713 // span(UnicodeString) and spanBack(UnicodeString) convenience methods 714 UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc"); 715 UnicodeSet ac(0x61, 0x63); 716 ac.remove(0x62).freeze(); 717 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 || 718 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 || 719 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 || 720 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 || 721 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 || 722 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 || 723 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 || 724 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 || 725 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 || 726 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30 727 ) { 728 errln("UnicodeSet.span(UnicodeString, ...) 
returns incorrect end indexes"); 729 } 730 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 || 731 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 || 732 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 || 733 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 || 734 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 || 735 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 || 736 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 || 737 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 || 738 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 || 739 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20 740 ) { 741 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes"); 742 } 743} 744 745void UnicodeSetTest::TestIteration() { 746 UErrorCode ec = U_ZERO_ERROR; 747 int i = 0; 748 int outerLoop; 749 750 // 6 code points, 3 ranges, 2 strings, 8 total elements 751 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2" 752 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec); 753 TEST_ASSERT_SUCCESS(ec); 754 UnicodeSetIterator it(set); 755 756 for (outerLoop=0; outerLoop<3; outerLoop++) { 757 // Run the test multiple times, to check that iterator.reset() is working. 
758 for (i=0; i<10; i++) { 759 UBool nextv = it.next(); 760 UBool isString = it.isString(); 761 int32_t codePoint = it.getCodepoint(); 762 //int32_t codePointEnd = it.getCodepointEnd(); 763 UnicodeString s = it.getString(); 764 switch (i) { 765 case 0: 766 TEST_ASSERT(nextv == TRUE); 767 TEST_ASSERT(isString == FALSE); 768 TEST_ASSERT(codePoint==0x61); 769 TEST_ASSERT(s == "a"); 770 break; 771 case 1: 772 TEST_ASSERT(nextv == TRUE); 773 TEST_ASSERT(isString == FALSE); 774 TEST_ASSERT(codePoint==0x62); 775 TEST_ASSERT(s == "b"); 776 break; 777 case 2: 778 TEST_ASSERT(nextv == TRUE); 779 TEST_ASSERT(isString == FALSE); 780 TEST_ASSERT(codePoint==0x63); 781 TEST_ASSERT(s == "c"); 782 break; 783 case 3: 784 TEST_ASSERT(nextv == TRUE); 785 TEST_ASSERT(isString == FALSE); 786 TEST_ASSERT(codePoint==0x79); 787 TEST_ASSERT(s == "y"); 788 break; 789 case 4: 790 TEST_ASSERT(nextv == TRUE); 791 TEST_ASSERT(isString == FALSE); 792 TEST_ASSERT(codePoint==0x7a); 793 TEST_ASSERT(s == "z"); 794 break; 795 case 5: 796 TEST_ASSERT(nextv == TRUE); 797 TEST_ASSERT(isString == FALSE); 798 TEST_ASSERT(codePoint==0x1abcd); 799 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd)); 800 break; 801 case 6: 802 TEST_ASSERT(nextv == TRUE); 803 TEST_ASSERT(isString == TRUE); 804 TEST_ASSERT(s == "str1"); 805 break; 806 case 7: 807 TEST_ASSERT(nextv == TRUE); 808 TEST_ASSERT(isString == TRUE); 809 TEST_ASSERT(s == "str2"); 810 break; 811 case 8: 812 TEST_ASSERT(nextv == FALSE); 813 break; 814 case 9: 815 TEST_ASSERT(nextv == FALSE); 816 break; 817 } 818 } 819 it.reset(); // prepare to run the iteration again. 
820 } 821} 822 823 824 825 826void UnicodeSetTest::TestStrings() { 827 UErrorCode ec = U_ZERO_ERROR; 828 829 UnicodeSet* testList[] = { 830 UnicodeSet::createFromAll("abc"), 831 new UnicodeSet("[a-c]", ec), 832 833 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")), 834 new UnicodeSet("[{ll}{ch}a-z]", ec), 835 836 UnicodeSet::createFrom("ab}c"), 837 new UnicodeSet("[{ab\\}c}]", ec), 838 839 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')), 840 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec), 841 842 NULL 843 }; 844 845 if (U_FAILURE(ec)) { 846 errln("FAIL: couldn't construct test sets"); 847 } 848 849 for (int32_t i = 0; testList[i] != NULL; i+=2) { 850 if (U_SUCCESS(ec)) { 851 UnicodeString pat0, pat1; 852 testList[i]->toPattern(pat0, TRUE); 853 testList[i+1]->toPattern(pat1, TRUE); 854 if (*testList[i] == *testList[i+1]) { 855 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1); 856 } else { 857 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1); 858 } 859 } 860 delete testList[i]; 861 delete testList[i+1]; 862 } 863} 864 865/** 866 * Test the [:Latin:] syntax. 867 */ 868void UnicodeSetTest::TestScriptSet() { 869 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1")); 870 871 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA"); 872 873 /* Jitterbug 1423 */ 874 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA"); 875 876} 877 878/** 879 * Test the [:Latin:] syntax. 
880 */ 881void UnicodeSetTest::TestPropertySet() { 882 static const char* const DATA[] = { 883 // Pattern, Chars IN, Chars NOT in 884 885 "[:Latin:]", 886 "aA", 887 "\\u0391\\u03B1", 888 889 "[\\p{Greek}]", 890 "\\u0391\\u03B1", 891 "aA", 892 893 "\\P{ GENERAL Category = upper case letter }", 894 "abc", 895 "ABC", 896 897#if !UCONFIG_NO_NORMALIZATION 898 // Combining class: @since ICU 2.2 899 // Check both symbolic and numeric 900 "\\p{ccc=Nukta}", 901 "\\u0ABC", 902 "abc", 903 904 "\\p{Canonical Combining Class = 11}", 905 "\\u05B1", 906 "\\u05B2", 907 908 "[:c c c = iota subscript :]", 909 "\\u0345", 910 "xyz", 911#endif 912 913 // Bidi class: @since ICU 2.2 914 "\\p{bidiclass=lefttoright}", 915 "abc", 916 "\\u0671\\u0672", 917 918 // Binary properties: @since ICU 2.2 919 "\\p{ideographic}", 920 "\\u4E0A", 921 "x", 922 923 "[:math=false:]", 924 "q)*(", 925 // weiv: )(and * were removed from math in Unicode 4.0.1 926 //"(*+)", 927 "+<>^", 928 929 // JB#1767 \N{}, \p{ASCII} 930 "[:Ascii:]", 931 "abc\\u0000\\u007F", 932 "\\u0080\\u4E00", 933 934 "[\\N{ latin small letter a }[:name= latin small letter z:]]", 935 "az", 936 "qrs", 937 938 // JB#2015 939 "[:any:]", 940 "a\\U0010FFFF", 941 "", 942 943 "[:nv=0.5:]", 944 "\\u00BD\\u0F2A", 945 "\\u00BC", 946 947 // JB#2653: Age 948 "[:Age=1.1:]", 949 "\\u03D6", // 1.1 950 "\\u03D8\\u03D9", // 3.2 951 952 "[:Age=3.1:]", 953 "\\u1800\\u3400\\U0002f800", 954 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000", 955 956 // JB#2350: Case_Sensitive 957 "[:Case Sensitive:]", 958 "A\\u1FFC\\U00010410", 959 ";\\u00B4\\U00010500", 960 961 // JB#2832: C99-compatibility props 962 "[:blank:]", 963 " \\u0009", 964 "1-9A-Z", 965 966 "[:graph:]", 967 "19AZ", 968 " \\u0003\\u0007\\u0009\\u000A\\u000D", 969 970 "[:punct:]", 971 "!@#%&*()[]{}-_\\/;:,.?'\"", 972 "09azAZ", 973 974 "[:xdigit:]", 975 "09afAF", 976 "gG!", 977 978 // Regex compatibility test 979 "[-b]", // leading '-' is literal 980 "-b", 981 "ac", 982 983 "[^-b]", // 
leading '-' is literal 984 "ac", 985 "-b", 986 987 "[b-]", // trailing '-' is literal 988 "-b", 989 "ac", 990 991 "[^b-]", // trailing '-' is literal 992 "ac", 993 "-b", 994 995 "[a-b-]", // trailing '-' is literal 996 "ab-", 997 "c=", 998 999 "[[a-q]&[p-z]-]", // trailing '-' is literal 1000 "pq-", 1001 "or=", 1002 1003 "[\\s|\\)|:|$|\\>]", // from regex tests 1004 "s|):$>", 1005 "abc", 1006 1007 "[\\uDC00cd]", // JB#2906: isolated trail at start 1008 "cd\\uDC00", 1009 "ab\\uD800\\U00010000", 1010 1011 "[ab\\uD800]", // JB#2906: isolated trail at start 1012 "ab\\uD800", 1013 "cd\\uDC00\\U00010000", 1014 1015 "[ab\\uD800cd]", // JB#2906: isolated lead in middle 1016 "abcd\\uD800", 1017 "ef\\uDC00\\U00010000", 1018 1019 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle 1020 "abcd\\uDC00", 1021 "ef\\uD800\\U00010000", 1022 1023#if !UCONFIG_NO_NORMALIZATION 1024 "[:^lccc=0:]", // Lead canonical class 1025 "\\u0300\\u0301", 1026 "abcd\\u00c0\\u00c5", 1027 1028 "[:^tccc=0:]", // Trail canonical class 1029 "\\u0300\\u0301\\u00c0\\u00c5", 1030 "abcd", 1031 1032 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class 1033 "\\u0300\\u0301\\u00c0\\u00c5", 1034 "abcd", 1035 1036 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now) 1037 "", 1038 "abcd\\u0300\\u0301\\u00c0\\u00c5", 1039 1040 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. 
Complete canonical class is zero, but both lead and trail are not 1041 "\\u0F73\\u0F75\\u0F81", 1042 "abcd\\u0300\\u0301\\u00c0\\u00c5", 1043#endif /* !UCONFIG_NO_NORMALIZATION */ 1044 1045 "[:Assigned:]", 1046 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD", 1047 "\\u0888\\uFDD3\\uFFFE\\U00050005", 1048 1049 // Script_Extensions, new in Unicode 6.0 1050 "[:scx=Arab:]", 1051 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3", 1052 "\\u061D\\uFDEF\\uFDFE", 1053 1054 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions, 1055 // so scx-sc is missing U+FDF2. 1056 "[[:Script_Extensions=Arabic:]-[:Arab:]]", 1057 "\\u0640\\u064B\\u0650\\u0655\\uFDFD", 1058 "\\uFDF2" 1059 }; 1060 1061 static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]); 1062 1063 for (int32_t i=0; i<DATA_LEN; i+=3) { 1064 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]), 1065 CharsToUnicodeString(DATA[i+2])); 1066 } 1067} 1068 1069/** 1070 * Test that Posix style character classes [:digit:], etc. 1071 * have the Unicode definitions from TR 18. 
1072 */ 1073void UnicodeSetTest::TestPosixClasses() { 1074 { 1075 UErrorCode status = U_ZERO_ERROR; 1076 UnicodeSet s1("[:alpha:]", status); 1077 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status); 1078 TEST_ASSERT_SUCCESS(status); 1079 TEST_ASSERT(s1==s2); 1080 } 1081 { 1082 UErrorCode status = U_ZERO_ERROR; 1083 UnicodeSet s1("[:lower:]", status); 1084 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status); 1085 TEST_ASSERT_SUCCESS(status); 1086 TEST_ASSERT(s1==s2); 1087 } 1088 { 1089 UErrorCode status = U_ZERO_ERROR; 1090 UnicodeSet s1("[:upper:]", status); 1091 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status); 1092 TEST_ASSERT_SUCCESS(status); 1093 TEST_ASSERT(s1==s2); 1094 } 1095 { 1096 UErrorCode status = U_ZERO_ERROR; 1097 UnicodeSet s1("[:punct:]", status); 1098 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status); 1099 TEST_ASSERT_SUCCESS(status); 1100 TEST_ASSERT(s1==s2); 1101 } 1102 { 1103 UErrorCode status = U_ZERO_ERROR; 1104 UnicodeSet s1("[:digit:]", status); 1105 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status); 1106 TEST_ASSERT_SUCCESS(status); 1107 TEST_ASSERT(s1==s2); 1108 } 1109 { 1110 UErrorCode status = U_ZERO_ERROR; 1111 UnicodeSet s1("[:xdigit:]", status); 1112 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status); 1113 TEST_ASSERT_SUCCESS(status); 1114 TEST_ASSERT(s1==s2); 1115 } 1116 { 1117 UErrorCode status = U_ZERO_ERROR; 1118 UnicodeSet s1("[:alnum:]", status); 1119 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status); 1120 TEST_ASSERT_SUCCESS(status); 1121 TEST_ASSERT(s1==s2); 1122 } 1123 { 1124 UErrorCode status = U_ZERO_ERROR; 1125 UnicodeSet s1("[:space:]", status); 1126 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status); 1127 TEST_ASSERT_SUCCESS(status); 1128 TEST_ASSERT(s1==s2); 1129 } 1130 { 1131 UErrorCode status = U_ZERO_ERROR; 1132 UnicodeSet s1("[:blank:]", status); 1133 
TEST_ASSERT_SUCCESS(status); 1134 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"), 1135 status); 1136 TEST_ASSERT_SUCCESS(status); 1137 TEST_ASSERT(s1==s2); 1138 } 1139 { 1140 UErrorCode status = U_ZERO_ERROR; 1141 UnicodeSet s1("[:cntrl:]", status); 1142 TEST_ASSERT_SUCCESS(status); 1143 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status); 1144 TEST_ASSERT_SUCCESS(status); 1145 TEST_ASSERT(s1==s2); 1146 } 1147 { 1148 UErrorCode status = U_ZERO_ERROR; 1149 UnicodeSet s1("[:graph:]", status); 1150 TEST_ASSERT_SUCCESS(status); 1151 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status); 1152 TEST_ASSERT_SUCCESS(status); 1153 TEST_ASSERT(s1==s2); 1154 } 1155 { 1156 UErrorCode status = U_ZERO_ERROR; 1157 UnicodeSet s1("[:print:]", status); 1158 TEST_ASSERT_SUCCESS(status); 1159 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status); 1160 TEST_ASSERT_SUCCESS(status); 1161 TEST_ASSERT(s1==s2); 1162 } 1163} 1164/** 1165 * Test cloning of UnicodeSet. For C++, we test the copy constructor. 1166 */ 1167void UnicodeSetTest::TestClone() { 1168 UErrorCode ec = U_ZERO_ERROR; 1169 UnicodeSet s("[abcxyz]", ec); 1170 UnicodeSet t(s); 1171 expectContainment(t, "abc", "def"); 1172} 1173 1174/** 1175 * Test the indexOf() and charAt() methods. 
1176 */ 1177void UnicodeSetTest::TestIndexOf() { 1178 UErrorCode ec = U_ZERO_ERROR; 1179 UnicodeSet set("[a-cx-y3578]", ec); 1180 if (U_FAILURE(ec)) { 1181 errln("FAIL: UnicodeSet constructor"); 1182 return; 1183 } 1184 for (int32_t i=0; i<set.size(); ++i) { 1185 UChar32 c = set.charAt(i); 1186 if (set.indexOf(c) != i) { 1187 errln("FAIL: charAt(%d) = %X => indexOf() => %d", 1188 i, c, set.indexOf(c)); 1189 } 1190 } 1191 UChar32 c = set.charAt(set.size()); 1192 if (c != -1) { 1193 errln("FAIL: charAt(<out of range>) = %X", c); 1194 } 1195 int32_t j = set.indexOf((UChar32)0x71/*'q'*/); 1196 if (j != -1) { 1197 errln((UnicodeString)"FAIL: indexOf('q') = " + j); 1198 } 1199} 1200 1201/** 1202 * Test closure API. 1203 */ 1204void UnicodeSetTest::TestCloseOver() { 1205 UErrorCode ec = U_ZERO_ERROR; 1206 1207 char CASE[] = {(char)USET_CASE_INSENSITIVE}; 1208 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS}; 1209 const char* DATA[] = { 1210 // selector, input, output 1211 CASE, 1212 "[aq\\u00DF{Bc}{bC}{Fi}]", 1213 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1 1214 1215 CASE, 1216 "[\\u01F1]", // 'DZ' 1217 "[\\u01F1\\u01F2\\u01F3]", 1218 1219 CASE, 1220 "[\\u1FB4]", 1221 "[\\u1FB4{\\u03AC\\u03B9}]", 1222 1223 CASE, 1224 "[{F\\uFB01}]", 1225 "[\\uFB03{ffi}]", 1226 1227 CASE, // make sure binary search finds limits 1228 "[a\\uFF3A]", 1229 "[aA\\uFF3A\\uFF5A]", 1230 1231 CASE, 1232 "[a-z]","[A-Za-z\\u017F\\u212A]", 1233 CASE, 1234 "[abc]","[A-Ca-c]", 1235 CASE, 1236 "[ABC]","[A-Ca-c]", 1237 1238 CASE, "[i]", "[iI]", 1239 1240 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I 1241 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot 1242 1243 CASE, "[\\u0131]", "[\\u0131]", // dotless i 1244 1245 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]", 1246 1247 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas 1248 1249 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas 1250 1251 CASE, 
"[\\u03f7]", "[\\u03f7\\u03f8]", 1252 1253 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]", 1254 1255 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]", 1256 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]", 1257 1258 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]", 1259 1260 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table 1261 1262 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table 1263 1264#if !UCONFIG_NO_FILE_IO 1265 CASE_MAPPINGS, 1266 "[aq\\u00DF{Bc}{bC}{Fi}]", 1267 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]", 1268#endif 1269 1270 CASE_MAPPINGS, 1271 "[\\u01F1]", // 'DZ' 1272 "[\\u01F1\\u01F2\\u01F3]", 1273 1274 CASE_MAPPINGS, 1275 "[a-z]", 1276 "[A-Za-z]", 1277 1278 NULL 1279 }; 1280 1281 UnicodeSet s; 1282 UnicodeSet t; 1283 UnicodeString buf; 1284 for (int32_t i=0; DATA[i]!=NULL; i+=3) { 1285 int32_t selector = DATA[i][0]; 1286 UnicodeString pat(DATA[i+1], -1, US_INV); 1287 UnicodeString exp(DATA[i+2], -1, US_INV); 1288 s.applyPattern(pat, ec); 1289 s.closeOver(selector); 1290 t.applyPattern(exp, ec); 1291 if (U_FAILURE(ec)) { 1292 errln("FAIL: applyPattern failed"); 1293 continue; 1294 } 1295 if (s == t) { 1296 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp); 1297 } else { 1298 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " + 1299 s.toPattern(buf, TRUE) + ", expected " + exp); 1300 } 1301 } 1302 1303#if 0 1304 /* 1305 * Unused test code. 1306 * This was used to compare the old implementation (using USET_CASE) 1307 * with the new one (using 0x100 temporarily) 1308 * while transitioning from hardcoded case closure tables in uniset.cpp 1309 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu. 1310 * and using ucase.c functions for closure. 
1311 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file 1312 * 1313 * Note: The old and new implementation never fully matched because 1314 * the old implementation turned out to not map U+0130 and U+0131 correctly 1315 * (dotted I and dotless i) and because the old implementation's data tables 1316 * were outdated compared to Unicode 4.0.1 at the time of the change to the 1317 * new implementation. (So sigmas and some other characters were not handled 1318 * according to the newer Unicode version.) 1319 */ 1320 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2; 1321 UnicodeSetIterator si(sens); 1322 UnicodeString str, buf2; 1323 const UnicodeString *pStr; 1324 UChar32 c; 1325 while(si.next()) { 1326 if(!si.isString()) { 1327 c=si.getCodepoint(); 1328 s.clear(); 1329 s.add(c); 1330 1331 str.setTo(c); 1332 str.foldCase(); 1333 sens2.add(str); 1334 1335 t=s; 1336 s.closeOver(USET_CASE); 1337 t.closeOver(0x100); 1338 if(s!=t) { 1339 errln("FAIL: closeOver(U+%04x) differs: ", c); 1340 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE)); 1341 } 1342 } 1343 } 1344 // remove all code points 1345 // should contain all full case folding mapping strings 1346 sens2.remove(0, 0x10ffff); 1347 si.reset(sens2); 1348 while(si.next()) { 1349 if(si.isString()) { 1350 pStr=&si.getString(); 1351 s.clear(); 1352 s.add(*pStr); 1353 t=s2=s; 1354 s.closeOver(USET_CASE); 1355 t.closeOver(0x100); 1356 if(s!=t) { 1357 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: "); 1358 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE)); 1359 } 1360 } 1361 } 1362#endif 1363 1364 // Test the pattern API 1365 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec); 1366 if (U_FAILURE(ec)) { 1367 errln("FAIL: applyPattern failed"); 1368 } else { 1369 expectContainment(s, "abcABC", "defDEF"); 1370 } 1371 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec); 1372 if (U_FAILURE(ec)) { 1373 
errln("FAIL: constructor failed"); 1374 } else { 1375 expectContainment(v, "defDEF", "abcABC"); 1376 } 1377 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec); 1378 if (U_FAILURE(ec)) { 1379 errln("FAIL: construct w/case mappings failed"); 1380 } else { 1381 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A")); 1382 } 1383} 1384 1385void UnicodeSetTest::TestEscapePattern() { 1386 const char pattern[] = 1387 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]"; 1388 const char exp[] = 1389 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]"; 1390 // We test this with two passes; in the second pass we 1391 // pre-unescape the pattern. Since U+200E is Pattern_White_Space, 1392 // this fails -- which is what we expect. 1393 for (int32_t pass=1; pass<=2; ++pass) { 1394 UErrorCode ec = U_ZERO_ERROR; 1395 UnicodeString pat(pattern, -1, US_INV); 1396 if (pass==2) { 1397 pat = pat.unescape(); 1398 } 1399 // Pattern is only good for pass 1 1400 UBool isPatternValid = (pass==1); 1401 1402 UnicodeSet set(pat, ec); 1403 if (U_SUCCESS(ec) != isPatternValid){ 1404 errln((UnicodeString)"FAIL: applyPattern(" + 1405 escape(pat) + ") => " + 1406 u_errorName(ec)); 1407 continue; 1408 } 1409 if (U_FAILURE(ec)) { 1410 continue; 1411 } 1412 if (set.contains((UChar)0x0644)){ 1413 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)"); 1414 } 1415 1416 UnicodeString newpat; 1417 set.toPattern(newpat, TRUE); 1418 if (newpat == UnicodeString(exp, -1, US_INV)) { 1419 logln(escape(pat) + " => " + newpat); 1420 } else { 1421 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat); 1422 } 1423 1424 for (int32_t i=0; i<set.getRangeCount(); ++i) { 1425 UnicodeString str("Range "); 1426 str.append((UChar)(0x30 + i)) 1427 .append(": ") 1428 .append((UChar32)set.getRangeStart(i)) 1429 .append(" - ") 1430 .append((UChar32)set.getRangeEnd(i)); 1431 str = str + " (" + set.getRangeStart(i) + " - " + 1432 
set.getRangeEnd(i) + ")"; 1433 if (set.getRangeStart(i) < 0) { 1434 errln((UnicodeString)"FAIL: " + escape(str)); 1435 } else { 1436 logln(escape(str)); 1437 } 1438 } 1439 } 1440} 1441 1442void UnicodeSetTest::expectRange(const UnicodeString& label, 1443 const UnicodeSet& set, 1444 UChar32 start, UChar32 end) { 1445 UnicodeSet exp(start, end); 1446 UnicodeString pat; 1447 if (set == exp) { 1448 logln(label + " => " + set.toPattern(pat, TRUE)); 1449 } else { 1450 UnicodeString xpat; 1451 errln((UnicodeString)"FAIL: " + label + " => " + 1452 set.toPattern(pat, TRUE) + 1453 ", expected " + exp.toPattern(xpat, TRUE)); 1454 } 1455} 1456 1457void UnicodeSetTest::TestInvalidCodePoint() { 1458 1459 const UChar32 DATA[] = { 1460 // Test range Expected range 1461 0, 0x10FFFF, 0, 0x10FFFF, 1462 (UChar32)-1, 8, 0, 8, 1463 8, 0x110000, 8, 0x10FFFF 1464 }; 1465 const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]); 1466 1467 UnicodeString pat; 1468 int32_t i; 1469 1470 for (i=0; i<DATA_LENGTH; i+=4) { 1471 UChar32 start = DATA[i]; 1472 UChar32 end = DATA[i+1]; 1473 UChar32 xstart = DATA[i+2]; 1474 UChar32 xend = DATA[i+3]; 1475 1476 // Try various API using the test code points 1477 1478 UnicodeSet set(start, end); 1479 expectRange((UnicodeString)"ct(" + start + "," + end + ")", 1480 set, xstart, xend); 1481 1482 set.clear(); 1483 set.set(start, end); 1484 expectRange((UnicodeString)"set(" + start + "," + end + ")", 1485 set, xstart, xend); 1486 1487 UBool b = set.contains(start); 1488 b = set.contains(start, end); 1489 b = set.containsNone(start, end); 1490 b = set.containsSome(start, end); 1491 (void)b; // Suppress set but not used warning. 
1492 1493 /*int32_t index = set.indexOf(start);*/ 1494 1495 set.clear(); 1496 set.add(start); 1497 set.add(start, end); 1498 expectRange((UnicodeString)"add(" + start + "," + end + ")", 1499 set, xstart, xend); 1500 1501 set.set(0, 0x10FFFF); 1502 set.retain(start, end); 1503 expectRange((UnicodeString)"retain(" + start + "," + end + ")", 1504 set, xstart, xend); 1505 set.retain(start); 1506 1507 set.set(0, 0x10FFFF); 1508 set.remove(start); 1509 set.remove(start, end); 1510 set.complement(); 1511 expectRange((UnicodeString)"!remove(" + start + "," + end + ")", 1512 set, xstart, xend); 1513 1514 set.set(0, 0x10FFFF); 1515 set.complement(start, end); 1516 set.complement(); 1517 expectRange((UnicodeString)"!complement(" + start + "," + end + ")", 1518 set, xstart, xend); 1519 set.complement(start); 1520 } 1521 1522 const UChar32 DATA2[] = { 1523 0, 1524 0x10FFFF, 1525 (UChar32)-1, 1526 0x110000 1527 }; 1528 const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]); 1529 1530 for (i=0; i<DATA2_LENGTH; ++i) { 1531 UChar32 c = DATA2[i], end = 0x10FFFF; 1532 UBool valid = (c >= 0 && c <= 0x10FFFF); 1533 1534 UnicodeSet set(0, 0x10FFFF); 1535 1536 // For single-codepoint contains, invalid codepoints are NOT contained 1537 UBool b = set.contains(c); 1538 if (b == valid) { 1539 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c + 1540 ") = " + b); 1541 } else { 1542 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c + 1543 ") = " + b); 1544 } 1545 1546 // For codepoint range contains, containsNone, and containsSome, 1547 // invalid or empty (start > end) ranges have UNDEFINED behavior. 
1548 b = set.contains(c, end); 1549 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c + 1550 "," + end + ") = " + b); 1551 1552 b = set.containsNone(c, end); 1553 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c + 1554 "," + end + ") = " + b); 1555 1556 b = set.containsSome(c, end); 1557 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c + 1558 "," + end + ") = " + b); 1559 1560 int32_t index = set.indexOf(c); 1561 if ((index >= 0) == valid) { 1562 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c + 1563 ") = " + index); 1564 } else { 1565 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c + 1566 ") = " + index); 1567 } 1568 } 1569} 1570 1571// Used by TestSymbolTable 1572class TokenSymbolTable : public SymbolTable { 1573public: 1574 Hashtable contents; 1575 1576 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) { 1577 contents.setValueDeleter(uprv_deleteUObject); 1578 } 1579 1580 ~TokenSymbolTable() {} 1581 1582 /** 1583 * (Non-SymbolTable API) Add the given variable and value to 1584 * the table. Variable should NOT contain leading '$'. 
1585 */ 1586 void add(const UnicodeString& var, const UnicodeString& value, 1587 UErrorCode& ec) { 1588 if (U_SUCCESS(ec)) { 1589 contents.put(var, new UnicodeString(value), ec); 1590 } 1591 } 1592 1593 /** 1594 * SymbolTable API 1595 */ 1596 virtual const UnicodeString* lookup(const UnicodeString& s) const { 1597 return (const UnicodeString*) contents.get(s); 1598 } 1599 1600 /** 1601 * SymbolTable API 1602 */ 1603 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const { 1604 return NULL; 1605 } 1606 1607 /** 1608 * SymbolTable API 1609 */ 1610 virtual UnicodeString parseReference(const UnicodeString& text, 1611 ParsePosition& pos, int32_t limit) const { 1612 int32_t start = pos.getIndex(); 1613 int32_t i = start; 1614 UnicodeString result; 1615 while (i < limit) { 1616 UChar c = text.charAt(i); 1617 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { 1618 break; 1619 } 1620 ++i; 1621 } 1622 if (i == start) { // No valid name chars 1623 return result; // Indicate failure with empty string 1624 } 1625 pos.setIndex(i); 1626 text.extractBetween(start, i, result); 1627 return result; 1628 } 1629}; 1630 1631void UnicodeSetTest::TestSymbolTable() { 1632 // Multiple test cases can be set up here. Each test case 1633 // is terminated by null: 1634 // var, value, var, value,..., input pat., exp. 
output pat., null 1635 const char* DATA[] = { 1636 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL, 1637 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL, 1638 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL, 1639 NULL 1640 }; 1641 1642 for (int32_t i=0; DATA[i]!=NULL; ++i) { 1643 UErrorCode ec = U_ZERO_ERROR; 1644 TokenSymbolTable sym(ec); 1645 if (U_FAILURE(ec)) { 1646 errln("FAIL: couldn't construct TokenSymbolTable"); 1647 continue; 1648 } 1649 1650 // Set up variables 1651 while (DATA[i+2] != NULL) { 1652 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec); 1653 if (U_FAILURE(ec)) { 1654 errln("FAIL: couldn't add to TokenSymbolTable"); 1655 continue; 1656 } 1657 i += 2; 1658 } 1659 1660 // Input pattern and expected output pattern 1661 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV); 1662 i += 2; 1663 1664 ParsePosition pos(0); 1665 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec); 1666 if (U_FAILURE(ec)) { 1667 errln("FAIL: couldn't construct UnicodeSet"); 1668 continue; 1669 } 1670 1671 // results 1672 if (pos.getIndex() != inpat.length()) { 1673 errln((UnicodeString)"Failed to read to end of string \"" 1674 + inpat + "\": read to " 1675 + pos.getIndex() + ", length is " 1676 + inpat.length()); 1677 } 1678 1679 UnicodeSet us2(exppat, ec); 1680 if (U_FAILURE(ec)) { 1681 errln("FAIL: couldn't construct expected UnicodeSet"); 1682 continue; 1683 } 1684 1685 UnicodeString a, b; 1686 if (us != us2) { 1687 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) + 1688 ", expected " + us2.toPattern(b, TRUE)); 1689 } else { 1690 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE)); 1691 } 1692 } 1693} 1694 1695void UnicodeSetTest::TestSurrogate() { 1696 const char* DATA[] = { 1697 // These should all behave identically 1698 "[abc\\uD800\\uDC00]", 1699 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java 1700 "[abc\\U00010000]", 1701 0 1702 }; 1703 for (int 
i=0; DATA[i] != 0; ++i) { 1704 UErrorCode ec = U_ZERO_ERROR; 1705 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV)); 1706 UnicodeString str = UnicodeString(DATA[i], -1, US_INV); 1707 UnicodeSet set(str, ec); 1708 if (U_FAILURE(ec)) { 1709 errln("FAIL: UnicodeSet constructor"); 1710 continue; 1711 } 1712 expectContainment(set, 1713 CharsToUnicodeString("abc\\U00010000"), 1714 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair 1715 if (set.size() != 4) { 1716 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " + 1717 set.size() + ", expected 4"); 1718 } 1719 } 1720} 1721 1722void UnicodeSetTest::TestExhaustive() { 1723 // exhaustive tests. Simulate UnicodeSets with integers. 1724 // That gives us very solid tests (except for large memory tests). 1725 1726 int32_t limit = 128; 1727 1728 UnicodeSet x, y, z, aa; 1729 1730 for (int32_t i = 0; i < limit; ++i) { 1731 bitsToSet(i, x); 1732 logln((UnicodeString)"Testing " + i + ", " + x); 1733 _testComplement(i, x, y); 1734 1735 // AS LONG AS WE ARE HERE, check roundtrip 1736 checkRoundTrip(bitsToSet(i, aa)); 1737 1738 for (int32_t j = 0; j < limit; ++j) { 1739 _testAdd(i,j, x,y,z); 1740 _testXor(i,j, x,y,z); 1741 _testRetain(i,j, x,y,z); 1742 _testRemove(i,j, x,y,z); 1743 } 1744 } 1745} 1746 1747void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) { 1748 bitsToSet(a, x); 1749 z = x; 1750 z.complement(); 1751 int32_t c = setToBits(z); 1752 if (c != (~a)) { 1753 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z); 1754 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c); 1755 } 1756 checkCanonicalRep(z, (UnicodeString)"complement " + a); 1757} 1758 1759void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1760 bitsToSet(a, x); 1761 bitsToSet(b, y); 1762 z = x; 1763 z.addAll(y); 1764 int32_t c = setToBits(z); 1765 if (c != (a | b)) { 1766 
errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z); 1767 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c); 1768 } 1769 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b); 1770} 1771 1772void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1773 bitsToSet(a, x); 1774 bitsToSet(b, y); 1775 z = x; 1776 z.retainAll(y); 1777 int32_t c = setToBits(z); 1778 if (c != (a & b)) { 1779 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z); 1780 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c); 1781 } 1782 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b); 1783} 1784 1785void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1786 bitsToSet(a, x); 1787 bitsToSet(b, y); 1788 z = x; 1789 z.removeAll(y); 1790 int32_t c = setToBits(z); 1791 if (c != (a &~ b)) { 1792 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z); 1793 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c); 1794 } 1795 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b); 1796} 1797 1798void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1799 bitsToSet(a, x); 1800 bitsToSet(b, y); 1801 z = x; 1802 z.complementAll(y); 1803 int32_t c = setToBits(z); 1804 if (c != (a ^ b)) { 1805 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z); 1806 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c); 1807 } 1808 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b); 1809} 1810 1811/** 1812 * Check that ranges are monotonically increasing and non- 1813 * overlapping. 
1814 */ 1815void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) { 1816 int32_t n = set.getRangeCount(); 1817 if (n < 0) { 1818 errln((UnicodeString)"FAIL result of " + msg + 1819 ": range count should be >= 0 but is " + 1820 n /*+ " for " + set.toPattern())*/); 1821 return; 1822 } 1823 UChar32 last = 0; 1824 for (int32_t i=0; i<n; ++i) { 1825 UChar32 start = set.getRangeStart(i); 1826 UChar32 end = set.getRangeEnd(i); 1827 if (start > end) { 1828 errln((UnicodeString)"FAIL result of " + msg + 1829 ": range " + (i+1) + 1830 " start > end: " + (int)start + ", " + (int)end + 1831 " for " + set); 1832 } 1833 if (i > 0 && start <= last) { 1834 errln((UnicodeString)"FAIL result of " + msg + 1835 ": range " + (i+1) + 1836 " overlaps previous range: " + (int)start + ", " + (int)end + 1837 " for " + set); 1838 } 1839 last = end; 1840 } 1841} 1842 1843/** 1844 * Convert a bitmask to a UnicodeSet. 1845 */ 1846UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) { 1847 result.clear(); 1848 for (UChar32 i = 0; i < 32; ++i) { 1849 if ((a & (1<<i)) != 0) { 1850 result.add(i); 1851 } 1852 } 1853 return result; 1854} 1855 1856/** 1857 * Convert a UnicodeSet to a bitmask. Only the characters 1858 * U+0000 to U+0020 are represented in the bitmask. 1859 */ 1860int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) { 1861 int32_t result = 0; 1862 for (int32_t i = 0; i < 32; ++i) { 1863 if (x.contains((UChar32)i)) { 1864 result |= (1<<i); 1865 } 1866 } 1867 return result; 1868} 1869 1870/** 1871 * Return the representation of an inversion list based UnicodeSet 1872 * as a pairs list. Ranges are listed in ascending Unicode order. 1873 * For example, the set [a-zA-M3] is represented as "33AMaz". 
1874 */ 1875UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) { 1876 UnicodeString pairs; 1877 for (int32_t i=0; i<set.getRangeCount(); ++i) { 1878 UChar32 start = set.getRangeStart(i); 1879 UChar32 end = set.getRangeEnd(i); 1880 if (end > 0xFFFF) { 1881 end = 0xFFFF; 1882 i = set.getRangeCount(); // Should be unnecessary 1883 } 1884 pairs.append((UChar)start).append((UChar)end); 1885 } 1886 return pairs; 1887} 1888 1889/** 1890 * Basic consistency check for a few items. 1891 * That the iterator works, and that we can create a pattern and 1892 * get the same thing back 1893 */ 1894void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) { 1895 UErrorCode ec = U_ZERO_ERROR; 1896 1897 UnicodeSet t(s); 1898 checkEqual(s, t, "copy ct"); 1899 1900 t = s; 1901 checkEqual(s, t, "operator="); 1902 1903 copyWithIterator(t, s, FALSE); 1904 checkEqual(s, t, "iterator roundtrip"); 1905 1906 copyWithIterator(t, s, TRUE); // try range 1907 checkEqual(s, t, "iterator roundtrip"); 1908 1909 UnicodeString pat; s.toPattern(pat, FALSE); 1910 t.applyPattern(pat, ec); 1911 if (U_FAILURE(ec)) { 1912 errln("FAIL: applyPattern"); 1913 return; 1914 } else { 1915 checkEqual(s, t, "toPattern(false)"); 1916 } 1917 1918 s.toPattern(pat, TRUE); 1919 t.applyPattern(pat, ec); 1920 if (U_FAILURE(ec)) { 1921 errln("FAIL: applyPattern"); 1922 return; 1923 } else { 1924 checkEqual(s, t, "toPattern(true)"); 1925 } 1926} 1927 1928void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) { 1929 t.clear(); 1930 UnicodeSetIterator it(s); 1931 if (withRange) { 1932 while (it.nextRange()) { 1933 if (it.isString()) { 1934 t.add(it.getString()); 1935 } else { 1936 t.add(it.getCodepoint(), it.getCodepointEnd()); 1937 } 1938 } 1939 } else { 1940 while (it.next()) { 1941 if (it.isString()) { 1942 t.add(it.getString()); 1943 } else { 1944 t.add(it.getCodepoint()); 1945 } 1946 } 1947 } 1948} 1949 1950UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const 
UnicodeSet& t, const char* message) { 1951 UnicodeString source; s.toPattern(source, TRUE); 1952 UnicodeString result; t.toPattern(result, TRUE); 1953 if (s != t) { 1954 errln((UnicodeString)"FAIL: " + message 1955 + "; source = " + source 1956 + "; result = " + result 1957 ); 1958 return FALSE; 1959 } else { 1960 logln((UnicodeString)"Ok: " + message 1961 + "; source = " + source 1962 + "; result = " + result 1963 ); 1964 } 1965 return TRUE; 1966} 1967 1968void 1969UnicodeSetTest::expectContainment(const UnicodeString& pat, 1970 const UnicodeString& charsIn, 1971 const UnicodeString& charsOut) { 1972 UErrorCode ec = U_ZERO_ERROR; 1973 UnicodeSet set(pat, ec); 1974 if (U_FAILURE(ec)) { 1975 dataerrln((UnicodeString)"FAIL: pattern \"" + 1976 pat + "\" => " + u_errorName(ec)); 1977 return; 1978 } 1979 expectContainment(set, pat, charsIn, charsOut); 1980} 1981 1982void 1983UnicodeSetTest::expectContainment(const UnicodeSet& set, 1984 const UnicodeString& charsIn, 1985 const UnicodeString& charsOut) { 1986 UnicodeString pat; 1987 set.toPattern(pat); 1988 expectContainment(set, pat, charsIn, charsOut); 1989} 1990 1991void 1992UnicodeSetTest::expectContainment(const UnicodeSet& set, 1993 const UnicodeString& setName, 1994 const UnicodeString& charsIn, 1995 const UnicodeString& charsOut) { 1996 UnicodeString bad; 1997 UChar32 c; 1998 int32_t i; 1999 2000 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) { 2001 c = charsIn.char32At(i); 2002 if (!set.contains(c)) { 2003 bad.append(c); 2004 } 2005 } 2006 if (bad.length() > 0) { 2007 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) + 2008 ", expected containment of " + prettify(charsIn)); 2009 } else { 2010 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn)); 2011 } 2012 2013 bad.truncate(0); 2014 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) { 2015 c = charsOut.char32At(i); 2016 if (set.contains(c)) { 2017 bad.append(c); 2018 } 2019 } 2020 if (bad.length() > 0) { 
2021 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) + 2022 ", expected non-containment of " + prettify(charsOut)); 2023 } else { 2024 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut)); 2025 } 2026} 2027 2028void 2029UnicodeSetTest::expectPattern(UnicodeSet& set, 2030 const UnicodeString& pattern, 2031 const UnicodeString& expectedPairs){ 2032 UErrorCode status = U_ZERO_ERROR; 2033 set.applyPattern(pattern, status); 2034 if (U_FAILURE(status)) { 2035 errln(UnicodeString("FAIL: applyPattern(\"") + pattern + 2036 "\") failed"); 2037 return; 2038 } else { 2039 if (getPairs(set) != expectedPairs ) { 2040 errln(UnicodeString("FAIL: applyPattern(\"") + pattern + 2041 "\") => pairs \"" + 2042 escape(getPairs(set)) + "\", expected \"" + 2043 escape(expectedPairs) + "\""); 2044 } else { 2045 logln(UnicodeString("Ok: applyPattern(\"") + pattern + 2046 "\") => pairs \"" + 2047 escape(getPairs(set)) + "\""); 2048 } 2049 } 2050 // the result of calling set.toPattern(), which is the string representation of 2051 // this set(set), is passed to a UnicodeSet constructor, and tested that it 2052 // will produce another set that is equal to this one. 
2053 UnicodeString temppattern; 2054 set.toPattern(temppattern); 2055 UnicodeSet *tempset=new UnicodeSet(temppattern, status); 2056 if (U_FAILURE(status)) { 2057 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern")); 2058 return; 2059 } 2060 if(*tempset != set || getPairs(*tempset) != getPairs(set)){ 2061 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" + 2062 escape(getPairs(set)) + "\"")); 2063 } else{ 2064 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\"")); 2065 } 2066 2067 delete tempset; 2068 2069} 2070 2071void 2072UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) { 2073 if (getPairs(set) != expectedPairs) { 2074 errln(UnicodeString("FAIL: Expected pair list \"") + 2075 escape(expectedPairs) + "\", got \"" + 2076 escape(getPairs(set)) + "\""); 2077 } 2078} 2079 2080void UnicodeSetTest::expectToPattern(const UnicodeSet& set, 2081 const UnicodeString& expPat, 2082 const char** expStrings) { 2083 UnicodeString pat; 2084 set.toPattern(pat, TRUE); 2085 if (pat == expPat) { 2086 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\""); 2087 } else { 2088 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\""); 2089 return; 2090 } 2091 if (expStrings == NULL) { 2092 return; 2093 } 2094 UBool in = TRUE; 2095 for (int32_t i=0; expStrings[i] != NULL; ++i) { 2096 if (expStrings[i] == NOT) { // sic; pointer comparison 2097 in = FALSE; 2098 continue; 2099 } 2100 UnicodeString s = CharsToUnicodeString(expStrings[i]); 2101 UBool contained = set.contains(s); 2102 if (contained == in) { 2103 logln((UnicodeString)"Ok: " + expPat + 2104 (contained ? 
" contains {" : " does not contain {") + 2105 escape(expStrings[i]) + "}"); 2106 } else { 2107 errln((UnicodeString)"FAIL: " + expPat + 2108 (contained ? " contains {" : " does not contain {") + 2109 escape(expStrings[i]) + "}"); 2110 } 2111 } 2112} 2113 2114static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); } 2115 2116void 2117UnicodeSetTest::doAssert(UBool condition, const char *message) 2118{ 2119 if (!condition) { 2120 errln(UnicodeString("ERROR : ") + message); 2121 } 2122} 2123 2124UnicodeString 2125UnicodeSetTest::escape(const UnicodeString& s) { 2126 UnicodeString buf; 2127 for (int32_t i=0; i<s.length(); ) 2128 { 2129 UChar32 c = s.char32At(i); 2130 if (0x0020 <= c && c <= 0x007F) { 2131 buf += c; 2132 } else { 2133 if (c <= 0xFFFF) { 2134 buf += (UChar)0x5c; buf += (UChar)0x75; 2135 } else { 2136 buf += (UChar)0x5c; buf += (UChar)0x55; 2137 buf += toHexString((c & 0xF0000000) >> 28); 2138 buf += toHexString((c & 0x0F000000) >> 24); 2139 buf += toHexString((c & 0x00F00000) >> 20); 2140 buf += toHexString((c & 0x000F0000) >> 16); 2141 } 2142 buf += toHexString((c & 0xF000) >> 12); 2143 buf += toHexString((c & 0x0F00) >> 8); 2144 buf += toHexString((c & 0x00F0) >> 4); 2145 buf += toHexString(c & 0x000F); 2146 } 2147 i += U16_LENGTH(c); 2148 } 2149 return buf; 2150} 2151 2152void UnicodeSetTest::TestFreezable() { 2153 UErrorCode errorCode=U_ZERO_ERROR; 2154 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15); 2155 UnicodeSet idSet(idPattern, errorCode); 2156 if(U_FAILURE(errorCode)) { 2157 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode)); 2158 return; 2159 } 2160 2161 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15); 2162 UnicodeSet wsSet(wsPattern, errorCode); 2163 if(U_FAILURE(errorCode)) { 2164 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode)); 2165 return; 2166 } 2167 2168 idSet.add(idPattern); 2169 UnicodeSet 
frozen(idSet); 2170 frozen.freeze(); 2171 2172 if(idSet.isFrozen() || !frozen.isFrozen()) { 2173 errln("FAIL: isFrozen() is wrong"); 2174 } 2175 if(frozen!=idSet || !(frozen==idSet)) { 2176 errln("FAIL: a copy-constructed frozen set differs from its original"); 2177 } 2178 2179 frozen=wsSet; 2180 if(frozen!=idSet || !(frozen==idSet)) { 2181 errln("FAIL: a frozen set was modified by operator="); 2182 } 2183 2184 UnicodeSet frozen2(frozen); 2185 if(frozen2!=frozen || frozen2!=idSet) { 2186 errln("FAIL: a copied frozen set differs from its frozen original"); 2187 } 2188 if(!frozen2.isFrozen()) { 2189 errln("FAIL: copy-constructing a frozen set results in a thawed one"); 2190 } 2191 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction. 2192 if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) { 2193 errln("FAIL: UnicodeSet(5, 55) failed"); 2194 } 2195 frozen3=frozen; 2196 if(!frozen3.isFrozen()) { 2197 errln("FAIL: copying a frozen set results in a thawed one"); 2198 } 2199 2200 UnicodeSet *cloned=(UnicodeSet *)frozen.clone(); 2201 if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) { 2202 errln("FAIL: clone() failed"); 2203 } 2204 cloned->add(0xd802, 0xd805); 2205 if(cloned->containsSome(0xd802, 0xd805)) { 2206 errln("FAIL: unable to modify clone"); 2207 } 2208 delete cloned; 2209 2210 UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed(); 2211 if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) { 2212 errln("FAIL: cloneAsThawed() failed"); 2213 } 2214 thawed->add(0xd802, 0xd805); 2215 if(!thawed->contains(0xd802, 0xd805)) { 2216 errln("FAIL: unable to modify thawed clone"); 2217 } 2218 delete thawed; 2219 2220 frozen.set(5, 55); 2221 if(frozen!=idSet || !(frozen==idSet)) { 2222 errln("FAIL: UnicodeSet::set() modified a frozen set"); 2223 } 2224 2225 frozen.clear(); 2226 if(frozen!=idSet || !(frozen==idSet)) { 2227 
errln("FAIL: UnicodeSet::clear() modified a frozen set"); 2228 } 2229 2230 frozen.closeOver(USET_CASE_INSENSITIVE); 2231 if(frozen!=idSet || !(frozen==idSet)) { 2232 errln("FAIL: UnicodeSet::closeOver() modified a frozen set"); 2233 } 2234 2235 frozen.compact(); 2236 if(frozen!=idSet || !(frozen==idSet)) { 2237 errln("FAIL: UnicodeSet::compact() modified a frozen set"); 2238 } 2239 2240 ParsePosition pos; 2241 frozen. 2242 applyPattern(wsPattern, errorCode). 2243 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode). 2244 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode). 2245 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode). 2246 applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode); 2247 if(frozen!=idSet || !(frozen==idSet)) { 2248 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set"); 2249 } 2250 2251 frozen. 2252 add(0xd800). 2253 add(0xd802, 0xd805). 2254 add(wsPattern). 2255 addAll(idPattern). 2256 addAll(wsSet); 2257 if(frozen!=idSet || !(frozen==idSet)) { 2258 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set"); 2259 } 2260 2261 frozen. 2262 retain(0x62). 2263 retain(0x64, 0x69). 2264 retainAll(wsPattern). 2265 retainAll(wsSet); 2266 if(frozen!=idSet || !(frozen==idSet)) { 2267 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set"); 2268 } 2269 2270 frozen. 2271 remove(0x62). 2272 remove(0x64, 0x69). 2273 remove(idPattern). 2274 removeAll(idPattern). 2275 removeAll(idSet); 2276 if(frozen!=idSet || !(frozen==idSet)) { 2277 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set"); 2278 } 2279 2280 frozen. 2281 complement(). 2282 complement(0x62). 2283 complement(0x64, 0x69). 2284 complement(idPattern). 2285 complementAll(idPattern). 2286 complementAll(idSet); 2287 if(frozen!=idSet || !(frozen==idSet)) { 2288 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set"); 2289 } 2290} 2291 2292// Test span() etc. 
-------------------------------------------------------- *** 2293 2294// Append the UTF-8 version of the string to t and return the appended UTF-8 length. 2295static int32_t 2296appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) { 2297 UErrorCode errorCode=U_ZERO_ERROR; 2298 int32_t length8=0; 2299 u_strToUTF8(t, capacity, &length8, s, length, &errorCode); 2300 if(U_SUCCESS(errorCode)) { 2301 return length8; 2302 } else { 2303 // The string contains an unpaired surrogate. 2304 // Ignore this string. 2305 return 0; 2306 } 2307} 2308 2309class UnicodeSetWithStringsIterator; 2310 2311// Make the strings in a UnicodeSet easily accessible. 2312class UnicodeSetWithStrings { 2313public: 2314 UnicodeSetWithStrings(const UnicodeSet &normalSet) : 2315 set(normalSet), stringsLength(0), hasSurrogates(FALSE) { 2316 int32_t size=set.size(); 2317 if(size>0 && set.charAt(size-1)<0) { 2318 // If a set's last element is not a code point, then it must contain strings. 2319 // Iterate over the set, skip all code point ranges, and cache the strings. 2320 // Convert them to UTF-8 for spanUTF8(). 2321 UnicodeSetIterator iter(set); 2322 const UnicodeString *s; 2323 char *s8=utf8; 2324 int32_t length8, utf8Count=0; 2325 while(iter.nextRange() && stringsLength<LENGTHOF(strings)) { 2326 if(iter.isString()) { 2327 // Store the pointer to the set's string element 2328 // which we happen to know is a stable pointer. 2329 strings[stringsLength]=s=&iter.getString(); 2330 utf8Count+= 2331 utf8Lengths[stringsLength]=length8= 2332 appendUTF8(s->getBuffer(), s->length(), 2333 s8, (int32_t)(sizeof(utf8)-utf8Count)); 2334 if(length8==0) { 2335 hasSurrogates=TRUE; // Contains unpaired surrogates. 
2336 } 2337 s8+=length8; 2338 ++stringsLength; 2339 } 2340 } 2341 } 2342 } 2343 2344 const UnicodeSet &getSet() const { 2345 return set; 2346 } 2347 2348 UBool hasStrings() const { 2349 return (UBool)(stringsLength>0); 2350 } 2351 2352 UBool hasStringsWithSurrogates() const { 2353 return hasSurrogates; 2354 } 2355 2356private: 2357 friend class UnicodeSetWithStringsIterator; 2358 2359 const UnicodeSet &set; 2360 2361 const UnicodeString *strings[20]; 2362 int32_t stringsLength; 2363 UBool hasSurrogates; 2364 2365 char utf8[1024]; 2366 int32_t utf8Lengths[20]; 2367}; 2368 2369class UnicodeSetWithStringsIterator { 2370public: 2371 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) : 2372 fSet(set), nextStringIndex(0), nextUTF8Start(0) { 2373 } 2374 2375 void reset() { 2376 nextStringIndex=nextUTF8Start=0; 2377 } 2378 2379 const UnicodeString *nextString() { 2380 if(nextStringIndex<fSet.stringsLength) { 2381 return fSet.strings[nextStringIndex++]; 2382 } else { 2383 return NULL; 2384 } 2385 } 2386 2387 // Do not mix with calls to nextString(). 2388 const char *nextUTF8(int32_t &length) { 2389 if(nextStringIndex<fSet.stringsLength) { 2390 const char *s8=fSet.utf8+nextUTF8Start; 2391 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++]; 2392 return s8; 2393 } else { 2394 length=0; 2395 return NULL; 2396 } 2397 } 2398 2399private: 2400 const UnicodeSetWithStrings &fSet; 2401 int32_t nextStringIndex; 2402 int32_t nextUTF8Start; 2403}; 2404 2405// Compare 16-bit Unicode strings (which may be malformed UTF-16) 2406// at code point boundaries. 2407// That is, each edge of a match must not be in the middle of a surrogate pair. 
// Returns TRUE if t occurs in s at index start and neither edge of the match
// splits a surrogate pair. limit is the end index of the text in s.
// The caller must ensure that t fits between start and limit.
static inline UBool
matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
    s+=start;      // from here on, s points at the candidate match
    limit-=start;  // and limit is relative to the candidate match
    int32_t length=t.length();
    return 0==t.compare(s, length) &&
           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
}

// Implement span() with contains() for comparison.
// Returns the length of the initial span of s[0..length) for the given
// span condition. Three cases are handled separately:
//  - the set has no strings: a plain code point loop;
//  - USET_SPAN_NOT_CONTAINED: stop at the first contained code point or
//    at the first position where any set string matches;
//  - USET_SPAN_CONTAINED / USET_SPAN_SIMPLE: advance over contained code points
//    and matching strings; for CONTAINED, recurse for all but the shortest
//    match at each position and return the farthest limit reached.
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev trails start by one code point so that the code point
        // which ends the span is not included in the result.
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // The result is the farther of the iterated position and
        // the best limit found by recursion.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}

// Backward counterpart of containsSpanUTF16(): implements spanBack() with
// contains() for comparison. Returns the start index of the span that ends
// at length. The parameter length is reused as the moving position while
// length0 (where present) keeps the original end of the text.
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // The result is the earlier of the iterated position and
        // the best start found by recursion.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}

// UTF-8 counterpart of containsSpanUTF16(): same structure, but code points
// are read with U8_NEXT_OR_FFFD and set strings are compared bytewise via
// memcmp() against their cached UTF-8 forms. Strings with a cached UTF-8
// length of 0 are skipped.
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        while((prev=start)<length) {
            U8_NEXT_OR_FFFD(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}

// UTF-8 counterpart of containsSpanBackUTF16(): same structure, but code
// points are read with U8_PREV_OR_FFFD and set strings are compared bytewise
// against their cached UTF-8 forms.
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}

// spans to be performed and compared
// Bit flags; the third entry of each group is the mask of the two before it,
// and SPAN_ALL is the union of all groups.
enum {
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =3,

    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =0xc,

    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =0x30,

    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =0x300,

    SPAN_ALL            =0x33f
};

// Toggles between USET_SPAN_NOT_CONTAINED and the given "contained" condition:
// NOT_CONTAINED becomes contained, anything else becomes NOT_CONTAINED.
static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
    return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
}

// Returns the length of a NUL-terminated string that is either
// UTF-16 (isUTF16 is true, s points to UChars) or char-based.
static inline int32_t slen(const void *s, UBool isUTF16) {
    return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
}

/*
 * Count spans on a string with the method according to type and set the span limits.
 * The set may be the complement of the original.
 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
 * according to the expected number of spans.
 * Sets typeName to an empty string if there is no such type.
 * Returns -1 if the span option is filtered out.
2853 */ 2854static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement, 2855 const void *s, int32_t length, UBool isUTF16, 2856 uint32_t whichSpans, 2857 int type, const char *&typeName, 2858 int32_t limits[], int32_t limitsCapacity, 2859 int32_t expectCount) { 2860 const UnicodeSet &realSet(set.getSet()); 2861 int32_t start, count; 2862 USetSpanCondition spanCondition, firstSpanCondition, contained; 2863 UBool isForward; 2864 2865 if(type<0 || 7<type) { 2866 typeName=""; 2867 return 0; 2868 } 2869 2870 static const char *const typeNames16[]={ 2871 "contains", "contains(LM)", 2872 "span", "span(LM)", 2873 "containsBack", "containsBack(LM)", 2874 "spanBack", "spanBack(LM)" 2875 }; 2876 2877 static const char *const typeNames8[]={ 2878 "containsUTF8", "containsUTF8(LM)", 2879 "spanUTF8", "spanUTF8(LM)", 2880 "containsBackUTF8", "containsBackUTF8(LM)", // not implemented 2881 "spanBackUTF8", "spanBackUTF8(LM)" 2882 }; 2883 2884 typeName= isUTF16 ? typeNames16[type] : typeNames8[type]; 2885 2886 // filter span options 2887 if(type<=3) { 2888 // span forward 2889 if((whichSpans&SPAN_FWD)==0) { 2890 return -1; 2891 } 2892 isForward=TRUE; 2893 } else { 2894 // span backward 2895 if((whichSpans&SPAN_BACK)==0) { 2896 return -1; 2897 } 2898 isForward=FALSE; 2899 } 2900 if((type&1)==0) { 2901 // use USET_SPAN_CONTAINED 2902 if((whichSpans&SPAN_CONTAINED)==0) { 2903 return -1; 2904 } 2905 contained=USET_SPAN_CONTAINED; 2906 } else { 2907 // use USET_SPAN_SIMPLE 2908 if((whichSpans&SPAN_SIMPLE)==0) { 2909 return -1; 2910 } 2911 contained=USET_SPAN_SIMPLE; 2912 } 2913 2914 // Default first span condition for going forward with an uncomplemented set. 2915 spanCondition=USET_SPAN_NOT_CONTAINED; 2916 if(isComplement) { 2917 spanCondition=invertSpanCondition(spanCondition, contained); 2918 } 2919 2920 // First span condition for span(), used to terminate the spanBack() iteration. 
2921 firstSpanCondition=spanCondition; 2922 2923 // spanBack(): Its initial span condition is span()'s last span condition, 2924 // which is the opposite of span()'s first span condition 2925 // if we expect an even number of spans. 2926 // (The loop inverts spanCondition (expectCount-1) times 2927 // before the expectCount'th span() call.) 2928 // If we do not compare forward and backward directions, then we do not have an 2929 // expectCount and just start with firstSpanCondition. 2930 if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) { 2931 spanCondition=invertSpanCondition(spanCondition, contained); 2932 } 2933 2934 count=0; 2935 switch(type) { 2936 case 0: 2937 case 1: 2938 start=0; 2939 if(length<0) { 2940 length=slen(s, isUTF16); 2941 } 2942 for(;;) { 2943 start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) : 2944 containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition); 2945 if(count<limitsCapacity) { 2946 limits[count]=start; 2947 } 2948 ++count; 2949 if(start>=length) { 2950 break; 2951 } 2952 spanCondition=invertSpanCondition(spanCondition, contained); 2953 } 2954 break; 2955 case 2: 2956 case 3: 2957 start=0; 2958 for(;;) { 2959 start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) : 2960 realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition); 2961 if(count<limitsCapacity) { 2962 limits[count]=start; 2963 } 2964 ++count; 2965 if(length>=0 ? start>=length : 2966 isUTF16 ? ((const UChar *)s)[start]==0 : 2967 ((const char *)s)[start]==0 2968 ) { 2969 break; 2970 } 2971 spanCondition=invertSpanCondition(spanCondition, contained); 2972 } 2973 break; 2974 case 4: 2975 case 5: 2976 if(length<0) { 2977 length=slen(s, isUTF16); 2978 } 2979 for(;;) { 2980 ++count; 2981 if(count<=limitsCapacity) { 2982 limits[limitsCapacity-count]=length; 2983 } 2984 length= isUTF16 ? 
containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) : 2985 containsSpanBackUTF8(set, (const char *)s, length, spanCondition); 2986 if(length==0 && spanCondition==firstSpanCondition) { 2987 break; 2988 } 2989 spanCondition=invertSpanCondition(spanCondition, contained); 2990 } 2991 if(count<limitsCapacity) { 2992 memmove(limits, limits+(limitsCapacity-count), count*4); 2993 } 2994 break; 2995 case 6: 2996 case 7: 2997 for(;;) { 2998 ++count; 2999 if(count<=limitsCapacity) { 3000 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16); 3001 } 3002 // Note: Length<0 is tested only for the first spanBack(). 3003 // If we wanted to keep length<0 for all spanBack()s, we would have to 3004 // temporarily modify the string by placing a NUL where the previous spanBack() stopped. 3005 length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) : 3006 realSet.spanBackUTF8((const char *)s, length, spanCondition); 3007 if(length==0 && spanCondition==firstSpanCondition) { 3008 break; 3009 } 3010 spanCondition=invertSpanCondition(spanCondition, contained); 3011 } 3012 if(count<limitsCapacity) { 3013 memmove(limits, limits+(limitsCapacity-count), count*4); 3014 } 3015 break; 3016 default: 3017 typeName=""; 3018 return -1; 3019 } 3020 3021 return count; 3022} 3023 3024// sets to be tested; odd index=isComplement 3025enum { 3026 SLOW, 3027 SLOW_NOT, 3028 FAST, 3029 FAST_NOT, 3030 SET_COUNT 3031}; 3032 3033static const char *const setNames[SET_COUNT]={ 3034 "slow", 3035 "slow.not", 3036 "fast", 3037 "fast.not" 3038}; 3039 3040/* 3041 * Verify that we get the same results whether we look at text with contains(), 3042 * span() or spanBack(), using unfrozen or frozen versions of the set, 3043 * and using the set or its complement (switching the spanConditions accordingly). 3044 * The latter verifies that 3045 * set.span(spanCondition) == set.complement().span(!spanCondition). 
3046 * 3047 * The expectLimits[] are either provided by the caller (with expectCount>=0) 3048 * or returned to the caller (with an input expectCount<0). 3049 */ 3050void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4], 3051 const void *s, int32_t length, UBool isUTF16, 3052 uint32_t whichSpans, 3053 int32_t expectLimits[], int32_t &expectCount, 3054 const char *testName, int32_t index) { 3055 int32_t limits[500]; 3056 int32_t limitsCount; 3057 int i, j; 3058 3059 const char *typeName; 3060 int type; 3061 3062 for(i=0; i<SET_COUNT; ++i) { 3063 if((i&1)==0) { 3064 // Even-numbered sets are original, uncomplemented sets. 3065 if((whichSpans&SPAN_SET)==0) { 3066 continue; 3067 } 3068 } else { 3069 // Odd-numbered sets are complemented. 3070 if((whichSpans&SPAN_COMPLEMENT)==0) { 3071 continue; 3072 } 3073 } 3074 for(type=0;; ++type) { 3075 limitsCount=getSpans(*sets[i], (UBool)(i&1), 3076 s, length, isUTF16, 3077 whichSpans, 3078 type, typeName, 3079 limits, LENGTHOF(limits), expectCount); 3080 if(typeName[0]==0) { 3081 break; // All types tried. 3082 } 3083 if(limitsCount<0) { 3084 continue; // Span option filtered out. 
3085 } 3086 if(expectCount<0) { 3087 expectCount=limitsCount; 3088 if(limitsCount>LENGTHOF(limits)) { 3089 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans", 3090 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits)); 3091 return; 3092 } 3093 memcpy(expectLimits, limits, limitsCount*4); 3094 } else if(limitsCount!=expectCount) { 3095 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld", 3096 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount); 3097 } else { 3098 for(j=0; j<limitsCount; ++j) { 3099 if(limits[j]!=expectLimits[j]) { 3100 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld", 3101 testName, (long)index, setNames[i], typeName, (long)limitsCount, 3102 j, (long)limits[j], (long)expectLimits[j]); 3103 break; 3104 } 3105 } 3106 } 3107 } 3108 } 3109 3110 // Compare span() with containsAll()/containsNone(), 3111 // but only if we have expectLimits[] from the uncomplemented set. 
3112 if(isUTF16 && (whichSpans&SPAN_SET)!=0) { 3113 const UChar *s16=(const UChar *)s; 3114 UnicodeString string; 3115 int32_t prev=0, limit, length; 3116 for(i=0; i<expectCount; ++i) { 3117 limit=expectLimits[i]; 3118 length=limit-prev; 3119 if(length>0) { 3120 string.setTo(FALSE, s16+prev, length); // read-only alias 3121 if(i&1) { 3122 if(!sets[SLOW]->getSet().containsAll(string)) { 3123 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()", 3124 testName, (long)index, setNames[SLOW], (long)prev, (long)limit); 3125 return; 3126 } 3127 if(!sets[FAST]->getSet().containsAll(string)) { 3128 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()", 3129 testName, (long)index, setNames[FAST], (long)prev, (long)limit); 3130 return; 3131 } 3132 } else { 3133 if(!sets[SLOW]->getSet().containsNone(string)) { 3134 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()", 3135 testName, (long)index, setNames[SLOW], (long)prev, (long)limit); 3136 return; 3137 } 3138 if(!sets[FAST]->getSet().containsNone(string)) { 3139 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()", 3140 testName, (long)index, setNames[FAST], (long)prev, (long)limit); 3141 return; 3142 } 3143 } 3144 } 3145 prev=limit; 3146 } 3147 } 3148} 3149 3150// Specifically test either UTF-16 or UTF-8. 
3151void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4], 3152 const void *s, int32_t length, UBool isUTF16, 3153 uint32_t whichSpans, 3154 const char *testName, int32_t index) { 3155 int32_t expectLimits[500]; 3156 int32_t expectCount=-1; 3157 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index); 3158} 3159 3160UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) { 3161 UChar c, c2; 3162 3163 if(length>=0) { 3164 while(length>0) { 3165 c=*s++; 3166 --length; 3167 if(0xd800<=c && c<0xe000) { 3168 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) { 3169 return TRUE; 3170 } 3171 --length; 3172 } 3173 } 3174 } else { 3175 while((c=*s++)!=0) { 3176 if(0xd800<=c && c<0xe000) { 3177 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) { 3178 return TRUE; 3179 } 3180 } 3181 } 3182 } 3183 return FALSE; 3184} 3185 3186// Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text, 3187// unless either UTF is turned off in whichSpans. 3188// Testing UTF-16 and UTF-8 together requires that surrogate code points 3189// have the same contains(c) value as U+FFFD. 3190void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4], 3191 const UChar *s16, int32_t length16, 3192 uint32_t whichSpans, 3193 const char *testName, int32_t index) { 3194 int32_t expectLimits[500]; 3195 int32_t expectCount; 3196 3197 expectCount=-1; // Get expectLimits[] from testSpan(). 3198 3199 if((whichSpans&SPAN_UTF16)!=0) { 3200 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index); 3201 } 3202 if((whichSpans&SPAN_UTF8)==0) { 3203 return; 3204 } 3205 3206 // Convert s16[] and expectLimits[] to UTF-8. 
3207 uint8_t s8[3000]; 3208 int32_t offsets[3000]; 3209 3210 const UChar *s16Limit=s16+length16; 3211 char *t=(char *)s8; 3212 char *tLimit=t+sizeof(s8); 3213 int32_t *o=offsets; 3214 UErrorCode errorCode=U_ZERO_ERROR; 3215 3216 // Convert with substitution: Turn unpaired surrogates into U+FFFD. 3217 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode); 3218 if(U_FAILURE(errorCode)) { 3219 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s", 3220 testName, (long)index, u_errorName(errorCode)); 3221 ucnv_resetFromUnicode(utf8Cnv); 3222 return; 3223 } 3224 int32_t length8=(int32_t)(t-(char *)s8); 3225 3226 // Convert expectLimits[]. 3227 int32_t i, j, expect; 3228 for(i=j=0; i<expectCount; ++i) { 3229 expect=expectLimits[i]; 3230 if(expect==length16) { 3231 expectLimits[i]=length8; 3232 } else { 3233 while(offsets[j]<expect) { 3234 ++j; 3235 } 3236 expectLimits[i]=j; 3237 } 3238 } 3239 3240 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index); 3241} 3242 3243static UChar32 nextCodePoint(UChar32 c) { 3244 // Skip some large and boring ranges. 3245 switch(c) { 3246 case 0x3441: 3247 return 0x4d7f; 3248 case 0x5100: 3249 return 0x9f00; 3250 case 0xb040: 3251 return 0xd780; 3252 case 0xe041: 3253 return 0xf8fe; 3254 case 0x10100: 3255 return 0x20000; 3256 case 0x20041: 3257 return 0xe0000; 3258 case 0xe0101: 3259 return 0x10fffd; 3260 default: 3261 return c+1; 3262 } 3263} 3264 3265// Verify that all implementations represent the same set. 3266void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3267 // contains(U+FFFD) is inconsistent with contains(some surrogates), 3268 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8: 3269 // Skip the UTF-8 part of the test - if the string contains surrogates - 3270 // because it is likely to produce a different result. 
3271 UBool inconsistentSurrogates= 3272 (!(sets[0]->getSet().contains(0xfffd) ? 3273 sets[0]->getSet().contains(0xd800, 0xdfff) : 3274 sets[0]->getSet().containsNone(0xd800, 0xdfff)) || 3275 sets[0]->hasStringsWithSurrogates()); 3276 3277 UChar s[1000]; 3278 int32_t length=0; 3279 uint32_t localWhichSpans; 3280 3281 UChar32 c, first; 3282 for(first=c=0;; c=nextCodePoint(c)) { 3283 if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) { 3284 localWhichSpans=whichSpans; 3285 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) { 3286 localWhichSpans&=~SPAN_UTF8; 3287 } 3288 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first); 3289 if(c>0x10ffff) { 3290 break; 3291 } 3292 length=0; 3293 first=c; 3294 } 3295 U16_APPEND_UNSAFE(s, length, c); 3296 } 3297} 3298 3299// Test with a particular, interesting string. 3300// Specify length and try NUL-termination. 3301void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3302 static const UChar s[]={ 3303 0x61, 0x62, 0x20, // Latin, space 3304 0x3b1, 0x3b2, 0x3b3, // Greek 3305 0xd900, // lead surrogate 3306 0x3000, 0x30ab, 0x30ad, // wide space, Katakana 3307 0xdc05, // trail surrogate 3308 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul 3309 0xd900, 0xdc05, // unassigned supplementary 3310 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary 3311 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS 3312 0 // NUL 3313 }; 3314 3315 if((whichSpans&SPAN_UTF16)==0) { 3316 return; 3317 } 3318 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0); 3319 testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1); 3320} 3321 3322void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3323 static const char s[]={ 3324 "abc" // Latin 3325 3326 /* trail byte in lead position */ 3327 "\x80" 3328 3329 " " // space 3330 3331 /* 
truncated multi-byte sequences */ 3332 "\xd0" 3333 "\xe0" 3334 "\xe1" 3335 "\xed" 3336 "\xee" 3337 "\xf0" 3338 "\xf1" 3339 "\xf4" 3340 "\xf8" 3341 "\xfc" 3342 3343 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek 3344 3345 /* trail byte in lead position */ 3346 "\x80" 3347 3348 "\xe0\x80" 3349 "\xe0\xa0" 3350 "\xe1\x80" 3351 "\xed\x80" 3352 "\xed\xa0" 3353 "\xee\x80" 3354 "\xf0\x80" 3355 "\xf0\x90" 3356 "\xf1\x80" 3357 "\xf4\x80" 3358 "\xf4\x90" 3359 "\xf8\x80" 3360 "\xfc\x80" 3361 3362 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana 3363 3364 /* trail byte in lead position */ 3365 "\x80" 3366 3367 "\xf0\x80\x80" 3368 "\xf0\x90\x80" 3369 "\xf1\x80\x80" 3370 "\xf4\x80\x80" 3371 "\xf4\x90\x80" 3372 "\xf8\x80\x80" 3373 "\xfc\x80\x80" 3374 3375 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul 3376 3377 /* trail byte in lead position */ 3378 "\x80" 3379 3380 "\xf8\x80\x80\x80" 3381 "\xfc\x80\x80\x80" 3382 3383 "\xF1\x90\x80\x85" // unassigned supplementary 3384 3385 /* trail byte in lead position */ 3386 "\x80" 3387 3388 "\xfc\x80\x80\x80\x80" 3389 3390 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary 3391 3392 /* trail byte in lead position */ 3393 "\x80" 3394 3395 /* complete sequences but non-shortest forms or out of range etc. */ 3396 "\xc0\x80" 3397 "\xe0\x80\x80" 3398 "\xed\xa0\x80" 3399 "\xf0\x80\x80\x80" 3400 "\xf4\x90\x80\x80" 3401 "\xf8\x80\x80\x80\x80" 3402 "\xfc\x80\x80\x80\x80\x80" 3403 "\xfe" 3404 "\xff" 3405 3406 /* trail byte in lead position */ 3407 "\x80" 3408 3409 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated 3410 }; 3411 3412 if((whichSpans&SPAN_UTF8)==0) { 3413 return; 3414 } 3415 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0); 3416 testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1); 3417} 3418 3419// Take a set of span options and multiply them so that 3420// each portion only has one of the options a, b and c. 
3421// If b==0, then the set of options is just modified with mask and a. 3422// If b!=0 and c==0, then the set of options is just modified with mask, a and b. 3423static int32_t 3424addAlternative(uint32_t whichSpans[], int32_t whichSpansCount, 3425 uint32_t mask, uint32_t a, uint32_t b, uint32_t c) { 3426 uint32_t s; 3427 int32_t i; 3428 3429 for(i=0; i<whichSpansCount; ++i) { 3430 s=whichSpans[i]&mask; 3431 whichSpans[i]=s|a; 3432 if(b!=0) { 3433 whichSpans[whichSpansCount+i]=s|b; 3434 if(c!=0) { 3435 whichSpans[2*whichSpansCount+i]=s|c; 3436 } 3437 } 3438 } 3439 return b==0 ? whichSpansCount : c==0 ? 2*whichSpansCount : 3*whichSpansCount; 3440} 3441 3442#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 3443#define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 3444#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" 3445#define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" 3446 3447void UnicodeSetTest::TestSpan() { 3448 // "[...]" is a UnicodeSet pattern. 3449 // "*" performs tests on all Unicode code points and on a selection of 3450 // malformed UTF-8/16 strings. 3451 // "-options" limits the scope of testing for the current set. 3452 // By default, the test verifies that equivalent boundaries are found 3453 // for UTF-16 and UTF-8, going forward and backward, 3454 // alternating USET_SPAN_NOT_CONTAINED with 3455 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE. 3456 // Single-character options: 3457 // 8 -- UTF-16 and UTF-8 boundaries may differ. 3458 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates), 3459 // or the set contains strings with unpaired surrogates 3460 // which do not translate to valid UTF-8. 3461 // c -- set.span() and set.complement().span() boundaries may differ. 3462 // Cause: Set strings are not complemented. 3463 // b -- span() and spanBack() boundaries may differ. 
3464 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED) 3465 // and spanBack(USET_SPAN_SIMPLE) are defined to 3466 // match with non-overlapping substrings. 3467 // For example, with a set containing "ab" and "ba", 3468 // span() of "aba" yields boundaries { 0, 2, 3 } 3469 // because the initial "ab" matches from 0 to 2, 3470 // while spanBack() yields boundaries { 0, 1, 3 } 3471 // because the final "ba" matches from 1 to 3. 3472 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ. 3473 // Cause: Strings in the set overlap, and a longer match may 3474 // require a sequence including non-longest substrings. 3475 // For example, with a set containing "ab", "abc" and "cd", 3476 // span(contained) of "abcd" spans the entire string 3477 // but span(longest match) only spans the first 3 characters. 3478 // Each "-options" first resets all options and then applies the specified options. 3479 // A "-" without options resets the options. 3480 // The options are also reset for each new set. 3481 // Other strings will be spanned. 3482 static const char *const testdata[]={ 3483 "[:ID_Continue:]", 3484 "*", 3485 "[:White_Space:]", 3486 "*", 3487 "[]", 3488 "*", 3489 "[\\u0000-\\U0010FFFF]", 3490 "*", 3491 "[\\u0000\\u0080\\u0800\\U00010000]", 3492 "*", 3493 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]", 3494 "*", 3495 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]", 3496 "-c", 3497 "*", 3498 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]", 3499 "-c", 3500 "*", 3501 3502 // Overlapping strings cause overlapping attempts to match. 3503 "[x{xy}{xya}{axy}{ax}]", 3504 "-cl", 3505 3506 // More repetitions of "xya" would take too long with the recursive 3507 // reference implementation. 3508 // containsAll()=FALSE 3509 // test_string 0x14 3510 "xx" 3511 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here. 
3512 "xx" // set.complement().span(contained) will stop between the two 'x'es. 3513 "xyaxyaxyaxya" 3514 "xx" 3515 "xyaxyaxyaxya" // span() ends here. 3516 "aaa", 3517 3518 // containsAll()=TRUE 3519 // test_string 0x15 3520 "xx" 3521 "xyaxyaxyaxya" 3522 "xx" 3523 "xyaxyaxyaxya" 3524 "xx" 3525 "xyaxyaxyaxy", 3526 3527 "-bc", 3528 // test_string 0x17 3529 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 } 3530 "-c", 3531 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 } 3532 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 } 3533 "-", 3534 "byaya", // span() -> { 5 } 3535 "byay", // span() -> { 4 } 3536 "bya", // span() -> { 3 } 3537 3538 // span(longest match) will not span the whole string. 3539 "[a{ab}{bc}]", 3540 "-cl", 3541 // test_string 0x21 3542 "abc", 3543 3544 "[a{ab}{abc}{cd}]", 3545 "-cl", 3546 "acdabcdabccd", 3547 3548 // spanBack(longest match) will not span the whole string. 3549 "[c{ab}{bc}]", 3550 "-cl", 3551 "abc", 3552 3553 "[d{cd}{bcd}{ab}]", 3554 "-cl", 3555 "abbcdabcdabd", 3556 3557 // Test with non-ASCII set strings - test proper handling of surrogate pairs 3558 // and UTF-8 trail bytes. 3559 // Copies of above test sets and strings, but transliterated to have 3560 // different code points with similar trail units. 3561 // Previous: a b c d 3562 // Unicode: 042B 30AB 200AB 204AB 3563 // UTF-16: 042B 30AB D840 DCAB D841 DCAB 3564 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB 3565 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]", 3566 "-cl", 3567 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB", 3568 3569 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]", 3570 "-cl", 3571 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB", 3572 3573 // Stress bookkeeping and recursion. 
3574 // The following strings are barely doable with the recursive 3575 // reference implementation. 3576 // The not-contained character at the end prevents an early exit from the span(). 3577 "[b{bb}]", 3578 "-c", 3579 // test_string 0x33 3580 "bbbbbbbbbbbbbbbbbbbbbbbb-", 3581 // On complement sets, span() and spanBack() get different results 3582 // because b is not in the complement set and there is an odd number of b's 3583 // in the test string. 3584 "-bc", 3585 "bbbbbbbbbbbbbbbbbbbbbbbbb-", 3586 3587 // Test with set strings with an initial or final code point span 3588 // longer than 254. 3589 "[a{" _64_a _64_a _64_a _64_a "b}" 3590 "{a" _64_b _64_b _64_b _64_b "}]", 3591 "-c", 3592 _64_a _64_a _64_a _63_a "b", 3593 _64_a _64_a _64_a _64_a "b", 3594 _64_a _64_a _64_a _64_a "aaaabbbb", 3595 "a" _64_b _64_b _64_b _63_b, 3596 "a" _64_b _64_b _64_b _64_b, 3597 "aaaabbbb" _64_b _64_b _64_b _64_b, 3598 3599 // Test with strings containing unpaired surrogates. 3600 // They are not representable in UTF-8, and a leading trail surrogate 3601 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair. 3602 // U+20001 == \\uD840\\uDC01 3603 // U+20400 == \\uD841\\uDC00 3604 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]", 3605 "-8cl", 3606 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb" 3607 }; 3608 uint32_t whichSpans[96]={ SPAN_ALL }; 3609 int32_t whichSpansCount=1; 3610 3611 UnicodeSet *sets[SET_COUNT]={ NULL }; 3612 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL }; 3613 3614 char testName[1024]; 3615 char *testNameLimit=testName; 3616 3617 int32_t i, j; 3618 for(i=0; i<LENGTHOF(testdata); ++i) { 3619 const char *s=testdata[i]; 3620 if(s[0]=='[') { 3621 // Create new test sets from this pattern. 
3622 for(j=0; j<SET_COUNT; ++j) { 3623 delete sets_with_str[j]; 3624 delete sets[j]; 3625 } 3626 UErrorCode errorCode=U_ZERO_ERROR; 3627 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode); 3628 if(U_FAILURE(errorCode)) { 3629 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode)); 3630 break; 3631 } 3632 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]); 3633 sets[SLOW_NOT]->complement(); 3634 // Intermediate set: Test cloning of a frozen set. 3635 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]); 3636 fast->freeze(); 3637 sets[FAST]=(UnicodeSet *)fast->clone(); 3638 delete fast; 3639 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]); 3640 fastNot->freeze(); 3641 sets[FAST_NOT]=(UnicodeSet *)fastNot->clone(); 3642 delete fastNot; 3643 3644 for(j=0; j<SET_COUNT; ++j) { 3645 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]); 3646 } 3647 3648 strcpy(testName, s); 3649 testNameLimit=strchr(testName, 0); 3650 *testNameLimit++=':'; 3651 *testNameLimit=0; 3652 3653 whichSpans[0]=SPAN_ALL; 3654 whichSpansCount=1; 3655 } else if(s[0]=='-') { 3656 whichSpans[0]=SPAN_ALL; 3657 whichSpansCount=1; 3658 3659 while(*++s!=0) { 3660 switch(*s) { 3661 case 'c': 3662 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3663 ~SPAN_POLARITY, 3664 SPAN_SET, 3665 SPAN_COMPLEMENT, 3666 0); 3667 break; 3668 case 'b': 3669 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3670 ~SPAN_DIRS, 3671 SPAN_FWD, 3672 SPAN_BACK, 3673 0); 3674 break; 3675 case 'l': 3676 // test USET_SPAN_CONTAINED FWD & BACK, and separately 3677 // USET_SPAN_SIMPLE only FWD, and separately 3678 // USET_SPAN_SIMPLE only BACK 3679 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3680 ~(SPAN_DIRS|SPAN_CONDITION), 3681 SPAN_DIRS|SPAN_CONTAINED, 3682 SPAN_FWD|SPAN_SIMPLE, 3683 SPAN_BACK|SPAN_SIMPLE); 3684 break; 3685 case '8': 3686 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3687 ~SPAN_UTFS, 3688 SPAN_UTF16, 3689 SPAN_UTF8, 3690 
0); 3691 break; 3692 default: 3693 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]); 3694 break; 3695 } 3696 } 3697 } else if(0==strcmp(s, "*")) { 3698 strcpy(testNameLimit, "bad_string"); 3699 for(j=0; j<whichSpansCount; ++j) { 3700 if(whichSpansCount>1) { 3701 sprintf(testNameLimit+10 /* strlen("bad_string") */, 3702 "%%0x%3x", 3703 whichSpans[j]); 3704 } 3705 testSpanUTF16String(sets_with_str, whichSpans[j], testName); 3706 testSpanUTF8String(sets_with_str, whichSpans[j], testName); 3707 } 3708 3709 strcpy(testNameLimit, "contents"); 3710 for(j=0; j<whichSpansCount; ++j) { 3711 if(whichSpansCount>1) { 3712 sprintf(testNameLimit+8 /* strlen("contents") */, 3713 "%%0x%3x", 3714 whichSpans[j]); 3715 } 3716 testSpanContents(sets_with_str, whichSpans[j], testName); 3717 } 3718 } else { 3719 UnicodeString string=UnicodeString(s, -1, US_INV).unescape(); 3720 strcpy(testNameLimit, "test_string"); 3721 for(j=0; j<whichSpansCount; ++j) { 3722 if(whichSpansCount>1) { 3723 sprintf(testNameLimit+11 /* strlen("test_string") */, 3724 "%%0x%3x", 3725 whichSpans[j]); 3726 } 3727 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i); 3728 } 3729 } 3730 } 3731 for(j=0; j<SET_COUNT; ++j) { 3732 delete sets_with_str[j]; 3733 delete sets[j]; 3734 } 3735} 3736 3737// Test select patterns and strings, and test USET_SPAN_SIMPLE. 
3738void UnicodeSetTest::TestStringSpan() { 3739 static const char *pattern="[x{xy}{xya}{axy}{ax}]"; 3740 static const char *const string= 3741 "xx" 3742 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" 3743 "xx" 3744 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" 3745 "xx" 3746 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy" 3747 "aaaa"; 3748 3749 UErrorCode errorCode=U_ZERO_ERROR; 3750 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV); 3751 UnicodeSet set(pattern16, errorCode); 3752 if(U_FAILURE(errorCode)) { 3753 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3754 return; 3755 } 3756 3757 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape(); 3758 3759 if(set.containsAll(string16)) { 3760 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string); 3761 } 3762 3763 // Remove trailing "aaaa". 3764 string16.truncate(string16.length()-4); 3765 if(!set.containsAll(string16)) { 3766 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string); 3767 } 3768 3769 string16=UNICODE_STRING_SIMPLE("byayaxya"); 3770 const UChar *s16=string16.getBuffer(); 3771 int32_t length16=string16.length(); 3772 (void)length16; // Suppress set but not used warning. 
3773 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 || 3774 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 || 3775 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 || 3776 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 || 3777 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 || 3778 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3 3779 ) { 3780 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern); 3781 } 3782 3783 pattern="[a{ab}{abc}{cd}]"; 3784 pattern16=UnicodeString(pattern, -1, US_INV); 3785 set.applyPattern(pattern16, errorCode); 3786 if(U_FAILURE(errorCode)) { 3787 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3788 return; 3789 } 3790 string16=UNICODE_STRING_SIMPLE("acdabcdabccd"); 3791 s16=string16.getBuffer(); 3792 length16=string16.length(); 3793 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 || 3794 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 || 3795 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5 3796 ) { 3797 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern); 3798 } 3799 3800 pattern="[d{cd}{bcd}{ab}]"; 3801 pattern16=UnicodeString(pattern, -1, US_INV); 3802 set.applyPattern(pattern16, errorCode).freeze(); 3803 if(U_FAILURE(errorCode)) { 3804 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3805 return; 3806 } 3807 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd"); 3808 s16=string16.getBuffer(); 3809 length16=string16.length(); 3810 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 || 3811 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 || 3812 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0 3813 ) { 3814 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern); 3815 } 3816} 3817