1/* 2******************************************************************************** 3* Copyright (C) 1999-2010 International Business Machines Corporation and 4* others. All Rights Reserved. 5******************************************************************************** 6* Date Name Description 7* 10/20/99 alan Creation. 8* 03/22/2000 Madhu Added additional tests 9******************************************************************************** 10*/ 11 12#include <stdio.h> 13 14#include <string.h> 15#include "unicode/utypes.h" 16#include "usettest.h" 17#include "unicode/ucnv.h" 18#include "unicode/uniset.h" 19#include "unicode/uchar.h" 20#include "unicode/usetiter.h" 21#include "unicode/ustring.h" 22#include "unicode/parsepos.h" 23#include "unicode/symtable.h" 24#include "unicode/uversion.h" 25#include "hash.h" 26 27#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 28 29#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \ 30 dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \ 31 u_errorName(status));}} 32 33#define TEST_ASSERT(expr) {if (!(expr)) { \ 34 dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }} 35 36UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) { 37 UnicodeString pat; 38 set.toPattern(pat); 39 return left + UnicodeSetTest::escape(pat); 40} 41 42#define CASE(id,test) case id: \ 43 name = #test; \ 44 if (exec) { \ 45 logln(#test "---"); \ 46 logln(); \ 47 test(); \ 48 } \ 49 break 50 51UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) { 52} 53 54UConverter *UnicodeSetTest::openUTF8Converter() { 55 if(utf8Cnv==NULL) { 56 UErrorCode errorCode=U_ZERO_ERROR; 57 utf8Cnv=ucnv_open("UTF-8", &errorCode); 58 } 59 return utf8Cnv; 60} 61 62UnicodeSetTest::~UnicodeSetTest() { 63 ucnv_close(utf8Cnv); 64} 65 66void 67UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, 68 const char* &name, char* /*par*/) { 69 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest"); 70 
switch (index) { 71 CASE(0,TestPatterns); 72 CASE(1,TestAddRemove); 73 CASE(2,TestCategories); 74 CASE(3,TestCloneEqualHash); 75 CASE(4,TestMinimalRep); 76 CASE(5,TestAPI); 77 CASE(6,TestScriptSet); 78 CASE(7,TestPropertySet); 79 CASE(8,TestClone); 80 CASE(9,TestExhaustive); 81 CASE(10,TestToPattern); 82 CASE(11,TestIndexOf); 83 CASE(12,TestStrings); 84 CASE(13,Testj2268); 85 CASE(14,TestCloseOver); 86 CASE(15,TestEscapePattern); 87 CASE(16,TestInvalidCodePoint); 88 CASE(17,TestSymbolTable); 89 CASE(18,TestSurrogate); 90 CASE(19,TestPosixClasses); 91 CASE(20,TestIteration); 92 CASE(21,TestFreezable); 93 CASE(22,TestSpan); 94 CASE(23,TestStringSpan); 95 default: name = ""; break; 96 } 97} 98 99static const char NOT[] = "%%%%"; 100 101/** 102 * UVector was improperly copying contents 103 * This code will crash this is still true 104 */ 105void UnicodeSetTest::Testj2268() { 106 UnicodeSet t; 107 t.add(UnicodeString("abc")); 108 UnicodeSet test(t); 109 UnicodeString ustrPat; 110 test.toPattern(ustrPat, TRUE); 111} 112 113/** 114 * Test toPattern(). 115 */ 116void UnicodeSetTest::TestToPattern() { 117 UErrorCode ec = U_ZERO_ERROR; 118 119 // Test that toPattern() round trips with syntax characters and 120 // whitespace. 121 { 122 static const char* OTHER_TOPATTERN_TESTS[] = { 123 "[[:latin:]&[:greek:]]", 124 "[[:latin:]-[:greek:]]", 125 "[:nonspacing mark:]", 126 NULL 127 }; 128 129 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) { 130 ec = U_ZERO_ERROR; 131 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec); 132 if (U_FAILURE(ec)) { 133 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec))); 134 continue; 135 } 136 checkPat(OTHER_TOPATTERN_TESTS[j], s); 137 } 138 139 for (UChar32 i = 0; i <= 0x10FFFF; ++i) { 140 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) { 141 142 // check various combinations to make sure they all work. 
143 if (i != 0 && !toPatternAux(i, i)){ 144 continue; 145 } 146 if (!toPatternAux(0, i)){ 147 continue; 148 } 149 if (!toPatternAux(i, 0xFFFF)){ 150 continue; 151 } 152 } 153 } 154 } 155 156 // Test pattern behavior of multicharacter strings. 157 { 158 ec = U_ZERO_ERROR; 159 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec); 160 161 // This loop isn't a loop. It's here to make the compiler happy. 162 // If you're curious, try removing it and changing the 'break' 163 // statements (except for the last) to goto's. 164 for (;;) { 165 if (U_FAILURE(ec)) break; 166 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL}; 167 expectToPattern(*s, "[a-z{aa}{ab}]", exp1); 168 169 s->add("ac"); 170 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL}; 171 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2); 172 173 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec); 174 if (U_FAILURE(ec)) break; 175 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL}; 176 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3); 177 178 s->add("[]"); 179 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL}; 180 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4); 181 182 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec); 183 if (U_FAILURE(ec)) break; 184 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL}; 185 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5); 186 187 // j2189 188 s->clear(); 189 s->add(UnicodeString("abc", "")); 190 s->add(UnicodeString("abc", "")); 191 const char* exp6[] = {"abc", NOT, "ab", NULL}; 192 expectToPattern(*s, "[{abc}]", exp6); 193 194 break; 195 } 196 197 if (U_FAILURE(ec)) errln("FAIL: pattern parse error"); 198 delete s; 199 } 200 201 // JB#3400: For 2 character ranges prefer [ab] to [a-b] 202 UnicodeSet s; 203 s.add((UChar)97, (UChar)98); // 'a', 'b' 204 expectToPattern(s, "[ab]", NULL); 205} 206 207UBool 
UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) { 208 209 // use Integer.toString because Utility.hex doesn't handle ints 210 UnicodeString pat = ""; 211 // TODO do these in hex 212 //String source = "0x" + Integer.toString(start,16).toUpperCase(); 213 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase(); 214 UnicodeString source; 215 source = source + (uint32_t)start; 216 if (start != end) 217 source = source + ".." + (uint32_t)end; 218 UnicodeSet testSet; 219 testSet.add(start, end); 220 return checkPat(source, testSet); 221} 222 223UBool UnicodeSetTest::checkPat(const UnicodeString& source, 224 const UnicodeSet& testSet) { 225 // What we want to make sure of is that a pattern generated 226 // by toPattern(), with or without escaped unprintables, can 227 // be passed back into the UnicodeSet constructor. 228 UnicodeString pat0; 229 230 testSet.toPattern(pat0, TRUE); 231 232 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE; 233 234 //String pat1 = unescapeLeniently(pat0); 235 //if (!checkPat(source + " (in code)", testSet, pat1)) return false; 236 237 UnicodeString pat2; 238 testSet.toPattern(pat2, FALSE); 239 if (!checkPat(source, testSet, pat2)) return FALSE; 240 241 //String pat3 = unescapeLeniently(pat2); 242 // if (!checkPat(source + " (in code)", testSet, pat3)) return false; 243 244 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3); 245 logln((UnicodeString)source + " => " + pat0 + ", " + pat2); 246 return TRUE; 247} 248 249UBool UnicodeSetTest::checkPat(const UnicodeString& source, 250 const UnicodeSet& testSet, 251 const UnicodeString& pat) { 252 UErrorCode ec = U_ZERO_ERROR; 253 UnicodeSet testSet2(pat, ec); 254 if (testSet2 != testSet) { 255 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat); 256 return FALSE; 257 } 258 return TRUE; 259} 260 261void 262UnicodeSetTest::TestPatterns(void) { 263 UnicodeSet set; 264 expectPattern(set, 
UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km"); 265 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz"); 266 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz"); 267 expectPattern(set, UnicodeString("[-az]", ""), "--aazz"); 268 expectPattern(set, UnicodeString("[az-]", ""), "--aazz"); 269 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz"); 270 271 // Throw in a test of complement 272 set.complement(); 273 UnicodeString exp; 274 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF); 275 expectPairs(set, exp); 276} 277 278void 279UnicodeSetTest::TestCategories(void) { 280 UErrorCode status = U_ZERO_ERROR; 281 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:] 282 UnicodeSet set(pat, status); 283 if (U_FAILURE(status)) { 284 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status))); 285 return; 286 } else { 287 expectContainment(set, pat, "ABC", "abc"); 288 } 289 290 UChar32 i; 291 int32_t failures = 0; 292 // Make sure generation of L doesn't pollute cached Lu set 293 // First generate L, then Lu 294 set.applyPattern("[:L:]", status); 295 if (U_FAILURE(status)) { errln("FAIL"); return; } 296 for (i=0; i<0x200; ++i) { 297 UBool l = u_isalpha((UChar)i); 298 if (l != set.contains(i)) { 299 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " + 300 set.contains(i)); 301 if (++failures == 10) break; 302 } 303 } 304 305 set.applyPattern("[:Lu:]", status); 306 if (U_FAILURE(status)) { errln("FAIL"); return; } 307 for (i=0; i<0x200; ++i) { 308 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER); 309 if (lu != set.contains(i)) { 310 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " + 311 set.contains(i)); 312 if (++failures == 20) break; 313 } 314 } 315} 316void 317UnicodeSetTest::TestCloneEqualHash(void) { 318 UErrorCode status = U_ZERO_ERROR; 319 // set1 and set2 used to be built with the 
obsolete constructor taking 320 // UCharCategory values; replaced with pattern constructors 321 // markus 20030502 322 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase 323 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase 324 if (U_FAILURE(status)){ 325 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status))); 326 return; 327 } 328 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit 329 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit 330 if (U_FAILURE(status)){ 331 errln((UnicodeString)"FAIL: Can't construct set with category->Nd"); 332 return; 333 } 334 335 if (*set1 != *set1a) { 336 errln("FAIL: category constructor for Ll broken"); 337 } 338 if (*set2 != *set2a) { 339 errln("FAIL: category constructor for Nd broken"); 340 } 341 delete set1a; 342 delete set2a; 343 344 logln("Testing copy construction"); 345 UnicodeSet *set1copy=new UnicodeSet(*set1); 346 if(*set1 != *set1copy || *set1 == *set2 || 347 getPairs(*set1) != getPairs(*set1copy) || 348 set1->hashCode() != set1copy->hashCode()){ 349 errln("FAIL : Error in copy construction"); 350 return; 351 } 352 353 logln("Testing =operator"); 354 UnicodeSet set1equal=*set1; 355 UnicodeSet set2equal=*set2; 356 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 || 357 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){ 358 errln("FAIL: Error in =operator"); 359 } 360 361 logln("Testing clone()"); 362 UnicodeSet *set1clone=(UnicodeSet*)set1->clone(); 363 UnicodeSet *set2clone=(UnicodeSet*)set2->clone(); 364 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal || 365 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal || 366 *set2clone == *set1 || *set2clone == 
set1equal || *set2clone == *set1clone){ 367 errln("FAIL: Error in clone"); 368 } 369 370 logln("Testing hashcode"); 371 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() || 372 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() || 373 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() || 374 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() || 375 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){ 376 errln("FAIL: Error in hashCode()"); 377 } 378 379 delete set1; 380 delete set1copy; 381 delete set2; 382 delete set1clone; 383 delete set2clone; 384 385 386} 387void 388UnicodeSetTest::TestAddRemove(void) { 389 UnicodeSet set; // Construct empty set 390 doAssert(set.isEmpty() == TRUE, "set should be empty"); 391 doAssert(set.size() == 0, "size should be 0"); 392 set.complement(); 393 doAssert(set.size() == 0x110000, "size should be 0x110000"); 394 set.clear(); 395 set.add(0x0061, 0x007a); 396 expectPairs(set, "az"); 397 doAssert(set.isEmpty() == FALSE, "set should not be empty"); 398 doAssert(set.size() != 0, "size should not be equal to 0"); 399 doAssert(set.size() == 26, "size should be equal to 26"); 400 set.remove(0x006d, 0x0070); 401 expectPairs(set, "alqz"); 402 doAssert(set.size() == 22, "size should be equal to 22"); 403 set.remove(0x0065, 0x0067); 404 expectPairs(set, "adhlqz"); 405 doAssert(set.size() == 19, "size should be equal to 19"); 406 set.remove(0x0064, 0x0069); 407 expectPairs(set, "acjlqz"); 408 doAssert(set.size() == 16, "size should be equal to 16"); 409 set.remove(0x0063, 0x0072); 410 expectPairs(set, "absz"); 411 doAssert(set.size() == 10, "size should be equal to 10"); 412 set.add(0x0066, 0x0071); 413 expectPairs(set, "abfqsz"); 414 doAssert(set.size() == 22, "size should be equal to 22"); 415 set.remove(0x0061, 0x0067); 416 expectPairs(set, "hqsz"); 417 
set.remove(0x0061, 0x007a); 418 expectPairs(set, ""); 419 doAssert(set.isEmpty() == TRUE, "set should be empty"); 420 doAssert(set.size() == 0, "size should be 0"); 421 set.add(0x0061); 422 doAssert(set.isEmpty() == FALSE, "set should not be empty"); 423 doAssert(set.size() == 1, "size should not be equal to 1"); 424 set.add(0x0062); 425 set.add(0x0063); 426 expectPairs(set, "ac"); 427 doAssert(set.size() == 3, "size should not be equal to 3"); 428 set.add(0x0070); 429 set.add(0x0071); 430 expectPairs(set, "acpq"); 431 doAssert(set.size() == 5, "size should not be equal to 5"); 432 set.clear(); 433 expectPairs(set, ""); 434 doAssert(set.isEmpty() == TRUE, "set should be empty"); 435 doAssert(set.size() == 0, "size should be 0"); 436 437 // Try removing an entire set from another set 438 expectPattern(set, "[c-x]", "cx"); 439 UnicodeSet set2; 440 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz"); 441 set.removeAll(set2); 442 expectPairs(set, "deluxx"); 443 444 // Try adding an entire set to another set 445 expectPattern(set, "[jackiemclean]", "aacceein"); 446 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort"); 447 set.addAll(set2); 448 expectPairs(set, "aacehort"); 449 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2"); 450 451 // Try retaining an set of elements contained in another set (intersection) 452 UnicodeSet set3; 453 expectPattern(set3, "[a-c]", "ac"); 454 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3"); 455 set3.remove(0x0062); 456 expectPairs(set3, "aacc"); 457 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3"); 458 set.retainAll(set3); 459 expectPairs(set, "aacc"); 460 doAssert(set.size() == set3.size(), "set.size() should be set3.size()"); 461 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3"); 462 set.clear(); 463 doAssert(set.size() != set3.size(), "set.size() != set3.size()"); 464 465 
// Test commutativity 466 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort"); 467 expectPattern(set2, "[jackiemclean]", "aacceein"); 468 set.addAll(set2); 469 expectPairs(set, "aacehort"); 470 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2"); 471 472 473 474 475} 476 477/** 478 * Make sure minimal representation is maintained. 479 */ 480void UnicodeSetTest::TestMinimalRep() { 481 UErrorCode status = U_ZERO_ERROR; 482 // This is pretty thoroughly tested by checkCanonicalRep() 483 // run against the exhaustive operation results. Use the code 484 // here for debugging specific spot problems. 485 486 // 1 overlap against 2 487 UnicodeSet set("[h-km-q]", status); 488 if (U_FAILURE(status)) { errln("FAIL"); return; } 489 UnicodeSet set2("[i-o]", status); 490 if (U_FAILURE(status)) { errln("FAIL"); return; } 491 set.addAll(set2); 492 expectPairs(set, "hq"); 493 // right 494 set.applyPattern("[a-m]", status); 495 if (U_FAILURE(status)) { errln("FAIL"); return; } 496 set2.applyPattern("[e-o]", status); 497 if (U_FAILURE(status)) { errln("FAIL"); return; } 498 set.addAll(set2); 499 expectPairs(set, "ao"); 500 // left 501 set.applyPattern("[e-o]", status); 502 if (U_FAILURE(status)) { errln("FAIL"); return; } 503 set2.applyPattern("[a-m]", status); 504 if (U_FAILURE(status)) { errln("FAIL"); return; } 505 set.addAll(set2); 506 expectPairs(set, "ao"); 507 // 1 overlap against 3 508 set.applyPattern("[a-eg-mo-w]", status); 509 if (U_FAILURE(status)) { errln("FAIL"); return; } 510 set2.applyPattern("[d-q]", status); 511 if (U_FAILURE(status)) { errln("FAIL"); return; } 512 set.addAll(set2); 513 expectPairs(set, "aw"); 514} 515 516void UnicodeSetTest::TestAPI() { 517 UErrorCode status = U_ZERO_ERROR; 518 // default ct 519 UnicodeSet set; 520 if (!set.isEmpty() || set.getRangeCount() != 0) { 521 errln((UnicodeString)"FAIL, set should be empty but isn't: " + 522 set); 523 } 524 525 // clear(), isEmpty() 526 set.add(0x0061); 527 
if (set.isEmpty()) { 528 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " + 529 set); 530 } 531 set.clear(); 532 if (!set.isEmpty()) { 533 errln((UnicodeString)"FAIL, set should be empty but isn't: " + 534 set); 535 } 536 537 // size() 538 set.clear(); 539 if (set.size() != 0) { 540 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() + 541 ": " + set); 542 } 543 set.add(0x0061); 544 if (set.size() != 1) { 545 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() + 546 ": " + set); 547 } 548 set.add(0x0031, 0x0039); 549 if (set.size() != 10) { 550 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() + 551 ": " + set); 552 } 553 554 // contains(first, last) 555 set.clear(); 556 set.applyPattern("[A-Y 1-8 b-d l-y]", status); 557 if (U_FAILURE(status)) { errln("FAIL"); return; } 558 for (int32_t i = 0; i<set.getRangeCount(); ++i) { 559 UChar32 a = set.getRangeStart(i); 560 UChar32 b = set.getRangeEnd(i); 561 if (!set.contains(a, b)) { 562 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b + 563 " but doesn't: " + set); 564 } 565 if (set.contains((UChar32)(a-1), b)) { 566 errln((UnicodeString)"FAIL, shouldn't contain " + 567 (unsigned short)(a-1) + '-' + (unsigned short)b + 568 " but does: " + set); 569 } 570 if (set.contains(a, (UChar32)(b+1))) { 571 errln((UnicodeString)"FAIL, shouldn't contain " + 572 (unsigned short)a + '-' + (unsigned short)(b+1) + 573 " but does: " + set); 574 } 575 } 576 577 // Ported InversionList test. 
578 UnicodeSet a((UChar32)3,(UChar32)10); 579 UnicodeSet b((UChar32)7,(UChar32)15); 580 UnicodeSet c; 581 582 logln((UnicodeString)"a [3-10]: " + a); 583 logln((UnicodeString)"b [7-15]: " + b); 584 c = a; 585 c.addAll(b); 586 UnicodeSet exp((UChar32)3,(UChar32)15); 587 if (c == exp) { 588 logln((UnicodeString)"c.set(a).add(b): " + c); 589 } else { 590 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp); 591 } 592 c.complement(); 593 exp.set((UChar32)0, (UChar32)2); 594 exp.add((UChar32)16, UnicodeSet::MAX_VALUE); 595 if (c == exp) { 596 logln((UnicodeString)"c.complement(): " + c); 597 } else { 598 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp); 599 } 600 c.complement(); 601 exp.set((UChar32)3, (UChar32)15); 602 if (c == exp) { 603 logln((UnicodeString)"c.complement(): " + c); 604 } else { 605 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp); 606 } 607 c = a; 608 c.complementAll(b); 609 exp.set((UChar32)3,(UChar32)6); 610 exp.add((UChar32)11,(UChar32) 15); 611 if (c == exp) { 612 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c); 613 } else { 614 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp); 615 } 616 617 exp = c; 618 bitsToSet(setToBits(c), c); 619 if (c == exp) { 620 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c); 621 } else { 622 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp); 623 } 624 625 // Additional tests for coverage JB#2118 626 //UnicodeSet::complement(class UnicodeString const &) 627 //UnicodeSet::complementAll(class UnicodeString const &) 628 //UnicodeSet::containsNone(class UnicodeSet const &) 629 //UnicodeSet::containsNone(long,long) 630 //UnicodeSet::containsSome(class UnicodeSet const &) 631 //UnicodeSet::containsSome(long,long) 632 //UnicodeSet::removeAll(class UnicodeString const &) 633 //UnicodeSet::retain(long) 634 //UnicodeSet::retainAll(class UnicodeString const &) 635 
//UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &) 636 //UnicodeSetIterator::getString(void) 637 set.clear(); 638 set.complement("ab"); 639 exp.applyPattern("[{ab}]", status); 640 if (U_FAILURE(status)) { errln("FAIL"); return; } 641 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; } 642 643 UnicodeSetIterator iset(set); 644 if (!iset.next() || !iset.isString()) { 645 errln("FAIL: UnicodeSetIterator::next/isString"); 646 } else if (iset.getString() != "ab") { 647 errln("FAIL: UnicodeSetIterator::getString"); 648 } 649 650 set.add((UChar32)0x61, (UChar32)0x7A); 651 set.complementAll("alan"); 652 exp.applyPattern("[{ab}b-kmo-z]", status); 653 if (U_FAILURE(status)) { errln("FAIL"); return; } 654 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; } 655 656 exp.applyPattern("[a-z]", status); 657 if (U_FAILURE(status)) { errln("FAIL"); return; } 658 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 659 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 660 exp.applyPattern("[aln]", status); 661 if (U_FAILURE(status)) { errln("FAIL"); return; } 662 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 663 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 664 665 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) { 666 errln("FAIL: containsNone(UChar32, UChar32)"); 667 } 668 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) { 669 errln("FAIL: containsSome(UChar32, UChar32)"); 670 } 671 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) { 672 errln("FAIL: containsNone(UChar32, UChar32)"); 673 } 674 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) { 675 errln("FAIL: containsSome(UChar32, UChar32)"); 676 } 677 678 set.removeAll("liu"); 679 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status); 680 if (U_FAILURE(status)) { errln("FAIL"); return; } 681 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; } 682 683 set.retainAll("star"); 684 
exp.applyPattern("[rst]", status); 685 if (U_FAILURE(status)) { errln("FAIL"); return; } 686 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; } 687 688 set.retain((UChar32)0x73); 689 exp.applyPattern("[s]", status); 690 if (U_FAILURE(status)) { errln("FAIL"); return; } 691 if (set != exp) { errln("FAIL: retain('s')"); return; } 692 693 uint16_t buf[32]; 694 int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status); 695 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; } 696 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) { 697 errln("FAIL: serialize"); 698 return; 699 } 700 701 // Conversions to and from USet 702 UnicodeSet *uniset = &set; 703 USet *uset = uniset->toUSet(); 704 TEST_ASSERT((void *)uset == (void *)uniset); 705 UnicodeSet *setx = UnicodeSet::fromUSet(uset); 706 TEST_ASSERT((void *)setx == (void *)uset); 707 const UnicodeSet *constSet = uniset; 708 const USet *constUSet = constSet->toUSet(); 709 TEST_ASSERT((void *)constUSet == (void *)constSet); 710 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet); 711 TEST_ASSERT((void *)constSetx == (void *)constUSet); 712 713 // span(UnicodeString) and spanBack(UnicodeString) convenience methods 714 UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc"); 715 UnicodeSet ac(0x61, 0x63); 716 ac.remove(0x62).freeze(); 717 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 || 718 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 || 719 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 || 720 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 || 721 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 || 722 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 || 723 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 || 724 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 || 725 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 || 726 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30 727 ) { 728 errln("UnicodeSet.span(UnicodeString, ...) 
returns incorrect end indexes"); 729 } 730 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 || 731 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 || 732 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 || 733 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 || 734 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 || 735 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 || 736 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 || 737 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 || 738 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 || 739 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20 740 ) { 741 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes"); 742 } 743} 744 745void UnicodeSetTest::TestIteration() { 746 UErrorCode ec = U_ZERO_ERROR; 747 int i = 0; 748 int outerLoop; 749 750 // 6 code points, 3 ranges, 2 strings, 8 total elements 751 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2" 752 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec); 753 TEST_ASSERT_SUCCESS(ec); 754 UnicodeSetIterator it(set); 755 756 for (outerLoop=0; outerLoop<3; outerLoop++) { 757 // Run the test multiple times, to check that iterator.reset() is working. 
758 for (i=0; i<10; i++) { 759 UBool nextv = it.next(); 760 UBool isString = it.isString(); 761 int32_t codePoint = it.getCodepoint(); 762 //int32_t codePointEnd = it.getCodepointEnd(); 763 UnicodeString s = it.getString(); 764 switch (i) { 765 case 0: 766 TEST_ASSERT(nextv == TRUE); 767 TEST_ASSERT(isString == FALSE); 768 TEST_ASSERT(codePoint==0x61); 769 TEST_ASSERT(s == "a"); 770 break; 771 case 1: 772 TEST_ASSERT(nextv == TRUE); 773 TEST_ASSERT(isString == FALSE); 774 TEST_ASSERT(codePoint==0x62); 775 TEST_ASSERT(s == "b"); 776 break; 777 case 2: 778 TEST_ASSERT(nextv == TRUE); 779 TEST_ASSERT(isString == FALSE); 780 TEST_ASSERT(codePoint==0x63); 781 TEST_ASSERT(s == "c"); 782 break; 783 case 3: 784 TEST_ASSERT(nextv == TRUE); 785 TEST_ASSERT(isString == FALSE); 786 TEST_ASSERT(codePoint==0x79); 787 TEST_ASSERT(s == "y"); 788 break; 789 case 4: 790 TEST_ASSERT(nextv == TRUE); 791 TEST_ASSERT(isString == FALSE); 792 TEST_ASSERT(codePoint==0x7a); 793 TEST_ASSERT(s == "z"); 794 break; 795 case 5: 796 TEST_ASSERT(nextv == TRUE); 797 TEST_ASSERT(isString == FALSE); 798 TEST_ASSERT(codePoint==0x1abcd); 799 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd)); 800 break; 801 case 6: 802 TEST_ASSERT(nextv == TRUE); 803 TEST_ASSERT(isString == TRUE); 804 TEST_ASSERT(s == "str1"); 805 break; 806 case 7: 807 TEST_ASSERT(nextv == TRUE); 808 TEST_ASSERT(isString == TRUE); 809 TEST_ASSERT(s == "str2"); 810 break; 811 case 8: 812 TEST_ASSERT(nextv == FALSE); 813 break; 814 case 9: 815 TEST_ASSERT(nextv == FALSE); 816 break; 817 } 818 } 819 it.reset(); // prepare to run the iteration again. 
820 } 821} 822 823 824 825 826void UnicodeSetTest::TestStrings() { 827 UErrorCode ec = U_ZERO_ERROR; 828 829 UnicodeSet* testList[] = { 830 UnicodeSet::createFromAll("abc"), 831 new UnicodeSet("[a-c]", ec), 832 833 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")), 834 new UnicodeSet("[{ll}{ch}a-z]", ec), 835 836 UnicodeSet::createFrom("ab}c"), 837 new UnicodeSet("[{ab\\}c}]", ec), 838 839 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')), 840 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec), 841 842 NULL 843 }; 844 845 if (U_FAILURE(ec)) { 846 errln("FAIL: couldn't construct test sets"); 847 } 848 849 for (int32_t i = 0; testList[i] != NULL; i+=2) { 850 if (U_SUCCESS(ec)) { 851 UnicodeString pat0, pat1; 852 testList[i]->toPattern(pat0, TRUE); 853 testList[i+1]->toPattern(pat1, TRUE); 854 if (*testList[i] == *testList[i+1]) { 855 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1); 856 } else { 857 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1); 858 } 859 } 860 delete testList[i]; 861 delete testList[i+1]; 862 } 863} 864 865/** 866 * Test the [:Latin:] syntax. 867 */ 868void UnicodeSetTest::TestScriptSet() { 869 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1")); 870 871 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA"); 872 873 /* Jitterbug 1423 */ 874 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA"); 875 876} 877 878/** 879 * Test the [:Latin:] syntax. 
880 */ 881void UnicodeSetTest::TestPropertySet() { 882 static const char* const DATA[] = { 883 // Pattern, Chars IN, Chars NOT in 884 885 "[:Latin:]", 886 "aA", 887 "\\u0391\\u03B1", 888 889 "[\\p{Greek}]", 890 "\\u0391\\u03B1", 891 "aA", 892 893 "\\P{ GENERAL Category = upper case letter }", 894 "abc", 895 "ABC", 896 897#if !UCONFIG_NO_NORMALIZATION 898 // Combining class: @since ICU 2.2 899 // Check both symbolic and numeric 900 "\\p{ccc=Nukta}", 901 "\\u0ABC", 902 "abc", 903 904 "\\p{Canonical Combining Class = 11}", 905 "\\u05B1", 906 "\\u05B2", 907 908 "[:c c c = iota subscript :]", 909 "\\u0345", 910 "xyz", 911#endif 912 913 // Bidi class: @since ICU 2.2 914 "\\p{bidiclass=lefttoright}", 915 "abc", 916 "\\u0671\\u0672", 917 918 // Binary properties: @since ICU 2.2 919 "\\p{ideographic}", 920 "\\u4E0A", 921 "x", 922 923 "[:math=false:]", 924 "q)*(", 925 // weiv: )(and * were removed from math in Unicode 4.0.1 926 //"(*+)", 927 "+<>^", 928 929 // JB#1767 \N{}, \p{ASCII} 930 "[:Ascii:]", 931 "abc\\u0000\\u007F", 932 "\\u0080\\u4E00", 933 934 "[\\N{ latin small letter a }[:name= latin small letter z:]]", 935 "az", 936 "qrs", 937 938 // JB#2015 939 "[:any:]", 940 "a\\U0010FFFF", 941 "", 942 943 "[:nv=0.5:]", 944 "\\u00BD\\u0F2A", 945 "\\u00BC", 946 947 // JB#2653: Age 948 "[:Age=1.1:]", 949 "\\u03D6", // 1.1 950 "\\u03D8\\u03D9", // 3.2 951 952 "[:Age=3.1:]", 953 "\\u1800\\u3400\\U0002f800", 954 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000", 955 956 // JB#2350: Case_Sensitive 957 "[:Case Sensitive:]", 958 "A\\u1FFC\\U00010410", 959 ";\\u00B4\\U00010500", 960 961 // JB#2832: C99-compatibility props 962 "[:blank:]", 963 " \\u0009", 964 "1-9A-Z", 965 966 "[:graph:]", 967 "19AZ", 968 " \\u0003\\u0007\\u0009\\u000A\\u000D", 969 970 "[:punct:]", 971 "!@#%&*()[]{}-_\\/;:,.?'\"", 972 "09azAZ", 973 974 "[:xdigit:]", 975 "09afAF", 976 "gG!", 977 978 // Regex compatibility test 979 "[-b]", // leading '-' is literal 980 "-b", 981 "ac", 982 983 "[^-b]", // 
leading '-' is literal 984 "ac", 985 "-b", 986 987 "[b-]", // trailing '-' is literal 988 "-b", 989 "ac", 990 991 "[^b-]", // trailing '-' is literal 992 "ac", 993 "-b", 994 995 "[a-b-]", // trailing '-' is literal 996 "ab-", 997 "c=", 998 999 "[[a-q]&[p-z]-]", // trailing '-' is literal 1000 "pq-", 1001 "or=", 1002 1003 "[\\s|\\)|:|$|\\>]", // from regex tests 1004 "s|):$>", 1005 "abc", 1006 1007 "[\\uDC00cd]", // JB#2906: isolated trail at start 1008 "cd\\uDC00", 1009 "ab\\uD800\\U00010000", 1010 1011 "[ab\\uD800]", // JB#2906: isolated trail at start 1012 "ab\\uD800", 1013 "cd\\uDC00\\U00010000", 1014 1015 "[ab\\uD800cd]", // JB#2906: isolated lead in middle 1016 "abcd\\uD800", 1017 "ef\\uDC00\\U00010000", 1018 1019 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle 1020 "abcd\\uDC00", 1021 "ef\\uD800\\U00010000", 1022 1023#if !UCONFIG_NO_NORMALIZATION 1024 "[:^lccc=0:]", // Lead canonical class 1025 "\\u0300\\u0301", 1026 "abcd\\u00c0\\u00c5", 1027 1028 "[:^tccc=0:]", // Trail canonical class 1029 "\\u0300\\u0301\\u00c0\\u00c5", 1030 "abcd", 1031 1032 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class 1033 "\\u0300\\u0301\\u00c0\\u00c5", 1034 "abcd", 1035 1036 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now) 1037 "", 1038 "abcd\\u0300\\u0301\\u00c0\\u00c5", 1039 1040 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. 
Complete canonical class is zero, but both lead and trail are not 1041 "\\u0F73\\u0F75\\u0F81", 1042 "abcd\\u0300\\u0301\\u00c0\\u00c5", 1043#endif /* !UCONFIG_NO_NORMALIZATION */ 1044 1045 "[:Assigned:]", 1046 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD", 1047 "\\u0888\\uFDD3\\uFFFE\\U00050005" 1048 }; 1049 1050 static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]); 1051 1052 for (int32_t i=0; i<DATA_LEN; i+=3) { 1053 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]), 1054 CharsToUnicodeString(DATA[i+2])); 1055 } 1056} 1057 1058/** 1059 * Test that Posix style character classes [:digit:], etc. 1060 * have the Unicode definitions from TR 18. 1061 */ 1062void UnicodeSetTest::TestPosixClasses() { 1063 { 1064 UErrorCode status = U_ZERO_ERROR; 1065 UnicodeSet s1("[:alpha:]", status); 1066 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status); 1067 TEST_ASSERT_SUCCESS(status); 1068 TEST_ASSERT(s1==s2); 1069 } 1070 { 1071 UErrorCode status = U_ZERO_ERROR; 1072 UnicodeSet s1("[:lower:]", status); 1073 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status); 1074 TEST_ASSERT_SUCCESS(status); 1075 TEST_ASSERT(s1==s2); 1076 } 1077 { 1078 UErrorCode status = U_ZERO_ERROR; 1079 UnicodeSet s1("[:upper:]", status); 1080 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status); 1081 TEST_ASSERT_SUCCESS(status); 1082 TEST_ASSERT(s1==s2); 1083 } 1084 { 1085 UErrorCode status = U_ZERO_ERROR; 1086 UnicodeSet s1("[:punct:]", status); 1087 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status); 1088 TEST_ASSERT_SUCCESS(status); 1089 TEST_ASSERT(s1==s2); 1090 } 1091 { 1092 UErrorCode status = U_ZERO_ERROR; 1093 UnicodeSet s1("[:digit:]", status); 1094 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status); 1095 TEST_ASSERT_SUCCESS(status); 1096 TEST_ASSERT(s1==s2); 1097 } 1098 { 1099 UErrorCode status = U_ZERO_ERROR; 1100 UnicodeSet s1("[:xdigit:]", status); 1101 UnicodeSet 
s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status); 1102 TEST_ASSERT_SUCCESS(status); 1103 TEST_ASSERT(s1==s2); 1104 } 1105 { 1106 UErrorCode status = U_ZERO_ERROR; 1107 UnicodeSet s1("[:alnum:]", status); 1108 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status); 1109 TEST_ASSERT_SUCCESS(status); 1110 TEST_ASSERT(s1==s2); 1111 } 1112 { 1113 UErrorCode status = U_ZERO_ERROR; 1114 UnicodeSet s1("[:space:]", status); 1115 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status); 1116 TEST_ASSERT_SUCCESS(status); 1117 TEST_ASSERT(s1==s2); 1118 } 1119 { 1120 UErrorCode status = U_ZERO_ERROR; 1121 UnicodeSet s1("[:blank:]", status); 1122 TEST_ASSERT_SUCCESS(status); 1123 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"), 1124 status); 1125 TEST_ASSERT_SUCCESS(status); 1126 TEST_ASSERT(s1==s2); 1127 } 1128 { 1129 UErrorCode status = U_ZERO_ERROR; 1130 UnicodeSet s1("[:cntrl:]", status); 1131 TEST_ASSERT_SUCCESS(status); 1132 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status); 1133 TEST_ASSERT_SUCCESS(status); 1134 TEST_ASSERT(s1==s2); 1135 } 1136 { 1137 UErrorCode status = U_ZERO_ERROR; 1138 UnicodeSet s1("[:graph:]", status); 1139 TEST_ASSERT_SUCCESS(status); 1140 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status); 1141 TEST_ASSERT_SUCCESS(status); 1142 TEST_ASSERT(s1==s2); 1143 } 1144 { 1145 UErrorCode status = U_ZERO_ERROR; 1146 UnicodeSet s1("[:print:]", status); 1147 TEST_ASSERT_SUCCESS(status); 1148 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status); 1149 TEST_ASSERT_SUCCESS(status); 1150 TEST_ASSERT(s1==s2); 1151 } 1152} 1153/** 1154 * Test cloning of UnicodeSet. For C++, we test the copy constructor. 
1155 */ 1156void UnicodeSetTest::TestClone() { 1157 UErrorCode ec = U_ZERO_ERROR; 1158 UnicodeSet s("[abcxyz]", ec); 1159 UnicodeSet t(s); 1160 expectContainment(t, "abc", "def"); 1161} 1162 1163/** 1164 * Test the indexOf() and charAt() methods. 1165 */ 1166void UnicodeSetTest::TestIndexOf() { 1167 UErrorCode ec = U_ZERO_ERROR; 1168 UnicodeSet set("[a-cx-y3578]", ec); 1169 if (U_FAILURE(ec)) { 1170 errln("FAIL: UnicodeSet constructor"); 1171 return; 1172 } 1173 for (int32_t i=0; i<set.size(); ++i) { 1174 UChar32 c = set.charAt(i); 1175 if (set.indexOf(c) != i) { 1176 errln("FAIL: charAt(%d) = %X => indexOf() => %d", 1177 i, c, set.indexOf(c)); 1178 } 1179 } 1180 UChar32 c = set.charAt(set.size()); 1181 if (c != -1) { 1182 errln("FAIL: charAt(<out of range>) = %X", c); 1183 } 1184 int32_t j = set.indexOf((UChar32)0x71/*'q'*/); 1185 if (j != -1) { 1186 errln((UnicodeString)"FAIL: indexOf('q') = " + j); 1187 } 1188} 1189 1190/** 1191 * Test closure API. 1192 */ 1193void UnicodeSetTest::TestCloseOver() { 1194 UErrorCode ec = U_ZERO_ERROR; 1195 1196 char CASE[] = {(char)USET_CASE_INSENSITIVE}; 1197 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS}; 1198 const char* DATA[] = { 1199 // selector, input, output 1200 CASE, 1201 "[aq\\u00DF{Bc}{bC}{Fi}]", 1202 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1 1203 1204 CASE, 1205 "[\\u01F1]", // 'DZ' 1206 "[\\u01F1\\u01F2\\u01F3]", 1207 1208 CASE, 1209 "[\\u1FB4]", 1210 "[\\u1FB4{\\u03AC\\u03B9}]", 1211 1212 CASE, 1213 "[{F\\uFB01}]", 1214 "[\\uFB03{ffi}]", 1215 1216 CASE, // make sure binary search finds limits 1217 "[a\\uFF3A]", 1218 "[aA\\uFF3A\\uFF5A]", 1219 1220 CASE, 1221 "[a-z]","[A-Za-z\\u017F\\u212A]", 1222 CASE, 1223 "[abc]","[A-Ca-c]", 1224 CASE, 1225 "[ABC]","[A-Ca-c]", 1226 1227 CASE, "[i]", "[iI]", 1228 1229 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I 1230 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot 1231 1232 CASE, "[\\u0131]", 
"[\\u0131]", // dotless i 1233 1234 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]", 1235 1236 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas 1237 1238 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas 1239 1240 CASE, "[\\u03f7]", "[\\u03f7\\u03f8]", 1241 1242 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]", 1243 1244 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]", 1245 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]", 1246 1247 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]", 1248 1249 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table 1250 1251 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table 1252 1253#if !UCONFIG_NO_FILE_IO 1254 CASE_MAPPINGS, 1255 "[aq\\u00DF{Bc}{bC}{Fi}]", 1256 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]", 1257#endif 1258 1259 CASE_MAPPINGS, 1260 "[\\u01F1]", // 'DZ' 1261 "[\\u01F1\\u01F2\\u01F3]", 1262 1263 CASE_MAPPINGS, 1264 "[a-z]", 1265 "[A-Za-z]", 1266 1267 NULL 1268 }; 1269 1270 UnicodeSet s; 1271 UnicodeSet t; 1272 UnicodeString buf; 1273 for (int32_t i=0; DATA[i]!=NULL; i+=3) { 1274 int32_t selector = DATA[i][0]; 1275 UnicodeString pat(DATA[i+1], -1, US_INV); 1276 UnicodeString exp(DATA[i+2], -1, US_INV); 1277 s.applyPattern(pat, ec); 1278 s.closeOver(selector); 1279 t.applyPattern(exp, ec); 1280 if (U_FAILURE(ec)) { 1281 errln("FAIL: applyPattern failed"); 1282 continue; 1283 } 1284 if (s == t) { 1285 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp); 1286 } else { 1287 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " + 1288 s.toPattern(buf, TRUE) + ", expected " + exp); 1289 } 1290 } 1291 1292#if 0 1293 /* 1294 * Unused test code. 
1295 * This was used to compare the old implementation (using USET_CASE) 1296 * with the new one (using 0x100 temporarily) 1297 * while transitioning from hardcoded case closure tables in uniset.cpp 1298 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu. 1299 * and using ucase.c functions for closure. 1300 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file 1301 * 1302 * Note: The old and new implementation never fully matched because 1303 * the old implementation turned out to not map U+0130 and U+0131 correctly 1304 * (dotted I and dotless i) and because the old implementation's data tables 1305 * were outdated compared to Unicode 4.0.1 at the time of the change to the 1306 * new implementation. (So sigmas and some other characters were not handled 1307 * according to the newer Unicode version.) 1308 */ 1309 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2; 1310 UnicodeSetIterator si(sens); 1311 UnicodeString str, buf2; 1312 const UnicodeString *pStr; 1313 UChar32 c; 1314 while(si.next()) { 1315 if(!si.isString()) { 1316 c=si.getCodepoint(); 1317 s.clear(); 1318 s.add(c); 1319 1320 str.setTo(c); 1321 str.foldCase(); 1322 sens2.add(str); 1323 1324 t=s; 1325 s.closeOver(USET_CASE); 1326 t.closeOver(0x100); 1327 if(s!=t) { 1328 errln("FAIL: closeOver(U+%04x) differs: ", c); 1329 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE)); 1330 } 1331 } 1332 } 1333 // remove all code points 1334 // should contain all full case folding mapping strings 1335 sens2.remove(0, 0x10ffff); 1336 si.reset(sens2); 1337 while(si.next()) { 1338 if(si.isString()) { 1339 pStr=&si.getString(); 1340 s.clear(); 1341 s.add(*pStr); 1342 t=s2=s; 1343 s.closeOver(USET_CASE); 1344 t.closeOver(0x100); 1345 if(s!=t) { 1346 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: "); 1347 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE)); 1348 } 1349 } 1350 } 1351#endif 1352 
1353 // Test the pattern API 1354 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec); 1355 if (U_FAILURE(ec)) { 1356 errln("FAIL: applyPattern failed"); 1357 } else { 1358 expectContainment(s, "abcABC", "defDEF"); 1359 } 1360 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec); 1361 if (U_FAILURE(ec)) { 1362 errln("FAIL: constructor failed"); 1363 } else { 1364 expectContainment(v, "defDEF", "abcABC"); 1365 } 1366 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec); 1367 if (U_FAILURE(ec)) { 1368 errln("FAIL: construct w/case mappings failed"); 1369 } else { 1370 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A")); 1371 } 1372} 1373 1374void UnicodeSetTest::TestEscapePattern() { 1375 const char pattern[] = 1376 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]"; 1377 const char exp[] = 1378 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]"; 1379 // We test this with two passes; in the second pass we 1380 // pre-unescape the pattern. Since U+200E is rule whitespace, 1381 // this fails -- which is what we expect. 
1382 for (int32_t pass=1; pass<=2; ++pass) { 1383 UErrorCode ec = U_ZERO_ERROR; 1384 UnicodeString pat(pattern, -1, US_INV); 1385 if (pass==2) { 1386 pat = pat.unescape(); 1387 } 1388 // Pattern is only good for pass 1 1389 UBool isPatternValid = (pass==1); 1390 1391 UnicodeSet set(pat, ec); 1392 if (U_SUCCESS(ec) != isPatternValid){ 1393 errln((UnicodeString)"FAIL: applyPattern(" + 1394 escape(pat) + ") => " + 1395 u_errorName(ec)); 1396 continue; 1397 } 1398 if (U_FAILURE(ec)) { 1399 continue; 1400 } 1401 if (set.contains((UChar)0x0644)){ 1402 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)"); 1403 } 1404 1405 UnicodeString newpat; 1406 set.toPattern(newpat, TRUE); 1407 if (newpat == UnicodeString(exp, -1, US_INV)) { 1408 logln(escape(pat) + " => " + newpat); 1409 } else { 1410 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat); 1411 } 1412 1413 for (int32_t i=0; i<set.getRangeCount(); ++i) { 1414 UnicodeString str("Range "); 1415 str.append((UChar)(0x30 + i)) 1416 .append(": ") 1417 .append((UChar32)set.getRangeStart(i)) 1418 .append(" - ") 1419 .append((UChar32)set.getRangeEnd(i)); 1420 str = str + " (" + set.getRangeStart(i) + " - " + 1421 set.getRangeEnd(i) + ")"; 1422 if (set.getRangeStart(i) < 0) { 1423 errln((UnicodeString)"FAIL: " + escape(str)); 1424 } else { 1425 logln(escape(str)); 1426 } 1427 } 1428 } 1429} 1430 1431void UnicodeSetTest::expectRange(const UnicodeString& label, 1432 const UnicodeSet& set, 1433 UChar32 start, UChar32 end) { 1434 UnicodeSet exp(start, end); 1435 UnicodeString pat; 1436 if (set == exp) { 1437 logln(label + " => " + set.toPattern(pat, TRUE)); 1438 } else { 1439 UnicodeString xpat; 1440 errln((UnicodeString)"FAIL: " + label + " => " + 1441 set.toPattern(pat, TRUE) + 1442 ", expected " + exp.toPattern(xpat, TRUE)); 1443 } 1444} 1445 1446void UnicodeSetTest::TestInvalidCodePoint() { 1447 1448 const UChar32 DATA[] = { 1449 // Test range Expected range 1450 0, 0x10FFFF, 0, 0x10FFFF, 1451 (UChar32)-1, 
8, 0, 8, 1452 8, 0x110000, 8, 0x10FFFF 1453 }; 1454 const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]); 1455 1456 UnicodeString pat; 1457 int32_t i; 1458 1459 for (i=0; i<DATA_LENGTH; i+=4) { 1460 UChar32 start = DATA[i]; 1461 UChar32 end = DATA[i+1]; 1462 UChar32 xstart = DATA[i+2]; 1463 UChar32 xend = DATA[i+3]; 1464 1465 // Try various API using the test code points 1466 1467 UnicodeSet set(start, end); 1468 expectRange((UnicodeString)"ct(" + start + "," + end + ")", 1469 set, xstart, xend); 1470 1471 set.clear(); 1472 set.set(start, end); 1473 expectRange((UnicodeString)"set(" + start + "," + end + ")", 1474 set, xstart, xend); 1475 1476 UBool b = set.contains(start); 1477 b = set.contains(start, end); 1478 b = set.containsNone(start, end); 1479 b = set.containsSome(start, end); 1480 1481 /*int32_t index = set.indexOf(start);*/ 1482 1483 set.clear(); 1484 set.add(start); 1485 set.add(start, end); 1486 expectRange((UnicodeString)"add(" + start + "," + end + ")", 1487 set, xstart, xend); 1488 1489 set.set(0, 0x10FFFF); 1490 set.retain(start, end); 1491 expectRange((UnicodeString)"retain(" + start + "," + end + ")", 1492 set, xstart, xend); 1493 set.retain(start); 1494 1495 set.set(0, 0x10FFFF); 1496 set.remove(start); 1497 set.remove(start, end); 1498 set.complement(); 1499 expectRange((UnicodeString)"!remove(" + start + "," + end + ")", 1500 set, xstart, xend); 1501 1502 set.set(0, 0x10FFFF); 1503 set.complement(start, end); 1504 set.complement(); 1505 expectRange((UnicodeString)"!complement(" + start + "," + end + ")", 1506 set, xstart, xend); 1507 set.complement(start); 1508 } 1509 1510 const UChar32 DATA2[] = { 1511 0, 1512 0x10FFFF, 1513 (UChar32)-1, 1514 0x110000 1515 }; 1516 const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]); 1517 1518 for (i=0; i<DATA2_LENGTH; ++i) { 1519 UChar32 c = DATA2[i], end = 0x10FFFF; 1520 UBool valid = (c >= 0 && c <= 0x10FFFF); 1521 1522 UnicodeSet set(0, 0x10FFFF); 1523 1524 // For single-codepoint contains, 
invalid codepoints are NOT contained 1525 UBool b = set.contains(c); 1526 if (b == valid) { 1527 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c + 1528 ") = " + b); 1529 } else { 1530 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c + 1531 ") = " + b); 1532 } 1533 1534 // For codepoint range contains, containsNone, and containsSome, 1535 // invalid or empty (start > end) ranges have UNDEFINED behavior. 1536 b = set.contains(c, end); 1537 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c + 1538 "," + end + ") = " + b); 1539 1540 b = set.containsNone(c, end); 1541 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c + 1542 "," + end + ") = " + b); 1543 1544 b = set.containsSome(c, end); 1545 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c + 1546 "," + end + ") = " + b); 1547 1548 int32_t index = set.indexOf(c); 1549 if ((index >= 0) == valid) { 1550 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c + 1551 ") = " + index); 1552 } else { 1553 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c + 1554 ") = " + index); 1555 } 1556 } 1557} 1558 1559// Used by TestSymbolTable 1560class TokenSymbolTable : public SymbolTable { 1561public: 1562 Hashtable contents; 1563 1564 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) { 1565 contents.setValueDeleter(uhash_deleteUnicodeString); 1566 } 1567 1568 ~TokenSymbolTable() {} 1569 1570 /** 1571 * (Non-SymbolTable API) Add the given variable and value to 1572 * the table. Variable should NOT contain leading '$'. 
1573 */ 1574 void add(const UnicodeString& var, const UnicodeString& value, 1575 UErrorCode& ec) { 1576 if (U_SUCCESS(ec)) { 1577 contents.put(var, new UnicodeString(value), ec); 1578 } 1579 } 1580 1581 /** 1582 * SymbolTable API 1583 */ 1584 virtual const UnicodeString* lookup(const UnicodeString& s) const { 1585 return (const UnicodeString*) contents.get(s); 1586 } 1587 1588 /** 1589 * SymbolTable API 1590 */ 1591 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const { 1592 return NULL; 1593 } 1594 1595 /** 1596 * SymbolTable API 1597 */ 1598 virtual UnicodeString parseReference(const UnicodeString& text, 1599 ParsePosition& pos, int32_t limit) const { 1600 int32_t start = pos.getIndex(); 1601 int32_t i = start; 1602 UnicodeString result; 1603 while (i < limit) { 1604 UChar c = text.charAt(i); 1605 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { 1606 break; 1607 } 1608 ++i; 1609 } 1610 if (i == start) { // No valid name chars 1611 return result; // Indicate failure with empty string 1612 } 1613 pos.setIndex(i); 1614 text.extractBetween(start, i, result); 1615 return result; 1616 } 1617}; 1618 1619void UnicodeSetTest::TestSymbolTable() { 1620 // Multiple test cases can be set up here. Each test case 1621 // is terminated by null: 1622 // var, value, var, value,..., input pat., exp. 
output pat., null 1623 const char* DATA[] = { 1624 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL, 1625 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL, 1626 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL, 1627 NULL 1628 }; 1629 1630 for (int32_t i=0; DATA[i]!=NULL; ++i) { 1631 UErrorCode ec = U_ZERO_ERROR; 1632 TokenSymbolTable sym(ec); 1633 if (U_FAILURE(ec)) { 1634 errln("FAIL: couldn't construct TokenSymbolTable"); 1635 continue; 1636 } 1637 1638 // Set up variables 1639 while (DATA[i+2] != NULL) { 1640 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec); 1641 if (U_FAILURE(ec)) { 1642 errln("FAIL: couldn't add to TokenSymbolTable"); 1643 continue; 1644 } 1645 i += 2; 1646 } 1647 1648 // Input pattern and expected output pattern 1649 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV); 1650 i += 2; 1651 1652 ParsePosition pos(0); 1653 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec); 1654 if (U_FAILURE(ec)) { 1655 errln("FAIL: couldn't construct UnicodeSet"); 1656 continue; 1657 } 1658 1659 // results 1660 if (pos.getIndex() != inpat.length()) { 1661 errln((UnicodeString)"Failed to read to end of string \"" 1662 + inpat + "\": read to " 1663 + pos.getIndex() + ", length is " 1664 + inpat.length()); 1665 } 1666 1667 UnicodeSet us2(exppat, ec); 1668 if (U_FAILURE(ec)) { 1669 errln("FAIL: couldn't construct expected UnicodeSet"); 1670 continue; 1671 } 1672 1673 UnicodeString a, b; 1674 if (us != us2) { 1675 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) + 1676 ", expected " + us2.toPattern(b, TRUE)); 1677 } else { 1678 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE)); 1679 } 1680 } 1681} 1682 1683void UnicodeSetTest::TestSurrogate() { 1684 const char* DATA[] = { 1685 // These should all behave identically 1686 "[abc\\uD800\\uDC00]", 1687 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java 1688 "[abc\\U00010000]", 1689 0 1690 }; 1691 for (int 
i=0; DATA[i] != 0; ++i) { 1692 UErrorCode ec = U_ZERO_ERROR; 1693 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV)); 1694 UnicodeString str = UnicodeString(DATA[i], -1, US_INV); 1695 UnicodeSet set(str, ec); 1696 if (U_FAILURE(ec)) { 1697 errln("FAIL: UnicodeSet constructor"); 1698 continue; 1699 } 1700 expectContainment(set, 1701 CharsToUnicodeString("abc\\U00010000"), 1702 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair 1703 if (set.size() != 4) { 1704 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " + 1705 set.size() + ", expected 4"); 1706 } 1707 } 1708} 1709 1710void UnicodeSetTest::TestExhaustive() { 1711 // exhaustive tests. Simulate UnicodeSets with integers. 1712 // That gives us very solid tests (except for large memory tests). 1713 1714 int32_t limit = 128; 1715 1716 UnicodeSet x, y, z, aa; 1717 1718 for (int32_t i = 0; i < limit; ++i) { 1719 bitsToSet(i, x); 1720 logln((UnicodeString)"Testing " + i + ", " + x); 1721 _testComplement(i, x, y); 1722 1723 // AS LONG AS WE ARE HERE, check roundtrip 1724 checkRoundTrip(bitsToSet(i, aa)); 1725 1726 for (int32_t j = 0; j < limit; ++j) { 1727 _testAdd(i,j, x,y,z); 1728 _testXor(i,j, x,y,z); 1729 _testRetain(i,j, x,y,z); 1730 _testRemove(i,j, x,y,z); 1731 } 1732 } 1733} 1734 1735void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) { 1736 bitsToSet(a, x); 1737 z = x; 1738 z.complement(); 1739 int32_t c = setToBits(z); 1740 if (c != (~a)) { 1741 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z); 1742 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c); 1743 } 1744 checkCanonicalRep(z, (UnicodeString)"complement " + a); 1745} 1746 1747void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1748 bitsToSet(a, x); 1749 bitsToSet(b, y); 1750 z = x; 1751 z.addAll(y); 1752 int32_t c = setToBits(z); 1753 if (c != (a | b)) { 1754 
errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z); 1755 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c); 1756 } 1757 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b); 1758} 1759 1760void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1761 bitsToSet(a, x); 1762 bitsToSet(b, y); 1763 z = x; 1764 z.retainAll(y); 1765 int32_t c = setToBits(z); 1766 if (c != (a & b)) { 1767 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z); 1768 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c); 1769 } 1770 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b); 1771} 1772 1773void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1774 bitsToSet(a, x); 1775 bitsToSet(b, y); 1776 z = x; 1777 z.removeAll(y); 1778 int32_t c = setToBits(z); 1779 if (c != (a &~ b)) { 1780 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z); 1781 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c); 1782 } 1783 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b); 1784} 1785 1786void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) { 1787 bitsToSet(a, x); 1788 bitsToSet(b, y); 1789 z = x; 1790 z.complementAll(y); 1791 int32_t c = setToBits(z); 1792 if (c != (a ^ b)) { 1793 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z); 1794 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c); 1795 } 1796 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b); 1797} 1798 1799/** 1800 * Check that ranges are monotonically increasing and non- 1801 * overlapping. 
1802 */ 1803void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) { 1804 int32_t n = set.getRangeCount(); 1805 if (n < 0) { 1806 errln((UnicodeString)"FAIL result of " + msg + 1807 ": range count should be >= 0 but is " + 1808 n /*+ " for " + set.toPattern())*/); 1809 return; 1810 } 1811 UChar32 last = 0; 1812 for (int32_t i=0; i<n; ++i) { 1813 UChar32 start = set.getRangeStart(i); 1814 UChar32 end = set.getRangeEnd(i); 1815 if (start > end) { 1816 errln((UnicodeString)"FAIL result of " + msg + 1817 ": range " + (i+1) + 1818 " start > end: " + (int)start + ", " + (int)end + 1819 " for " + set); 1820 } 1821 if (i > 0 && start <= last) { 1822 errln((UnicodeString)"FAIL result of " + msg + 1823 ": range " + (i+1) + 1824 " overlaps previous range: " + (int)start + ", " + (int)end + 1825 " for " + set); 1826 } 1827 last = end; 1828 } 1829} 1830 1831/** 1832 * Convert a bitmask to a UnicodeSet. 1833 */ 1834UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) { 1835 result.clear(); 1836 for (UChar32 i = 0; i < 32; ++i) { 1837 if ((a & (1<<i)) != 0) { 1838 result.add(i); 1839 } 1840 } 1841 return result; 1842} 1843 1844/** 1845 * Convert a UnicodeSet to a bitmask. Only the characters 1846 * U+0000 to U+0020 are represented in the bitmask. 1847 */ 1848int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) { 1849 int32_t result = 0; 1850 for (int32_t i = 0; i < 32; ++i) { 1851 if (x.contains((UChar32)i)) { 1852 result |= (1<<i); 1853 } 1854 } 1855 return result; 1856} 1857 1858/** 1859 * Return the representation of an inversion list based UnicodeSet 1860 * as a pairs list. Ranges are listed in ascending Unicode order. 1861 * For example, the set [a-zA-M3] is represented as "33AMaz". 
1862 */ 1863UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) { 1864 UnicodeString pairs; 1865 for (int32_t i=0; i<set.getRangeCount(); ++i) { 1866 UChar32 start = set.getRangeStart(i); 1867 UChar32 end = set.getRangeEnd(i); 1868 if (end > 0xFFFF) { 1869 end = 0xFFFF; 1870 i = set.getRangeCount(); // Should be unnecessary 1871 } 1872 pairs.append((UChar)start).append((UChar)end); 1873 } 1874 return pairs; 1875} 1876 1877/** 1878 * Basic consistency check for a few items. 1879 * That the iterator works, and that we can create a pattern and 1880 * get the same thing back 1881 */ 1882void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) { 1883 UErrorCode ec = U_ZERO_ERROR; 1884 1885 UnicodeSet t(s); 1886 checkEqual(s, t, "copy ct"); 1887 1888 t = s; 1889 checkEqual(s, t, "operator="); 1890 1891 copyWithIterator(t, s, FALSE); 1892 checkEqual(s, t, "iterator roundtrip"); 1893 1894 copyWithIterator(t, s, TRUE); // try range 1895 checkEqual(s, t, "iterator roundtrip"); 1896 1897 UnicodeString pat; s.toPattern(pat, FALSE); 1898 t.applyPattern(pat, ec); 1899 if (U_FAILURE(ec)) { 1900 errln("FAIL: applyPattern"); 1901 return; 1902 } else { 1903 checkEqual(s, t, "toPattern(false)"); 1904 } 1905 1906 s.toPattern(pat, TRUE); 1907 t.applyPattern(pat, ec); 1908 if (U_FAILURE(ec)) { 1909 errln("FAIL: applyPattern"); 1910 return; 1911 } else { 1912 checkEqual(s, t, "toPattern(true)"); 1913 } 1914} 1915 1916void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) { 1917 t.clear(); 1918 UnicodeSetIterator it(s); 1919 if (withRange) { 1920 while (it.nextRange()) { 1921 if (it.isString()) { 1922 t.add(it.getString()); 1923 } else { 1924 t.add(it.getCodepoint(), it.getCodepointEnd()); 1925 } 1926 } 1927 } else { 1928 while (it.next()) { 1929 if (it.isString()) { 1930 t.add(it.getString()); 1931 } else { 1932 t.add(it.getCodepoint()); 1933 } 1934 } 1935 } 1936} 1937 1938UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const 
UnicodeSet& t, const char* message) { 1939 UnicodeString source; s.toPattern(source, TRUE); 1940 UnicodeString result; t.toPattern(result, TRUE); 1941 if (s != t) { 1942 errln((UnicodeString)"FAIL: " + message 1943 + "; source = " + source 1944 + "; result = " + result 1945 ); 1946 return FALSE; 1947 } else { 1948 logln((UnicodeString)"Ok: " + message 1949 + "; source = " + source 1950 + "; result = " + result 1951 ); 1952 } 1953 return TRUE; 1954} 1955 1956void 1957UnicodeSetTest::expectContainment(const UnicodeString& pat, 1958 const UnicodeString& charsIn, 1959 const UnicodeString& charsOut) { 1960 UErrorCode ec = U_ZERO_ERROR; 1961 UnicodeSet set(pat, ec); 1962 if (U_FAILURE(ec)) { 1963 dataerrln((UnicodeString)"FAIL: pattern \"" + 1964 pat + "\" => " + u_errorName(ec)); 1965 return; 1966 } 1967 expectContainment(set, pat, charsIn, charsOut); 1968} 1969 1970void 1971UnicodeSetTest::expectContainment(const UnicodeSet& set, 1972 const UnicodeString& charsIn, 1973 const UnicodeString& charsOut) { 1974 UnicodeString pat; 1975 set.toPattern(pat); 1976 expectContainment(set, pat, charsIn, charsOut); 1977} 1978 1979void 1980UnicodeSetTest::expectContainment(const UnicodeSet& set, 1981 const UnicodeString& setName, 1982 const UnicodeString& charsIn, 1983 const UnicodeString& charsOut) { 1984 UnicodeString bad; 1985 UChar32 c; 1986 int32_t i; 1987 1988 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) { 1989 c = charsIn.char32At(i); 1990 if (!set.contains(c)) { 1991 bad.append(c); 1992 } 1993 } 1994 if (bad.length() > 0) { 1995 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) + 1996 ", expected containment of " + prettify(charsIn)); 1997 } else { 1998 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn)); 1999 } 2000 2001 bad.truncate(0); 2002 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) { 2003 c = charsOut.char32At(i); 2004 if (set.contains(c)) { 2005 bad.append(c); 2006 } 2007 } 2008 if (bad.length() > 0) { 
2009 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) + 2010 ", expected non-containment of " + prettify(charsOut)); 2011 } else { 2012 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut)); 2013 } 2014} 2015 2016void 2017UnicodeSetTest::expectPattern(UnicodeSet& set, 2018 const UnicodeString& pattern, 2019 const UnicodeString& expectedPairs){ 2020 UErrorCode status = U_ZERO_ERROR; 2021 set.applyPattern(pattern, status); 2022 if (U_FAILURE(status)) { 2023 errln(UnicodeString("FAIL: applyPattern(\"") + pattern + 2024 "\") failed"); 2025 return; 2026 } else { 2027 if (getPairs(set) != expectedPairs ) { 2028 errln(UnicodeString("FAIL: applyPattern(\"") + pattern + 2029 "\") => pairs \"" + 2030 escape(getPairs(set)) + "\", expected \"" + 2031 escape(expectedPairs) + "\""); 2032 } else { 2033 logln(UnicodeString("Ok: applyPattern(\"") + pattern + 2034 "\") => pairs \"" + 2035 escape(getPairs(set)) + "\""); 2036 } 2037 } 2038 // the result of calling set.toPattern(), which is the string representation of 2039 // this set(set), is passed to a UnicodeSet constructor, and tested that it 2040 // will produce another set that is equal to this one. 
2041 UnicodeString temppattern; 2042 set.toPattern(temppattern); 2043 UnicodeSet *tempset=new UnicodeSet(temppattern, status); 2044 if (U_FAILURE(status)) { 2045 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern")); 2046 return; 2047 } 2048 if(*tempset != set || getPairs(*tempset) != getPairs(set)){ 2049 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" + 2050 escape(getPairs(set)) + "\"")); 2051 } else{ 2052 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\"")); 2053 } 2054 2055 delete tempset; 2056 2057} 2058 2059void 2060UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) { 2061 if (getPairs(set) != expectedPairs) { 2062 errln(UnicodeString("FAIL: Expected pair list \"") + 2063 escape(expectedPairs) + "\", got \"" + 2064 escape(getPairs(set)) + "\""); 2065 } 2066} 2067 2068void UnicodeSetTest::expectToPattern(const UnicodeSet& set, 2069 const UnicodeString& expPat, 2070 const char** expStrings) { 2071 UnicodeString pat; 2072 set.toPattern(pat, TRUE); 2073 if (pat == expPat) { 2074 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\""); 2075 } else { 2076 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\""); 2077 return; 2078 } 2079 if (expStrings == NULL) { 2080 return; 2081 } 2082 UBool in = TRUE; 2083 for (int32_t i=0; expStrings[i] != NULL; ++i) { 2084 if (expStrings[i] == NOT) { // sic; pointer comparison 2085 in = FALSE; 2086 continue; 2087 } 2088 UnicodeString s = CharsToUnicodeString(expStrings[i]); 2089 UBool contained = set.contains(s); 2090 if (contained == in) { 2091 logln((UnicodeString)"Ok: " + expPat + 2092 (contained ? 
" contains {" : " does not contain {") + 2093 escape(expStrings[i]) + "}"); 2094 } else { 2095 errln((UnicodeString)"FAIL: " + expPat + 2096 (contained ? " contains {" : " does not contain {") + 2097 escape(expStrings[i]) + "}"); 2098 } 2099 } 2100} 2101 2102static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); } 2103 2104void 2105UnicodeSetTest::doAssert(UBool condition, const char *message) 2106{ 2107 if (!condition) { 2108 errln(UnicodeString("ERROR : ") + message); 2109 } 2110} 2111 2112UnicodeString 2113UnicodeSetTest::escape(const UnicodeString& s) { 2114 UnicodeString buf; 2115 for (int32_t i=0; i<s.length(); ) 2116 { 2117 UChar32 c = s.char32At(i); 2118 if (0x0020 <= c && c <= 0x007F) { 2119 buf += c; 2120 } else { 2121 if (c <= 0xFFFF) { 2122 buf += (UChar)0x5c; buf += (UChar)0x75; 2123 } else { 2124 buf += (UChar)0x5c; buf += (UChar)0x55; 2125 buf += toHexString((c & 0xF0000000) >> 28); 2126 buf += toHexString((c & 0x0F000000) >> 24); 2127 buf += toHexString((c & 0x00F00000) >> 20); 2128 buf += toHexString((c & 0x000F0000) >> 16); 2129 } 2130 buf += toHexString((c & 0xF000) >> 12); 2131 buf += toHexString((c & 0x0F00) >> 8); 2132 buf += toHexString((c & 0x00F0) >> 4); 2133 buf += toHexString(c & 0x000F); 2134 } 2135 i += U16_LENGTH(c); 2136 } 2137 return buf; 2138} 2139 2140void UnicodeSetTest::TestFreezable() { 2141 UErrorCode errorCode=U_ZERO_ERROR; 2142 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15); 2143 UnicodeSet idSet(idPattern, errorCode); 2144 if(U_FAILURE(errorCode)) { 2145 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode)); 2146 return; 2147 } 2148 2149 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15); 2150 UnicodeSet wsSet(wsPattern, errorCode); 2151 if(U_FAILURE(errorCode)) { 2152 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode)); 2153 return; 2154 } 2155 2156 idSet.add(idPattern); 2157 UnicodeSet 
frozen(idSet); 2158 frozen.freeze(); 2159 2160 if(idSet.isFrozen() || !frozen.isFrozen()) { 2161 errln("FAIL: isFrozen() is wrong"); 2162 } 2163 if(frozen!=idSet || !(frozen==idSet)) { 2164 errln("FAIL: a copy-constructed frozen set differs from its original"); 2165 } 2166 2167 frozen=wsSet; 2168 if(frozen!=idSet || !(frozen==idSet)) { 2169 errln("FAIL: a frozen set was modified by operator="); 2170 } 2171 2172 UnicodeSet frozen2(frozen); 2173 if(frozen2!=frozen || frozen2!=idSet) { 2174 errln("FAIL: a copied frozen set differs from its frozen original"); 2175 } 2176 if(!frozen2.isFrozen()) { 2177 errln("FAIL: copy-constructing a frozen set results in a thawed one"); 2178 } 2179 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction. 2180 if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) { 2181 errln("FAIL: UnicodeSet(5, 55) failed"); 2182 } 2183 frozen3=frozen; 2184 if(!frozen3.isFrozen()) { 2185 errln("FAIL: copying a frozen set results in a thawed one"); 2186 } 2187 2188 UnicodeSet *cloned=(UnicodeSet *)frozen.clone(); 2189 if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) { 2190 errln("FAIL: clone() failed"); 2191 } 2192 cloned->add(0xd802, 0xd805); 2193 if(cloned->containsSome(0xd802, 0xd805)) { 2194 errln("FAIL: unable to modify clone"); 2195 } 2196 delete cloned; 2197 2198 UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed(); 2199 if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) { 2200 errln("FAIL: cloneAsThawed() failed"); 2201 } 2202 thawed->add(0xd802, 0xd805); 2203 if(!thawed->contains(0xd802, 0xd805)) { 2204 errln("FAIL: unable to modify thawed clone"); 2205 } 2206 delete thawed; 2207 2208 frozen.set(5, 55); 2209 if(frozen!=idSet || !(frozen==idSet)) { 2210 errln("FAIL: UnicodeSet::set() modified a frozen set"); 2211 } 2212 2213 frozen.clear(); 2214 if(frozen!=idSet || !(frozen==idSet)) { 2215 
errln("FAIL: UnicodeSet::clear() modified a frozen set"); 2216 } 2217 2218 frozen.closeOver(USET_CASE_INSENSITIVE); 2219 if(frozen!=idSet || !(frozen==idSet)) { 2220 errln("FAIL: UnicodeSet::closeOver() modified a frozen set"); 2221 } 2222 2223 frozen.compact(); 2224 if(frozen!=idSet || !(frozen==idSet)) { 2225 errln("FAIL: UnicodeSet::compact() modified a frozen set"); 2226 } 2227 2228 ParsePosition pos; 2229 frozen. 2230 applyPattern(wsPattern, errorCode). 2231 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode). 2232 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode). 2233 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode). 2234 applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode); 2235 if(frozen!=idSet || !(frozen==idSet)) { 2236 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set"); 2237 } 2238 2239 frozen. 2240 add(0xd800). 2241 add(0xd802, 0xd805). 2242 add(wsPattern). 2243 addAll(idPattern). 2244 addAll(wsSet); 2245 if(frozen!=idSet || !(frozen==idSet)) { 2246 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set"); 2247 } 2248 2249 frozen. 2250 retain(0x62). 2251 retain(0x64, 0x69). 2252 retainAll(wsPattern). 2253 retainAll(wsSet); 2254 if(frozen!=idSet || !(frozen==idSet)) { 2255 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set"); 2256 } 2257 2258 frozen. 2259 remove(0x62). 2260 remove(0x64, 0x69). 2261 remove(idPattern). 2262 removeAll(idPattern). 2263 removeAll(idSet); 2264 if(frozen!=idSet || !(frozen==idSet)) { 2265 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set"); 2266 } 2267 2268 frozen. 2269 complement(). 2270 complement(0x62). 2271 complement(0x64, 0x69). 2272 complement(idPattern). 2273 complementAll(idPattern). 2274 complementAll(idSet); 2275 if(frozen!=idSet || !(frozen==idSet)) { 2276 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set"); 2277 } 2278} 2279 2280// Test span() etc. 
-------------------------------------------------------- *** 2281 2282// Append the UTF-8 version of the string to t and return the appended UTF-8 length. 2283static int32_t 2284appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) { 2285 UErrorCode errorCode=U_ZERO_ERROR; 2286 int32_t length8=0; 2287 u_strToUTF8(t, capacity, &length8, s, length, &errorCode); 2288 if(U_SUCCESS(errorCode)) { 2289 return length8; 2290 } else { 2291 // The string contains an unpaired surrogate. 2292 // Ignore this string. 2293 return 0; 2294 } 2295} 2296 2297class UnicodeSetWithStringsIterator; 2298 2299// Make the strings in a UnicodeSet easily accessible. 2300class UnicodeSetWithStrings { 2301public: 2302 UnicodeSetWithStrings(const UnicodeSet &normalSet) : 2303 set(normalSet), stringsLength(0), hasSurrogates(FALSE) { 2304 int32_t size=set.size(); 2305 if(size>0 && set.charAt(size-1)<0) { 2306 // If a set's last element is not a code point, then it must contain strings. 2307 // Iterate over the set, skip all code point ranges, and cache the strings. 2308 // Convert them to UTF-8 for spanUTF8(). 2309 UnicodeSetIterator iter(set); 2310 const UnicodeString *s; 2311 char *s8=utf8; 2312 int32_t length8, utf8Count=0; 2313 while(iter.nextRange() && stringsLength<LENGTHOF(strings)) { 2314 if(iter.isString()) { 2315 // Store the pointer to the set's string element 2316 // which we happen to know is a stable pointer. 2317 strings[stringsLength]=s=&iter.getString(); 2318 utf8Count+= 2319 utf8Lengths[stringsLength]=length8= 2320 appendUTF8(s->getBuffer(), s->length(), 2321 s8, (int32_t)(sizeof(utf8)-utf8Count)); 2322 if(length8==0) { 2323 hasSurrogates=TRUE; // Contains unpaired surrogates. 
2324 } 2325 s8+=length8; 2326 ++stringsLength; 2327 } 2328 } 2329 } 2330 } 2331 2332 const UnicodeSet &getSet() const { 2333 return set; 2334 } 2335 2336 UBool hasStrings() const { 2337 return (UBool)(stringsLength>0); 2338 } 2339 2340 UBool hasStringsWithSurrogates() const { 2341 return hasSurrogates; 2342 } 2343 2344private: 2345 friend class UnicodeSetWithStringsIterator; 2346 2347 const UnicodeSet &set; 2348 2349 const UnicodeString *strings[20]; 2350 int32_t stringsLength; 2351 UBool hasSurrogates; 2352 2353 char utf8[1024]; 2354 int32_t utf8Lengths[20]; 2355 2356 int32_t nextStringIndex; 2357 int32_t nextUTF8Start; 2358}; 2359 2360class UnicodeSetWithStringsIterator { 2361public: 2362 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) : 2363 fSet(set), nextStringIndex(0), nextUTF8Start(0) { 2364 } 2365 2366 void reset() { 2367 nextStringIndex=nextUTF8Start=0; 2368 } 2369 2370 const UnicodeString *nextString() { 2371 if(nextStringIndex<fSet.stringsLength) { 2372 return fSet.strings[nextStringIndex++]; 2373 } else { 2374 return NULL; 2375 } 2376 } 2377 2378 // Do not mix with calls to nextString(). 2379 const char *nextUTF8(int32_t &length) { 2380 if(nextStringIndex<fSet.stringsLength) { 2381 const char *s8=fSet.utf8+nextUTF8Start; 2382 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++]; 2383 return s8; 2384 } else { 2385 length=0; 2386 return NULL; 2387 } 2388 } 2389 2390private: 2391 const UnicodeSetWithStrings &fSet; 2392 int32_t nextStringIndex; 2393 int32_t nextUTF8Start; 2394}; 2395 2396// Compare 16-bit Unicode strings (which may be malformed UTF-16) 2397// at code point boundaries. 2398// That is, each edge of a match must not be in the middle of a surrogate pair. 
// Returns TRUE if t occurs in s at index start (s is valid up to index limit)
// and neither edge of that occurrence splits a surrogate pair in s.
static inline UBool
matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
    // From here on s points at the candidate match and limit is relative to it.
    s+=start;
    limit-=start;
    int32_t length=t.length();
    return 0==t.compare(s, length) &&
           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
}

// Implement span() with contains() for comparison.
// Returns the length of the span at the start of s[0..length[ for the given
// span condition, computed code point by code point with contains() and,
// if the set has strings, by comparing against each cached string.
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // Code points only: CONTAINED and SIMPLE behave the same.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Stop at the first position where a set code point or a set string starts.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit: longest span limit found so far through recursion.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Result: the farther of the iterated position and the best recursive result.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}

// Backward counterpart of containsSpanUTF16(): returns the start index of the
// span that ends at s[length], computed with contains() and string comparisons.
// Note that the `length` parameter is reused as the moving position.
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // length0 keeps the original length for the boundary check in matches16CPB().
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart: smallest span start found so far through recursion.
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}

// UTF-8 counterpart of containsSpanUTF16(). Ill-formed byte sequences
// (U8_NEXT yields c<0) are treated as U+FFFD. Cached strings with a UTF-8
// length of 0 are skipped.
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        while((prev=start)<length) {
            U8_NEXT(s, start, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT(s, next, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT(s, next, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}

// UTF-8 counterpart of containsSpanBackUTF16(). Ill-formed byte sequences
// (U8_PREV yields c<0) are treated as U+FFFD. Cached strings with a UTF-8
// length of 0 are skipped.
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}

// spans to be performed and compared
// Bit flags, combined into a whichSpans mask. Each group ends with a mask
// that is the OR of the group's flags.
enum {
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =3,

    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =0xc,

    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =0x30,

    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =0x300,

    SPAN_ALL            =0x33f
};

// Toggles between USET_SPAN_NOT_CONTAINED and the given `contained` condition.
static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
    return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
}

// Length of a NUL-terminated string: in UChars if isUTF16, otherwise in bytes.
static inline int32_t slen(const void *s, UBool isUTF16) {
    return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
}

/*
 * Count spans on a string with the method according to type and set the span limits.
 * The set may be the complement of the original.
 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
 * according to the expected number of spans.
 * Sets typeName to an empty string if there is no such type.
 * Returns -1 if the span option is filtered out.
2862 */ 2863static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement, 2864 const void *s, int32_t length, UBool isUTF16, 2865 uint32_t whichSpans, 2866 int type, const char *&typeName, 2867 int32_t limits[], int32_t limitsCapacity, 2868 int32_t expectCount) { 2869 const UnicodeSet &realSet(set.getSet()); 2870 int32_t start, count; 2871 USetSpanCondition spanCondition, firstSpanCondition, contained; 2872 UBool isForward; 2873 2874 if(type<0 || 7<type) { 2875 typeName=""; 2876 return 0; 2877 } 2878 2879 static const char *const typeNames16[]={ 2880 "contains", "contains(LM)", 2881 "span", "span(LM)", 2882 "containsBack", "containsBack(LM)", 2883 "spanBack", "spanBack(LM)" 2884 }; 2885 2886 static const char *const typeNames8[]={ 2887 "containsUTF8", "containsUTF8(LM)", 2888 "spanUTF8", "spanUTF8(LM)", 2889 "containsBackUTF8", "containsBackUTF8(LM)", // not implemented 2890 "spanBackUTF8", "spanBackUTF8(LM)" 2891 }; 2892 2893 typeName= isUTF16 ? typeNames16[type] : typeNames8[type]; 2894 2895 // filter span options 2896 if(type<=3) { 2897 // span forward 2898 if((whichSpans&SPAN_FWD)==0) { 2899 return -1; 2900 } 2901 isForward=TRUE; 2902 } else { 2903 // span backward 2904 if((whichSpans&SPAN_BACK)==0) { 2905 return -1; 2906 } 2907 isForward=FALSE; 2908 } 2909 if((type&1)==0) { 2910 // use USET_SPAN_CONTAINED 2911 if((whichSpans&SPAN_CONTAINED)==0) { 2912 return -1; 2913 } 2914 contained=USET_SPAN_CONTAINED; 2915 } else { 2916 // use USET_SPAN_SIMPLE 2917 if((whichSpans&SPAN_SIMPLE)==0) { 2918 return -1; 2919 } 2920 contained=USET_SPAN_SIMPLE; 2921 } 2922 2923 // Default first span condition for going forward with an uncomplemented set. 2924 spanCondition=USET_SPAN_NOT_CONTAINED; 2925 if(isComplement) { 2926 spanCondition=invertSpanCondition(spanCondition, contained); 2927 } 2928 2929 // First span condition for span(), used to terminate the spanBack() iteration. 
2930 firstSpanCondition=spanCondition; 2931 2932 // spanBack(): Its initial span condition is span()'s last span condition, 2933 // which is the opposite of span()'s first span condition 2934 // if we expect an even number of spans. 2935 // (The loop inverts spanCondition (expectCount-1) times 2936 // before the expectCount'th span() call.) 2937 // If we do not compare forward and backward directions, then we do not have an 2938 // expectCount and just start with firstSpanCondition. 2939 if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) { 2940 spanCondition=invertSpanCondition(spanCondition, contained); 2941 } 2942 2943 count=0; 2944 switch(type) { 2945 case 0: 2946 case 1: 2947 start=0; 2948 if(length<0) { 2949 length=slen(s, isUTF16); 2950 } 2951 for(;;) { 2952 start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) : 2953 containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition); 2954 if(count<limitsCapacity) { 2955 limits[count]=start; 2956 } 2957 ++count; 2958 if(start>=length) { 2959 break; 2960 } 2961 spanCondition=invertSpanCondition(spanCondition, contained); 2962 } 2963 break; 2964 case 2: 2965 case 3: 2966 start=0; 2967 for(;;) { 2968 start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) : 2969 realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition); 2970 if(count<limitsCapacity) { 2971 limits[count]=start; 2972 } 2973 ++count; 2974 if(length>=0 ? start>=length : 2975 isUTF16 ? ((const UChar *)s)[start]==0 : 2976 ((const char *)s)[start]==0 2977 ) { 2978 break; 2979 } 2980 spanCondition=invertSpanCondition(spanCondition, contained); 2981 } 2982 break; 2983 case 4: 2984 case 5: 2985 if(length<0) { 2986 length=slen(s, isUTF16); 2987 } 2988 for(;;) { 2989 ++count; 2990 if(count<=limitsCapacity) { 2991 limits[limitsCapacity-count]=length; 2992 } 2993 length= isUTF16 ? 
containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) : 2994 containsSpanBackUTF8(set, (const char *)s, length, spanCondition); 2995 if(length==0 && spanCondition==firstSpanCondition) { 2996 break; 2997 } 2998 spanCondition=invertSpanCondition(spanCondition, contained); 2999 } 3000 if(count<limitsCapacity) { 3001 memmove(limits, limits+(limitsCapacity-count), count*4); 3002 } 3003 break; 3004 case 6: 3005 case 7: 3006 for(;;) { 3007 ++count; 3008 if(count<=limitsCapacity) { 3009 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16); 3010 } 3011 // Note: Length<0 is tested only for the first spanBack(). 3012 // If we wanted to keep length<0 for all spanBack()s, we would have to 3013 // temporarily modify the string by placing a NUL where the previous spanBack() stopped. 3014 length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) : 3015 realSet.spanBackUTF8((const char *)s, length, spanCondition); 3016 if(length==0 && spanCondition==firstSpanCondition) { 3017 break; 3018 } 3019 spanCondition=invertSpanCondition(spanCondition, contained); 3020 } 3021 if(count<limitsCapacity) { 3022 memmove(limits, limits+(limitsCapacity-count), count*4); 3023 } 3024 break; 3025 default: 3026 typeName=""; 3027 return -1; 3028 } 3029 3030 return count; 3031} 3032 3033// sets to be tested; odd index=isComplement 3034enum { 3035 SLOW, 3036 SLOW_NOT, 3037 FAST, 3038 FAST_NOT, 3039 SET_COUNT 3040}; 3041 3042static const char *const setNames[SET_COUNT]={ 3043 "slow", 3044 "slow.not", 3045 "fast", 3046 "fast.not" 3047}; 3048 3049/* 3050 * Verify that we get the same results whether we look at text with contains(), 3051 * span() or spanBack(), using unfrozen or frozen versions of the set, 3052 * and using the set or its complement (switching the spanConditions accordingly). 3053 * The latter verifies that 3054 * set.span(spanCondition) == set.complement().span(!spanCondition). 
3055 * 3056 * The expectLimits[] are either provided by the caller (with expectCount>=0) 3057 * or returned to the caller (with an input expectCount<0). 3058 */ 3059void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4], 3060 const void *s, int32_t length, UBool isUTF16, 3061 uint32_t whichSpans, 3062 int32_t expectLimits[], int32_t &expectCount, 3063 const char *testName, int32_t index) { 3064 int32_t limits[500]; 3065 int32_t limitsCount; 3066 int i, j; 3067 3068 const char *typeName; 3069 int type; 3070 3071 for(i=0; i<SET_COUNT; ++i) { 3072 if((i&1)==0) { 3073 // Even-numbered sets are original, uncomplemented sets. 3074 if((whichSpans&SPAN_SET)==0) { 3075 continue; 3076 } 3077 } else { 3078 // Odd-numbered sets are complemented. 3079 if((whichSpans&SPAN_COMPLEMENT)==0) { 3080 continue; 3081 } 3082 } 3083 for(type=0;; ++type) { 3084 limitsCount=getSpans(*sets[i], (UBool)(i&1), 3085 s, length, isUTF16, 3086 whichSpans, 3087 type, typeName, 3088 limits, LENGTHOF(limits), expectCount); 3089 if(typeName[0]==0) { 3090 break; // All types tried. 3091 } 3092 if(limitsCount<0) { 3093 continue; // Span option filtered out. 
3094 } 3095 if(expectCount<0) { 3096 expectCount=limitsCount; 3097 if(limitsCount>LENGTHOF(limits)) { 3098 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans", 3099 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits)); 3100 return; 3101 } 3102 memcpy(expectLimits, limits, limitsCount*4); 3103 } else if(limitsCount!=expectCount) { 3104 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld", 3105 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount); 3106 } else { 3107 for(j=0; j<limitsCount; ++j) { 3108 if(limits[j]!=expectLimits[j]) { 3109 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld", 3110 testName, (long)index, setNames[i], typeName, (long)limitsCount, 3111 j, (long)limits[j], (long)expectLimits[j]); 3112 break; 3113 } 3114 } 3115 } 3116 } 3117 } 3118 3119 // Compare span() with containsAll()/containsNone(), 3120 // but only if we have expectLimits[] from the uncomplemented set. 
3121 if(isUTF16 && (whichSpans&SPAN_SET)!=0) { 3122 const UChar *s16=(const UChar *)s; 3123 UnicodeString string; 3124 int32_t prev=0, limit, length; 3125 for(i=0; i<expectCount; ++i) { 3126 limit=expectLimits[i]; 3127 length=limit-prev; 3128 if(length>0) { 3129 string.setTo(FALSE, s16+prev, length); // read-only alias 3130 if(i&1) { 3131 if(!sets[SLOW]->getSet().containsAll(string)) { 3132 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()", 3133 testName, (long)index, setNames[SLOW], (long)prev, (long)limit); 3134 return; 3135 } 3136 if(!sets[FAST]->getSet().containsAll(string)) { 3137 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()", 3138 testName, (long)index, setNames[FAST], (long)prev, (long)limit); 3139 return; 3140 } 3141 } else { 3142 if(!sets[SLOW]->getSet().containsNone(string)) { 3143 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()", 3144 testName, (long)index, setNames[SLOW], (long)prev, (long)limit); 3145 return; 3146 } 3147 if(!sets[FAST]->getSet().containsNone(string)) { 3148 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()", 3149 testName, (long)index, setNames[FAST], (long)prev, (long)limit); 3150 return; 3151 } 3152 } 3153 } 3154 prev=limit; 3155 } 3156 } 3157} 3158 3159// Specifically test either UTF-16 or UTF-8. 
3160void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4], 3161 const void *s, int32_t length, UBool isUTF16, 3162 uint32_t whichSpans, 3163 const char *testName, int32_t index) { 3164 int32_t expectLimits[500]; 3165 int32_t expectCount=-1; 3166 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index); 3167} 3168 3169UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) { 3170 UChar c, c2; 3171 3172 if(length>=0) { 3173 while(length>0) { 3174 c=*s++; 3175 --length; 3176 if(0xd800<=c && c<0xe000) { 3177 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) { 3178 return TRUE; 3179 } 3180 --length; 3181 } 3182 } 3183 } else { 3184 while((c=*s++)!=0) { 3185 if(0xd800<=c && c<0xe000) { 3186 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) { 3187 return TRUE; 3188 } 3189 } 3190 } 3191 } 3192 return FALSE; 3193} 3194 3195// Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text, 3196// unless either UTF is turned off in whichSpans. 3197// Testing UTF-16 and UTF-8 together requires that surrogate code points 3198// have the same contains(c) value as U+FFFD. 3199void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4], 3200 const UChar *s16, int32_t length16, 3201 uint32_t whichSpans, 3202 const char *testName, int32_t index) { 3203 int32_t expectLimits[500]; 3204 int32_t expectCount; 3205 3206 expectCount=-1; // Get expectLimits[] from testSpan(). 3207 3208 if((whichSpans&SPAN_UTF16)!=0) { 3209 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index); 3210 } 3211 if((whichSpans&SPAN_UTF8)==0) { 3212 return; 3213 } 3214 3215 // Convert s16[] and expectLimits[] to UTF-8. 
3216 uint8_t s8[3000]; 3217 int32_t offsets[3000]; 3218 3219 const UChar *s16Limit=s16+length16; 3220 char *t=(char *)s8; 3221 char *tLimit=t+sizeof(s8); 3222 int32_t *o=offsets; 3223 UErrorCode errorCode=U_ZERO_ERROR; 3224 3225 // Convert with substitution: Turn unpaired surrogates into U+FFFD. 3226 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode); 3227 if(U_FAILURE(errorCode)) { 3228 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s", 3229 testName, (long)index, u_errorName(errorCode)); 3230 ucnv_resetFromUnicode(utf8Cnv); 3231 return; 3232 } 3233 int32_t length8=(int32_t)(t-(char *)s8); 3234 3235 // Convert expectLimits[]. 3236 int32_t i, j, expect; 3237 for(i=j=0; i<expectCount; ++i) { 3238 expect=expectLimits[i]; 3239 if(expect==length16) { 3240 expectLimits[i]=length8; 3241 } else { 3242 while(offsets[j]<expect) { 3243 ++j; 3244 } 3245 expectLimits[i]=j; 3246 } 3247 } 3248 3249 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index); 3250} 3251 3252static UChar32 nextCodePoint(UChar32 c) { 3253 // Skip some large and boring ranges. 3254 switch(c) { 3255 case 0x3441: 3256 return 0x4d7f; 3257 case 0x5100: 3258 return 0x9f00; 3259 case 0xb040: 3260 return 0xd780; 3261 case 0xe041: 3262 return 0xf8fe; 3263 case 0x10100: 3264 return 0x20000; 3265 case 0x20041: 3266 return 0xe0000; 3267 case 0xe0101: 3268 return 0x10fffd; 3269 default: 3270 return c+1; 3271 } 3272} 3273 3274// Verify that all implementations represent the same set. 3275void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3276 // contains(U+FFFD) is inconsistent with contains(some surrogates), 3277 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8: 3278 // Skip the UTF-8 part of the test - if the string contains surrogates - 3279 // because it is likely to produce a different result. 
3280 UBool inconsistentSurrogates= 3281 (!(sets[0]->getSet().contains(0xfffd) ? 3282 sets[0]->getSet().contains(0xd800, 0xdfff) : 3283 sets[0]->getSet().containsNone(0xd800, 0xdfff)) || 3284 sets[0]->hasStringsWithSurrogates()); 3285 3286 UChar s[1000]; 3287 int32_t length=0; 3288 uint32_t localWhichSpans; 3289 3290 UChar32 c, first; 3291 for(first=c=0;; c=nextCodePoint(c)) { 3292 if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) { 3293 localWhichSpans=whichSpans; 3294 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) { 3295 localWhichSpans&=~SPAN_UTF8; 3296 } 3297 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first); 3298 if(c>0x10ffff) { 3299 break; 3300 } 3301 length=0; 3302 first=c; 3303 } 3304 U16_APPEND_UNSAFE(s, length, c); 3305 } 3306} 3307 3308// Test with a particular, interesting string. 3309// Specify length and try NUL-termination. 3310void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3311 static const UChar s[]={ 3312 0x61, 0x62, 0x20, // Latin, space 3313 0x3b1, 0x3b2, 0x3b3, // Greek 3314 0xd900, // lead surrogate 3315 0x3000, 0x30ab, 0x30ad, // wide space, Katakana 3316 0xdc05, // trail surrogate 3317 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul 3318 0xd900, 0xdc05, // unassigned supplementary 3319 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary 3320 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS 3321 0 // NUL 3322 }; 3323 3324 if((whichSpans&SPAN_UTF16)==0) { 3325 return; 3326 } 3327 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0); 3328 testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1); 3329} 3330 3331void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) { 3332 static const char s[]={ 3333 "abc" // Latin 3334 3335 /* trail byte in lead position */ 3336 "\x80" 3337 3338 " " // space 3339 3340 /* 
truncated multi-byte sequences */ 3341 "\xd0" 3342 "\xe0" 3343 "\xe1" 3344 "\xed" 3345 "\xee" 3346 "\xf0" 3347 "\xf1" 3348 "\xf4" 3349 "\xf8" 3350 "\xfc" 3351 3352 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek 3353 3354 /* trail byte in lead position */ 3355 "\x80" 3356 3357 "\xe0\x80" 3358 "\xe0\xa0" 3359 "\xe1\x80" 3360 "\xed\x80" 3361 "\xed\xa0" 3362 "\xee\x80" 3363 "\xf0\x80" 3364 "\xf0\x90" 3365 "\xf1\x80" 3366 "\xf4\x80" 3367 "\xf4\x90" 3368 "\xf8\x80" 3369 "\xfc\x80" 3370 3371 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana 3372 3373 /* trail byte in lead position */ 3374 "\x80" 3375 3376 "\xf0\x80\x80" 3377 "\xf0\x90\x80" 3378 "\xf1\x80\x80" 3379 "\xf4\x80\x80" 3380 "\xf4\x90\x80" 3381 "\xf8\x80\x80" 3382 "\xfc\x80\x80" 3383 3384 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul 3385 3386 /* trail byte in lead position */ 3387 "\x80" 3388 3389 "\xf8\x80\x80\x80" 3390 "\xfc\x80\x80\x80" 3391 3392 "\xF1\x90\x80\x85" // unassigned supplementary 3393 3394 /* trail byte in lead position */ 3395 "\x80" 3396 3397 "\xfc\x80\x80\x80\x80" 3398 3399 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary 3400 3401 /* trail byte in lead position */ 3402 "\x80" 3403 3404 /* complete sequences but non-shortest forms or out of range etc. */ 3405 "\xc0\x80" 3406 "\xe0\x80\x80" 3407 "\xed\xa0\x80" 3408 "\xf0\x80\x80\x80" 3409 "\xf4\x90\x80\x80" 3410 "\xf8\x80\x80\x80\x80" 3411 "\xfc\x80\x80\x80\x80\x80" 3412 "\xfe" 3413 "\xff" 3414 3415 /* trail byte in lead position */ 3416 "\x80" 3417 3418 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated 3419 }; 3420 3421 if((whichSpans&SPAN_UTF8)==0) { 3422 return; 3423 } 3424 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0); 3425 testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1); 3426} 3427 3428// Take a set of span options and multiply them so that 3429// each portion only has one of the options a, b and c. 
3430// If b==0, then the set of options is just modified with mask and a. 3431// If b!=0 and c==0, then the set of options is just modified with mask, a and b. 3432static int32_t 3433addAlternative(uint32_t whichSpans[], int32_t whichSpansCount, 3434 uint32_t mask, uint32_t a, uint32_t b, uint32_t c) { 3435 uint32_t s; 3436 int32_t i; 3437 3438 for(i=0; i<whichSpansCount; ++i) { 3439 s=whichSpans[i]&mask; 3440 whichSpans[i]=s|a; 3441 if(b!=0) { 3442 whichSpans[whichSpansCount+i]=s|b; 3443 if(c!=0) { 3444 whichSpans[2*whichSpansCount+i]=s|c; 3445 } 3446 } 3447 } 3448 return b==0 ? whichSpansCount : c==0 ? 2*whichSpansCount : 3*whichSpansCount; 3449} 3450 3451#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 3452#define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 3453#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" 3454#define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" 3455 3456void UnicodeSetTest::TestSpan() { 3457 // "[...]" is a UnicodeSet pattern. 3458 // "*" performs tests on all Unicode code points and on a selection of 3459 // malformed UTF-8/16 strings. 3460 // "-options" limits the scope of testing for the current set. 3461 // By default, the test verifies that equivalent boundaries are found 3462 // for UTF-16 and UTF-8, going forward and backward, 3463 // alternating USET_SPAN_NOT_CONTAINED with 3464 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE. 3465 // Single-character options: 3466 // 8 -- UTF-16 and UTF-8 boundaries may differ. 3467 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates), 3468 // or the set contains strings with unpaired surrogates 3469 // which do not translate to valid UTF-8. 3470 // c -- set.span() and set.complement().span() boundaries may differ. 3471 // Cause: Set strings are not complemented. 3472 // b -- span() and spanBack() boundaries may differ. 
3473 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED) 3474 // and spanBack(USET_SPAN_SIMPLE) are defined to 3475 // match with non-overlapping substrings. 3476 // For example, with a set containing "ab" and "ba", 3477 // span() of "aba" yields boundaries { 0, 2, 3 } 3478 // because the initial "ab" matches from 0 to 2, 3479 // while spanBack() yields boundaries { 0, 1, 3 } 3480 // because the final "ba" matches from 1 to 3. 3481 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ. 3482 // Cause: Strings in the set overlap, and a longer match may 3483 // require a sequence including non-longest substrings. 3484 // For example, with a set containing "ab", "abc" and "cd", 3485 // span(contained) of "abcd" spans the entire string 3486 // but span(longest match) only spans the first 3 characters. 3487 // Each "-options" first resets all options and then applies the specified options. 3488 // A "-" without options resets the options. 3489 // The options are also reset for each new set. 3490 // Other strings will be spanned. 3491 static const char *const testdata[]={ 3492 "[:ID_Continue:]", 3493 "*", 3494 "[:White_Space:]", 3495 "*", 3496 "[]", 3497 "*", 3498 "[\\u0000-\\U0010FFFF]", 3499 "*", 3500 "[\\u0000\\u0080\\u0800\\U00010000]", 3501 "*", 3502 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]", 3503 "*", 3504 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]", 3505 "-c", 3506 "*", 3507 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]", 3508 "-c", 3509 "*", 3510 3511 // Overlapping strings cause overlapping attempts to match. 3512 "[x{xy}{xya}{axy}{ax}]", 3513 "-cl", 3514 3515 // More repetitions of "xya" would take too long with the recursive 3516 // reference implementation. 3517 // containsAll()=FALSE 3518 // test_string 0x14 3519 "xx" 3520 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here. 
3521 "xx" // set.complement().span(contained) will stop between the two 'x'es. 3522 "xyaxyaxyaxya" 3523 "xx" 3524 "xyaxyaxyaxya" // span() ends here. 3525 "aaa", 3526 3527 // containsAll()=TRUE 3528 // test_string 0x15 3529 "xx" 3530 "xyaxyaxyaxya" 3531 "xx" 3532 "xyaxyaxyaxya" 3533 "xx" 3534 "xyaxyaxyaxy", 3535 3536 "-bc", 3537 // test_string 0x17 3538 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 } 3539 "-c", 3540 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 } 3541 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 } 3542 "-", 3543 "byaya", // span() -> { 5 } 3544 "byay", // span() -> { 4 } 3545 "bya", // span() -> { 3 } 3546 3547 // span(longest match) will not span the whole string. 3548 "[a{ab}{bc}]", 3549 "-cl", 3550 // test_string 0x21 3551 "abc", 3552 3553 "[a{ab}{abc}{cd}]", 3554 "-cl", 3555 "acdabcdabccd", 3556 3557 // spanBack(longest match) will not span the whole string. 3558 "[c{ab}{bc}]", 3559 "-cl", 3560 "abc", 3561 3562 "[d{cd}{bcd}{ab}]", 3563 "-cl", 3564 "abbcdabcdabd", 3565 3566 // Test with non-ASCII set strings - test proper handling of surrogate pairs 3567 // and UTF-8 trail bytes. 3568 // Copies of above test sets and strings, but transliterated to have 3569 // different code points with similar trail units. 3570 // Previous: a b c d 3571 // Unicode: 042B 30AB 200AB 204AB 3572 // UTF-16: 042B 30AB D840 DCAB D841 DCAB 3573 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB 3574 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]", 3575 "-cl", 3576 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB", 3577 3578 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]", 3579 "-cl", 3580 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB", 3581 3582 // Stress bookkeeping and recursion. 
3583 // The following strings are barely doable with the recursive 3584 // reference implementation. 3585 // The not-contained character at the end prevents an early exit from the span(). 3586 "[b{bb}]", 3587 "-c", 3588 // test_string 0x33 3589 "bbbbbbbbbbbbbbbbbbbbbbbb-", 3590 // On complement sets, span() and spanBack() get different results 3591 // because b is not in the complement set and there is an odd number of b's 3592 // in the test string. 3593 "-bc", 3594 "bbbbbbbbbbbbbbbbbbbbbbbbb-", 3595 3596 // Test with set strings with an initial or final code point span 3597 // longer than 254. 3598 "[a{" _64_a _64_a _64_a _64_a "b}" 3599 "{a" _64_b _64_b _64_b _64_b "}]", 3600 "-c", 3601 _64_a _64_a _64_a _63_a "b", 3602 _64_a _64_a _64_a _64_a "b", 3603 _64_a _64_a _64_a _64_a "aaaabbbb", 3604 "a" _64_b _64_b _64_b _63_b, 3605 "a" _64_b _64_b _64_b _64_b, 3606 "aaaabbbb" _64_b _64_b _64_b _64_b, 3607 3608 // Test with strings containing unpaired surrogates. 3609 // They are not representable in UTF-8, and a leading trail surrogate 3610 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair. 3611 // U+20001 == \\uD840\\uDC01 3612 // U+20400 == \\uD841\\uDC00 3613 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]", 3614 "-8cl", 3615 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb" 3616 }; 3617 uint32_t whichSpans[96]={ SPAN_ALL }; 3618 int32_t whichSpansCount=1; 3619 3620 UnicodeSet *sets[SET_COUNT]={ NULL }; 3621 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL }; 3622 3623 char testName[1024]; 3624 char *testNameLimit=testName; 3625 3626 int32_t i, j; 3627 for(i=0; i<LENGTHOF(testdata); ++i) { 3628 const char *s=testdata[i]; 3629 if(s[0]=='[') { 3630 // Create new test sets from this pattern. 
3631 for(j=0; j<SET_COUNT; ++j) { 3632 delete sets_with_str[j]; 3633 delete sets[j]; 3634 } 3635 UErrorCode errorCode=U_ZERO_ERROR; 3636 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode); 3637 if(U_FAILURE(errorCode)) { 3638 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode)); 3639 break; 3640 } 3641 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]); 3642 sets[SLOW_NOT]->complement(); 3643 // Intermediate set: Test cloning of a frozen set. 3644 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]); 3645 fast->freeze(); 3646 sets[FAST]=(UnicodeSet *)fast->clone(); 3647 delete fast; 3648 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]); 3649 fastNot->freeze(); 3650 sets[FAST_NOT]=(UnicodeSet *)fastNot->clone(); 3651 delete fastNot; 3652 3653 for(j=0; j<SET_COUNT; ++j) { 3654 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]); 3655 } 3656 3657 strcpy(testName, s); 3658 testNameLimit=strchr(testName, 0); 3659 *testNameLimit++=':'; 3660 *testNameLimit=0; 3661 3662 whichSpans[0]=SPAN_ALL; 3663 whichSpansCount=1; 3664 } else if(s[0]=='-') { 3665 whichSpans[0]=SPAN_ALL; 3666 whichSpansCount=1; 3667 3668 while(*++s!=0) { 3669 switch(*s) { 3670 case 'c': 3671 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3672 ~SPAN_POLARITY, 3673 SPAN_SET, 3674 SPAN_COMPLEMENT, 3675 0); 3676 break; 3677 case 'b': 3678 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3679 ~SPAN_DIRS, 3680 SPAN_FWD, 3681 SPAN_BACK, 3682 0); 3683 break; 3684 case 'l': 3685 // test USET_SPAN_CONTAINED FWD & BACK, and separately 3686 // USET_SPAN_SIMPLE only FWD, and separately 3687 // USET_SPAN_SIMPLE only BACK 3688 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3689 ~(SPAN_DIRS|SPAN_CONDITION), 3690 SPAN_DIRS|SPAN_CONTAINED, 3691 SPAN_FWD|SPAN_SIMPLE, 3692 SPAN_BACK|SPAN_SIMPLE); 3693 break; 3694 case '8': 3695 whichSpansCount=addAlternative(whichSpans, whichSpansCount, 3696 ~SPAN_UTFS, 3697 SPAN_UTF16, 3698 SPAN_UTF8, 3699 
0); 3700 break; 3701 default: 3702 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]); 3703 break; 3704 } 3705 } 3706 } else if(0==strcmp(s, "*")) { 3707 strcpy(testNameLimit, "bad_string"); 3708 for(j=0; j<whichSpansCount; ++j) { 3709 if(whichSpansCount>1) { 3710 sprintf(testNameLimit+10 /* strlen("bad_string") */, 3711 "%%0x%3x", 3712 whichSpans[j]); 3713 } 3714 testSpanUTF16String(sets_with_str, whichSpans[j], testName); 3715 testSpanUTF8String(sets_with_str, whichSpans[j], testName); 3716 } 3717 3718 strcpy(testNameLimit, "contents"); 3719 for(j=0; j<whichSpansCount; ++j) { 3720 if(whichSpansCount>1) { 3721 sprintf(testNameLimit+8 /* strlen("contents") */, 3722 "%%0x%3x", 3723 whichSpans[j]); 3724 } 3725 testSpanContents(sets_with_str, whichSpans[j], testName); 3726 } 3727 } else { 3728 UnicodeString string=UnicodeString(s, -1, US_INV).unescape(); 3729 strcpy(testNameLimit, "test_string"); 3730 for(j=0; j<whichSpansCount; ++j) { 3731 if(whichSpansCount>1) { 3732 sprintf(testNameLimit+11 /* strlen("test_string") */, 3733 "%%0x%3x", 3734 whichSpans[j]); 3735 } 3736 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i); 3737 } 3738 } 3739 } 3740 for(j=0; j<SET_COUNT; ++j) { 3741 delete sets_with_str[j]; 3742 delete sets[j]; 3743 } 3744} 3745 3746// Test select patterns and strings, and test USET_SPAN_SIMPLE. 
3747void UnicodeSetTest::TestStringSpan() { 3748 static const char *pattern="[x{xy}{xya}{axy}{ax}]"; 3749 static const char *const string= 3750 "xx" 3751 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" 3752 "xx" 3753 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" 3754 "xx" 3755 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy" 3756 "aaaa"; 3757 3758 UErrorCode errorCode=U_ZERO_ERROR; 3759 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV); 3760 UnicodeSet set(pattern16, errorCode); 3761 if(U_FAILURE(errorCode)) { 3762 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3763 return; 3764 } 3765 3766 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape(); 3767 3768 if(set.containsAll(string16)) { 3769 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string); 3770 } 3771 3772 // Remove trailing "aaaa". 3773 string16.truncate(string16.length()-4); 3774 if(!set.containsAll(string16)) { 3775 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string); 3776 } 3777 3778 string16=UNICODE_STRING_SIMPLE("byayaxya"); 3779 const UChar *s16=string16.getBuffer(); 3780 int32_t length16=string16.length(); 3781 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 || 3782 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 || 3783 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 || 3784 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 || 3785 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 || 3786 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3 3787 ) { 3788 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern); 3789 } 3790 3791 pattern="[a{ab}{abc}{cd}]"; 3792 pattern16=UnicodeString(pattern, -1, US_INV); 3793 set.applyPattern(pattern16, errorCode); 3794 if(U_FAILURE(errorCode)) { 3795 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3796 return; 3797 } 3798 
string16=UNICODE_STRING_SIMPLE("acdabcdabccd"); 3799 s16=string16.getBuffer(); 3800 length16=string16.length(); 3801 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 || 3802 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 || 3803 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5 3804 ) { 3805 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern); 3806 } 3807 3808 pattern="[d{cd}{bcd}{ab}]"; 3809 pattern16=UnicodeString(pattern, -1, US_INV); 3810 set.applyPattern(pattern16, errorCode).freeze(); 3811 if(U_FAILURE(errorCode)) { 3812 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode)); 3813 return; 3814 } 3815 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd"); 3816 s16=string16.getBuffer(); 3817 length16=string16.length(); 3818 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 || 3819 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 || 3820 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0 3821 ) { 3822 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern); 3823 } 3824} 3825