1/* GENERATED SOURCE. DO NOT MODIFY. */ 2// © 2016 and later: Unicode, Inc. and others. 3// License & terms of use: http://www.unicode.org/copyright.html#License 4/* 5 ******************************************************************************* 6 * Copyright (C) 2009-2014, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10package android.icu.dev.test.lang; 11 12import java.util.Collection; 13 14import org.junit.Test; 15 16import android.icu.dev.test.TestFmwk; 17import android.icu.impl.Utility; 18import android.icu.text.UTF16; 19import android.icu.text.UnicodeSet; 20import android.icu.text.UnicodeSet.SpanCondition; 21import android.icu.util.OutputInt; 22 23/** 24 * @test 25 * @summary General test of UnicodeSet string span. 26 */ 27public class UnicodeSetStringSpanTest extends TestFmwk { 28 // Simple test first, easier to debug. 29 @Test 30 public void TestSimpleStringSpan() { 31 String pattern = "[a{ab}{bc}]"; 32 String string = "abc"; 33 UnicodeSet set = new UnicodeSet(pattern); 34 set.complement(); 35 int pos = set.spanBack(string, 3, SpanCondition.SIMPLE); 36 if (pos != 1) { 37 errln(String.format("FAIL: UnicodeSet(%s).spanBack(%s) returns the wrong value pos %d (!= 1)", 38 set.toString(), string, pos)); 39 } 40 pos = set.span(string, SpanCondition.SIMPLE); 41 if (pos != 3) { 42 errln(String.format("FAIL: UnicodeSet(%s).span(%s) returns the wrong value pos %d (!= 3)", 43 set.toString(), string, pos)); 44 } 45 pos = set.span(string, 1, SpanCondition.SIMPLE); 46 if (pos != 3) { 47 errln(String.format("FAIL: UnicodeSet(%s).span(%s, 1) returns the wrong value pos %d (!= 3)", 48 set.toString(), string, pos)); 49 } 50 } 51 52 // test our slow implementation 53 @Test 54 public void TestSimpleStringSpanSlow() { 55 String pattern = "[a{ab}{bc}]"; 56 String string = "abc"; 57 UnicodeSet uset = new UnicodeSet(pattern); 58 uset.complement(); 59 UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset); 60 61 int length = containsSpanBackUTF16(set, string, 3, SpanCondition.SIMPLE); 62 if (length != 1) { 63 errln(String.format("FAIL: UnicodeSet(%s) containsSpanBackUTF16(%s) returns the wrong value length %d (!= 1)", 64 set.toString(), string, length)); 65 } 66 length = containsSpanUTF16(set, string, SpanCondition.SIMPLE); 67 if (length != 3) { 68 errln(String.format("FAIL: UnicodeSet(%s) containsSpanUTF16(%s) returns the wrong value length %d (!= 3)", 69 set.toString(), string, length)); 70 } 71 length = containsSpanUTF16(set, string.substring(1), SpanCondition.SIMPLE); 72 if (length != 2) { 73 errln(String.format("FAIL: UnicodeSet(%s) containsSpanUTF16(%s) returns the wrong value length %d (!= 2)", 74 set.toString(), string, length)); 75 } 76 } 77 78 // Test select patterns and strings, and test SIMPLE. 79 @Test 80 public void TestSimpleStringSpanAndFreeze() { 81 String pattern = "[x{xy}{xya}{axy}{ax}]"; 82 final String string = "xx" 83 + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" + "xx" 84 + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" + "xx" 85 + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy" + "aaaa"; 86 87 UnicodeSet set = new UnicodeSet(pattern); 88 if (set.containsAll(string)) { 89 errln("FAIL: UnicodeSet(" + pattern + ").containsAll(" + string + ") should be FALSE"); 90 } 91 92 // Remove trailing "aaaa". 93 String string16 = string.substring(0, string.length() - 4); 94 if (!set.containsAll(string16)) { 95 errln("FAIL: UnicodeSet(" + pattern + ").containsAll(" + string + "[:-4]) should be TRUE"); 96 } 97 98 String s16 = "byayaxya"; 99 if ( set.span(s16.substring(0, 8), SpanCondition.NOT_CONTAINED) != 4 100 || set.span(s16.substring(0, 7), SpanCondition.NOT_CONTAINED) != 4 101 || set.span(s16.substring(0, 6), SpanCondition.NOT_CONTAINED) != 4 102 || set.span(s16.substring(0, 5), SpanCondition.NOT_CONTAINED) != 5 103 || set.span(s16.substring(0, 4), SpanCondition.NOT_CONTAINED) != 4 104 || set.span(s16.substring(0, 3), SpanCondition.NOT_CONTAINED) != 3) { 105 errln("FAIL: UnicodeSet(" + pattern + ").span(while not) returns the wrong value"); 106 } 107 108 pattern = "[a{ab}{abc}{cd}]"; 109 set.applyPattern(pattern); 110 s16 = "acdabcdabccd"; 111 if ( set.span(s16.substring(0, 12), SpanCondition.CONTAINED) != 12 112 || set.span(s16.substring(0, 12), SpanCondition.SIMPLE) != 6 113 || set.span(s16.substring(7), SpanCondition.SIMPLE) != 5) { 114 errln("FAIL: UnicodeSet(" + pattern + ").span(while longest match) returns the wrong value"); 115 } 116 set.freeze(); 117 if ( set.span(s16.substring(0, 12), SpanCondition.CONTAINED) != 12 118 || set.span(s16.substring(0, 12), SpanCondition.SIMPLE) != 6 119 || set.span(s16.substring(7), SpanCondition.SIMPLE) != 5) { 120 errln("FAIL: UnicodeSet(" + pattern + ").span(while longest match) returns the wrong value"); 121 } 122 123 pattern = "[d{cd}{bcd}{ab}]"; 124 set = (UnicodeSet)set.cloneAsThawed(); 125 set.applyPattern(pattern).freeze(); 126 s16 = "abbcdabcdabd"; 127 if ( set.spanBack(s16, 12, SpanCondition.CONTAINED) != 0 128 || set.spanBack(s16, 12, SpanCondition.SIMPLE) != 6 129 || set.spanBack(s16, 5, SpanCondition.SIMPLE) != 0) { 130 errln("FAIL: UnicodeSet(" + pattern + ").spanBack(while longest match) returns the wrong value"); 131 } 132 } 133 134 // more complex test. -------------------------------------------------------- 135 136 // Make the strings in a UnicodeSet easily accessible. 137 private static class UnicodeSetWithStrings { 138 private UnicodeSet set; 139 private Collection<String> setStrings; 140 private int stringsLength; 141 142 public UnicodeSetWithStrings(final UnicodeSet normalSet) { 143 set = normalSet; 144 setStrings = normalSet.strings(); 145 stringsLength = setStrings.size(); 146 } 147 148 public final UnicodeSet getSet() { 149 return set; 150 } 151 152 public boolean hasStrings() { 153 return (stringsLength > 0); 154 } 155 156 public Iterable<String> strings() { 157 return setStrings; 158 } 159 } 160 161 // Compare 16-bit Unicode strings (which may be malformed UTF-16) 162 // at code point boundaries. 163 // That is, each edge of a match must not be in the middle of a surrogate pair. 164 static boolean matches16CPB(final String s, int start, int limit, final String t) { 165 limit -= start; 166 int length = t.length(); 167 return t.equals(s.substring(start, start + length)) 168 && !(0 < start && UTF16.isLeadSurrogate (s.charAt(start - 1)) && 169 UTF16.isTrailSurrogate(s.charAt(start))) 170 && !(length < limit && UTF16.isLeadSurrogate (s.charAt(start + length - 1)) && 171 UTF16.isTrailSurrogate(s.charAt(start + length))); 172 } 173 174 // Implement span() with contains() for comparison. 175 static int containsSpanUTF16(final UnicodeSetWithStrings set, final String s, 176 SpanCondition spanCondition) { 177 final UnicodeSet realSet = set.getSet(); 178 int length = s.length(); 179 if (!set.hasStrings()) { 180 boolean spanContained = false; 181 if (spanCondition != SpanCondition.NOT_CONTAINED) { 182 spanContained = true; // Pin to 0/1 values. 183 } 184 185 int c; 186 int start = 0, prev; 187 while ((prev = start) < length) { 188 c = s.codePointAt(start); 189 start = s.offsetByCodePoints(start, 1); 190 if (realSet.contains(c) != spanContained) { 191 break; 192 } 193 } 194 return prev; 195 } else if (spanCondition == SpanCondition.NOT_CONTAINED) { 196 int c; 197 int start, next; 198 for (start = next = 0; start < length;) { 199 c = s.codePointAt(next); 200 next = s.offsetByCodePoints(next, 1); 201 if (realSet.contains(c)) { 202 break; 203 } 204 for (String str : set.strings()) { 205 if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) { 206 // spanNeedsStrings=true; 207 return start; 208 } 209 } 210 start = next; 211 } 212 return start; 213 } else /* CONTAINED or SIMPLE */{ 214 int c; 215 int start, next, maxSpanLimit = 0; 216 for (start = next = 0; start < length;) { 217 c = s.codePointAt(next); 218 next = s.offsetByCodePoints(next, 1); 219 if (!realSet.contains(c)) { 220 next = start; // Do not span this single, not-contained code point. 221 } 222 for (String str : set.strings()) { 223 if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) { 224 // spanNeedsStrings=true; 225 int matchLimit = start + str.length(); 226 if (matchLimit == length) { 227 return length; 228 } 229 if (spanCondition == SpanCondition.CONTAINED) { 230 // Iterate for the shortest match at each position. 231 // Recurse for each but the shortest match. 232 if (next == start) { 233 next = matchLimit; // First match from start. 234 } else { 235 if (matchLimit < next) { 236 // Remember shortest match from start for iteration. 237 int temp = next; 238 next = matchLimit; 239 matchLimit = temp; 240 } 241 // Recurse for non-shortest match from start. 242 int spanLength = containsSpanUTF16(set, s.substring(matchLimit), 243 SpanCondition.CONTAINED); 244 if ((matchLimit + spanLength) > maxSpanLimit) { 245 maxSpanLimit = matchLimit + spanLength; 246 if (maxSpanLimit == length) { 247 return length; 248 } 249 } 250 } 251 } else /* spanCondition==SIMPLE */{ 252 if (matchLimit > next) { 253 // Remember longest match from start. 254 next = matchLimit; 255 } 256 } 257 } 258 } 259 if (next == start) { 260 break; // No match from start. 261 } 262 start = next; 263 } 264 if (start > maxSpanLimit) { 265 return start; 266 } else { 267 return maxSpanLimit; 268 } 269 } 270 } 271 272 static int containsSpanBackUTF16(final UnicodeSetWithStrings set, final String s, int length, 273 SpanCondition spanCondition) { 274 if (length == 0) { 275 return 0; 276 } 277 final UnicodeSet realSet = set.getSet(); 278 if (!set.hasStrings()) { 279 boolean spanContained = false; 280 if (spanCondition != SpanCondition.NOT_CONTAINED) { 281 spanContained = true; // Pin to 0/1 values. 282 } 283 284 int c; 285 int prev = length; 286 do { 287 c = s.codePointBefore(prev); 288 if (realSet.contains(c) != spanContained) { 289 break; 290 } 291 prev = s.offsetByCodePoints(prev, -1); 292 } while (prev > 0); 293 return prev; 294 } else if (spanCondition == SpanCondition.NOT_CONTAINED) { 295 int c; 296 int prev = length, length0 = length; 297 do { 298 c = s.codePointBefore(prev); 299 if (realSet.contains(c)) { 300 break; 301 } 302 for (String str : set.strings()) { 303 if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) { 304 // spanNeedsStrings=true; 305 return prev; 306 } 307 } 308 prev = s.offsetByCodePoints(prev, -1); 309 } while (prev > 0); 310 return prev; 311 } else /* SpanCondition.CONTAINED or SIMPLE */{ 312 int c; 313 int prev = length, minSpanStart = length, length0 = length; 314 do { 315 c = s.codePointBefore(length); 316 length = s.offsetByCodePoints(length, -1); 317 if (!realSet.contains(c)) { 318 length = prev; // Do not span this single, not-contained code point. 319 } 320 for (String str : set.strings()) { 321 if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) { 322 // spanNeedsStrings=true; 323 int matchStart = prev - str.length(); 324 if (matchStart == 0) { 325 return 0; 326 } 327 if (spanCondition == SpanCondition.CONTAINED) { 328 // Iterate for the shortest match at each position. 329 // Recurse for each but the shortest match. 330 if (length == prev) { 331 length = matchStart; // First match from prev. 332 } else { 333 if (matchStart > length) { 334 // Remember shortest match from prev for iteration. 335 int temp = length; 336 length = matchStart; 337 matchStart = temp; 338 } 339 // Recurse for non-shortest match from prev. 340 int spanStart = containsSpanBackUTF16(set, s, matchStart, 341 SpanCondition.CONTAINED); 342 if (spanStart < minSpanStart) { 343 minSpanStart = spanStart; 344 if (minSpanStart == 0) { 345 return 0; 346 } 347 } 348 } 349 } else /* spanCondition==SIMPLE */{ 350 if (matchStart < length) { 351 // Remember longest match from prev. 352 length = matchStart; 353 } 354 } 355 } 356 } 357 if (length == prev) { 358 break; // No match from prev. 359 } 360 } while ((prev = length) > 0); 361 if (prev < minSpanStart) { 362 return prev; 363 } else { 364 return minSpanStart; 365 } 366 } 367 } 368 369 // spans to be performed and compared 370 static final int SPAN_UTF16 = 1; 371 static final int SPAN_UTF8 = 2; 372 static final int SPAN_UTFS = 3; 373 374 static final int SPAN_SET = 4; 375 static final int SPAN_COMPLEMENT = 8; 376 static final int SPAN_POLARITY = 0xc; 377 378 static final int SPAN_FWD = 0x10; 379 static final int SPAN_BACK = 0x20; 380 static final int SPAN_DIRS = 0x30; 381 382 static final int SPAN_CONTAINED = 0x100; 383 static final int SPAN_SIMPLE = 0x200; 384 static final int SPAN_CONDITION = 0x300; 385 386 static final int SPAN_ALL = 0x33f; 387 388 static SpanCondition invertSpanCondition(SpanCondition spanCondition, SpanCondition contained) { 389 return spanCondition == SpanCondition.NOT_CONTAINED ? contained 390 : SpanCondition.NOT_CONTAINED; 391 } 392 393 /* 394 * Count spans on a string with the method according to type and set the span limits. The set may be the complement 395 * of the original. When using spanBack() and comparing with span(), use a span condition for the first spanBack() 396 * according to the expected number of spans. Sets typeName to an empty string if there is no such type. Returns -1 397 * if the span option is filtered out. 398 */ 399 static int getSpans(final UnicodeSetWithStrings set, boolean isComplement, final String s, 400 int whichSpans, int type, String[] typeName, int limits[], int limitsCapacity, 401 int expectCount) { 402 final UnicodeSet realSet = set.getSet(); 403 int start, count, i; 404 SpanCondition spanCondition, firstSpanCondition, contained; 405 boolean isForward; 406 407 int length = s.length(); 408 if (type < 0 || 7 < type) { 409 typeName[0] = null; 410 return 0; 411 } 412 413 final String typeNames16[] = { 414 "contains", 415 "contains(LM)", 416 "span", 417 "span(LM)", 418 "containsBack", 419 "containsBack(LM)", 420 "spanBack", 421 "spanBack(LM)" }; 422 423 typeName[0] = typeNames16[type]; 424 425 // filter span options 426 if (type <= 3) { 427 // span forward 428 if ((whichSpans & SPAN_FWD) == 0) { 429 return -1; 430 } 431 isForward = true; 432 } else { 433 // span backward 434 if ((whichSpans & SPAN_BACK) == 0) { 435 return -1; 436 } 437 isForward = false; 438 } 439 if ((type & 1) == 0) { 440 // use SpanCondition.CONTAINED 441 if ((whichSpans & SPAN_CONTAINED) == 0) { 442 return -1; 443 } 444 contained = SpanCondition.CONTAINED; 445 } else { 446 // use SIMPLE 447 if ((whichSpans & SPAN_SIMPLE) == 0) { 448 return -1; 449 } 450 contained = SpanCondition.SIMPLE; 451 } 452 453 // Default first span condition for going forward with an uncomplemented set. 454 spanCondition = SpanCondition.NOT_CONTAINED; 455 if (isComplement) { 456 spanCondition = invertSpanCondition(spanCondition, contained); 457 } 458 459 // First span condition for span(), used to terminate the spanBack() iteration. 460 firstSpanCondition = spanCondition; 461 462 // spanBack(): Its initial span condition is span()'s last span condition, 463 // which is the opposite of span()'s first span condition 464 // if we expect an even number of spans. 465 // (The loop inverts spanCondition (expectCount-1) times 466 // before the expectCount'th span() call.) 467 // If we do not compare forward and backward directions, then we do not have an 468 // expectCount and just start with firstSpanCondition. 469 if (!isForward && (whichSpans & SPAN_FWD) != 0 && (expectCount & 1) == 0) { 470 spanCondition = invertSpanCondition(spanCondition, contained); 471 } 472 473 count = 0; 474 switch (type) { 475 case 0: 476 case 1: 477 start = 0; 478 for (;;) { 479 start += containsSpanUTF16(set, s.substring(start), spanCondition); 480 if (count < limitsCapacity) { 481 limits[count] = start; 482 } 483 ++count; 484 if (start >= length) { 485 break; 486 } 487 spanCondition = invertSpanCondition(spanCondition, contained); 488 } 489 break; 490 case 2: 491 case 3: 492 start = 0; 493 for (;;) { 494 start = realSet.span(s, start, spanCondition); 495 if (count < limitsCapacity) { 496 limits[count] = start; 497 } 498 ++count; 499 if (start >= length) { 500 break; 501 } 502 spanCondition = invertSpanCondition(spanCondition, contained); 503 } 504 break; 505 case 4: 506 case 5: 507 for (;;) { 508 ++count; 509 if (count <= limitsCapacity) { 510 limits[limitsCapacity - count] = length; 511 } 512 length = containsSpanBackUTF16(set, s, length, spanCondition); 513 if (length == 0 && spanCondition == firstSpanCondition) { 514 break; 515 } 516 spanCondition = invertSpanCondition(spanCondition, contained); 517 } 518 if (count < limitsCapacity) { 519 for (i = count; i-- > 0;) { 520 limits[i] = limits[limitsCapacity - count + i]; 521 } 522 } 523 break; 524 case 6: 525 case 7: 526 for (;;) { 527 ++count; 528 if (count <= limitsCapacity) { 529 limits[limitsCapacity - count] = length >= 0 ? length : s.length(); 530 } 531 length = realSet.spanBack(s, length, spanCondition); 532 if (length == 0 && spanCondition == firstSpanCondition) { 533 break; 534 } 535 spanCondition = invertSpanCondition(spanCondition, contained); 536 } 537 if (count < limitsCapacity) { 538 for (i = count; i-- > 0;) { 539 limits[i] = limits[limitsCapacity - count + i]; 540 } 541 } 542 break; 543 default: 544 typeName = null; 545 return -1; 546 } 547 548 return count; 549 } 550 551 // sets to be tested; odd index=isComplement 552 static final int SLOW = 0; 553 static final int SLOW_NOT = 1; 554 static final int FAST = 2; 555 static final int FAST_NOT = 3; 556 static final int SET_COUNT = 4; 557 558 static final String setNames[] = { "slow", "slow.not", "fast", "fast.not" }; 559 560 /* 561 * Verify that we get the same results whether we look at text with contains(), span() or spanBack(), using unfrozen 562 * or frozen versions of the set, and using the set or its complement (switching the spanConditions accordingly). 563 * The latter verifies that set.span(spanCondition) == set.complement().span(!spanCondition). 564 * 565 * The expectLimits[] are either provided by the caller (with expectCount>=0) or returned to the caller (with an 566 * input expectCount<0). 567 */ 568 void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans, 569 int expectLimits[], int expectCount, 570 final String testName, int index) { 571 int[] limits = new int[500]; 572 int limitsCount; 573 int i, j; 574 String[] typeName = new String[1]; 575 int type; 576 577 for (i = 0; i < SET_COUNT; ++i) { 578 if ((i & 1) == 0) { 579 // Even-numbered sets are original, uncomplemented sets. 580 if ((whichSpans & SPAN_SET) == 0) { 581 continue; 582 } 583 } else { 584 // Odd-numbered sets are complemented. 585 if ((whichSpans & SPAN_COMPLEMENT) == 0) { 586 continue; 587 } 588 } 589 for (type = 0;; ++type) { 590 limitsCount = getSpans(sets[i], (0 != (i & 1)), s, whichSpans, type, typeName, limits, 591 limits.length, expectCount); 592 if (typeName[0] == null) { 593 break; // All types tried. 594 } 595 if (limitsCount < 0) { 596 continue; // Span option filtered out. 597 } 598 if (expectCount < 0) { 599 expectCount = limitsCount; 600 if (limitsCount > limits.length) { 601 errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d > %d capacity - too many spans", 602 testName, index, setNames[i], typeName[0], limitsCount, limits.length)); 603 return; 604 } 605 for (j = limitsCount; j-- > 0;) { 606 expectLimits[j] = limits[j]; 607 } 608 } else if (limitsCount != expectCount) { 609 errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d != %d", testName, index, setNames[i], 610 typeName[0], limitsCount, expectCount)); 611 } else { 612 for (j = 0; j < limitsCount; ++j) { 613 if (limits[j] != expectLimits[j]) { 614 errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d limits[%d]=%d != %d", testName, 615 index, setNames[i], typeName[0], limitsCount, j, limits[j], expectLimits[j])); 616 break; 617 } 618 } 619 } 620 } 621 } 622 623 // Compare span() with containsAll()/containsNone(), 624 // but only if we have expectLimits[] from the uncomplemented set. 625 if ((whichSpans & SPAN_SET) != 0) { 626 final String s16 = s; 627 String string; 628 int prev = 0, limit, len; 629 for (i = 0; i < expectCount; ++i) { 630 limit = expectLimits[i]; 631 len = limit - prev; 632 if (len > 0) { 633 string = s16.substring(prev, prev + len); // read-only alias 634 if (0 != (i & 1)) { 635 if (!sets[SLOW].getSet().containsAll(string)) { 636 errln(String.format("FAIL: %s[0x%x].%s.containsAll(%d..%d)==false contradicts span()", 637 testName, index, setNames[SLOW], prev, limit)); 638 return; 639 } 640 if (!sets[FAST].getSet().containsAll(string)) { 641 errln(String.format("FAIL: %s[0x%x].%s.containsAll(%d..%d)==false contradicts span()", 642 testName, index, setNames[FAST], prev, limit)); 643 return; 644 } 645 } else { 646 if (!sets[SLOW].getSet().containsNone(string)) { 647 errln(String.format("FAIL: %s[0x%x].%s.containsNone(%d..%d)==false contradicts span()", 648 testName, index, setNames[SLOW], prev, limit)); 649 return; 650 } 651 if (!sets[FAST].getSet().containsNone(string)) { 652 errln(String.format("FAIL: %s[0x%x].%s.containsNone(%d..%d)==false contradicts span()", 653 testName, index, setNames[FAST], prev, limit)); 654 return; 655 } 656 } 657 } 658 prev = limit; 659 } 660 } 661 } 662 663 // Specifically test either UTF-16 or UTF-8. 664 void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans, 665 final String testName, int index) { 666 int[] expectLimits = new int[500]; 667 int expectCount = -1; 668 verifySpan(sets, s, whichSpans, expectLimits, expectCount, testName, index); 669 } 670 671 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text, 672 // unless either UTF is turned off in whichSpans. 673 // Testing UTF-16 and UTF-8 together requires that surrogate code points 674 // have the same contains(c) value as U+FFFD. 675 void verifySpanBothUTFs(final UnicodeSetWithStrings sets[], final String s16, int whichSpans, 676 final String testName, int index) { 677 int[] expectLimits = new int[500]; 678 int expectCount; 679 680 expectCount = -1; // Get expectLimits[] from verifySpan(). 681 682 if ((whichSpans & SPAN_UTF16) != 0) { 683 verifySpan(sets, s16, whichSpans, expectLimits, expectCount, testName, index); 684 } 685 } 686 687 static int nextCodePoint(int c) { 688 // Skip some large and boring ranges. 689 switch (c) { 690 case 0x3441: 691 return 0x4d7f; 692 case 0x5100: 693 return 0x9f00; 694 case 0xb040: 695 return 0xd780; 696 case 0xe041: 697 return 0xf8fe; 698 case 0x10100: 699 return 0x20000; 700 case 0x20041: 701 return 0xe0000; 702 case 0xe0101: 703 return 0x10fffd; 704 default: 705 return c + 1; 706 } 707 } 708 709 // Verify that all implementations represent the same set. 710 void verifySpanContents(final UnicodeSetWithStrings sets[], int whichSpans, final String testName) { 711 StringBuffer s = new StringBuffer(); 712 int localWhichSpans; 713 int c, first; 714 for (first = c = 0;; c = nextCodePoint(c)) { 715 if (c > 0x10ffff || s.length() > 1024) { 716 localWhichSpans = whichSpans; 717 verifySpanBothUTFs(sets, s.toString(), localWhichSpans, testName, first); 718 if (c > 0x10ffff) { 719 break; 720 } 721 s.delete(0, s.length()); 722 first = c; 723 } 724 UTF16.append(s, c); 725 } 726 } 727 728 // Test with a particular, interesting string. 729 // Specify length and try NUL-termination. 730 static final char interestingStringChars[] = { 0x61, 0x62, 0x20, // Latin, space 731 0x3b1, 0x3b2, 0x3b3, // Greek 732 0xd900, // lead surrogate 733 0x3000, 0x30ab, 0x30ad, // wide space, Katakana 734 0xdc05, // trail surrogate 735 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul 736 0xd900, 0xdc05, // unassigned supplementary 737 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary 738 0xd7a4, 0xdc05, 0xd900, 0x2028 // unassigned, surrogates in wrong order, LS 739 }; 740 static String interestingString = new String(interestingStringChars); 741 static final String unicodeSet1 = "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]"; 742 743 @Test 744 public void TestInterestingStringSpan() { 745 UnicodeSet uset = new UnicodeSet(Utility.unescape(unicodeSet1)); 746 SpanCondition spanCondition = SpanCondition.NOT_CONTAINED; 747 int expect = 2; 748 int start = 14; 749 750 int c = 0xd840; 751 boolean contains = uset.contains(c); 752 if (false != contains) { 753 errln(String.format("FAIL: UnicodeSet(unicodeSet1).contains(%d) = true (expect false)", 754 c)); 755 } 756 757 UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset); 758 int len = containsSpanUTF16(set, interestingString.substring(start), spanCondition); 759 if (expect != len) { 760 errln(String.format("FAIL: containsSpanUTF16(unicodeSet1, \"%s(%d)\") = %d (expect %d)", 761 interestingString, start, len, expect)); 762 } 763 764 len = uset.span(interestingString, start, spanCondition) - start; 765 if (expect != len) { 766 errln(String.format("FAIL: UnicodeSet(unicodeSet1).span(\"%s\", %d) = %d (expect %d)", 767 interestingString, start, len, expect)); 768 } 769 } 770 771 void verifySpanUTF16String(final UnicodeSetWithStrings sets[], int whichSpans, final String testName) { 772 if ((whichSpans & SPAN_UTF16) == 0) { 773 return; 774 } 775 verifySpan(sets, interestingString, (whichSpans & ~SPAN_UTF8), testName, 1); 776 } 777 778 // Take a set of span options and multiply them so that 779 // each portion only has one of the options a, b and c. 780 // If b==0, then the set of options is just modified with mask and a. 781 // If b!=0 and c==0, then the set of options is just modified with mask, a and b. 782 static int addAlternative(int whichSpans[], int whichSpansCount, int mask, int a, int b, int c) { 783 int s; 784 int i; 785 786 for (i = 0; i < whichSpansCount; ++i) { 787 s = whichSpans[i] & mask; 788 whichSpans[i] = s | a; 789 if (b != 0) { 790 whichSpans[whichSpansCount + i] = s | b; 791 if (c != 0) { 792 whichSpans[2 * whichSpansCount + i] = s | c; 793 } 794 } 795 } 796 return b == 0 ? whichSpansCount : c == 0 ? 2 * whichSpansCount : 3 * whichSpansCount; 797 } 798 799 // They are not representable in UTF-8, and a leading trail surrogate 800 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair. 801 // U+20001 == \\uD840\\uDC01 802 // U+20400 == \\uD841\\uDC00 803 static final String patternWithUnpairedSurrogate = 804 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]"; 805 static final String stringWithUnpairedSurrogate = 806 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"; 807 808 static final String _63_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; 809 static final String _64_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; 810 static final String _63_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; 811 static final String _64_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; 812 static final String longPattern = 813 "[a{" + _64_a + _64_a + _64_a + _64_a + "b}" + "{a" + _64_b + _64_b + _64_b + _64_b + "}]"; 814 815 @Test 816 public void TestStringWithUnpairedSurrogateSpan() { 817 String string = Utility.unescape(stringWithUnpairedSurrogate); 818 UnicodeSet uset = new UnicodeSet(Utility.unescape(patternWithUnpairedSurrogate)); 819 SpanCondition spanCondition = SpanCondition.NOT_CONTAINED; 820 int start = 17; 821 int expect = 5; 822 823 UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset); 824 int len = containsSpanUTF16(set, string.substring(start), spanCondition); 825 if (expect != len) { 826 errln(String.format("FAIL: containsSpanUTF16(patternWithUnpairedSurrogate, \"%s(%d)\") = %d (expect %d)", 827 string, start, len, expect)); 828 } 829 830 len = uset.span(string, start, spanCondition) - start; 831 if (expect != len) { 832 errln(String.format("FAIL: UnicodeSet(patternWithUnpairedSurrogate).span(\"%s\", %d) = %d (expect %d)", 833 string, start, len, expect)); 834 } 835 } 836 837 @Test 838 public void TestSpan() { 839 // "[...]" is a UnicodeSet pattern. 840 // "*" performs tests on all Unicode code points and on a selection of 841 // malformed UTF-8/16 strings. 842 // "-options" limits the scope of testing for the current set. 843 // By default, the test verifies that equivalent boundaries are found 844 // for UTF-16 and UTF-8, going forward and backward, 845 // alternating NOT_CONTAINED with 846 // either CONTAINED or SIMPLE. 847 // Single-character options: 848 // 8 -- UTF-16 and UTF-8 boundaries may differ. 849 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates), 850 // or the set contains strings with unpaired surrogates 851 // which do not translate to valid UTF-8. 852 // c -- set.span() and set.complement().span() boundaries may differ. 853 // Cause: Set strings are not complemented. 854 // b -- span() and spanBack() boundaries may differ. 855 // Cause: Strings in the set overlap, and spanBack(CONTAINED) 856 // and spanBack(SIMPLE) are defined to 857 // match with non-overlapping substrings. 858 // For example, with a set containing "ab" and "ba", 859 // span() of "aba" yields boundaries { 0, 2, 3 } 860 // because the initial "ab" matches from 0 to 2, 861 // while spanBack() yields boundaries { 0, 1, 3 } 862 // because the final "ba" matches from 1 to 3. 863 // l -- CONTAINED and SIMPLE boundaries may differ. 864 // Cause: Strings in the set overlap, and a longer match may 865 // require a sequence including non-longest substrings. 866 // For example, with a set containing "ab", "abc" and "cd", 867 // span(contained) of "abcd" spans the entire string 868 // but span(longest match) only spans the first 3 characters. 869 // Each "-options" first resets all options and then applies the specified options. 870 // A "-" without options resets the options. 871 // The options are also reset for each new set. 872 // Other strings will be spanned. 873 final String testdata[] = { 874 "[:ID_Continue:]", 875 "*", 876 "[:White_Space:]", 877 "*", 878 "[]", 879 "*", 880 "[\\u0000-\\U0010FFFF]", 881 "*", 882 "[\\u0000\\u0080\\u0800\\U00010000]", 883 "*", 884 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]", 885 "*", 886 unicodeSet1, 887 "-c", 888 "*", 889 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]", 890 "-c", 891 "*", 892 893 // Overlapping strings cause overlapping attempts to match. 894 "[x{xy}{xya}{axy}{ax}]", 895 "-cl", 896 897 // More repetitions of "xya" would take too long with the recursive 898 // reference implementation. 899 // containsAll()=false 900 // test_string 0x14 901 "xx" + "xyaxyaxyaxya" + // set.complement().span(longest match) will stop here. 902 "xx" + // set.complement().span(contained) will stop between the two 'x'es. 903 "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxya" + // span() ends here. 904 "aaa", 905 906 // containsAll()=true 907 // test_string 0x15 908 "xx" + "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxy", 909 910 "-bc", 911 // test_string 0x17 912 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 } 913 "-c", 914 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 } 915 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 } 916 "-", 917 "byaya", // span() -> { 5 } 918 "byay", // span() -> { 4 } 919 "bya", // span() -> { 3 } 920 921 // span(longest match) will not span the whole string. 922 "[a{ab}{bc}]", 923 "-cl", 924 // test_string 0x21 925 "abc", 926 927 "[a{ab}{abc}{cd}]", 928 "-cl", 929 "acdabcdabccd", 930 931 // spanBack(longest match) will not span the whole string. 932 "[c{ab}{bc}]", 933 "-cl", 934 "abc", 935 936 "[d{cd}{bcd}{ab}]", 937 "-cl", 938 "abbcdabcdabd", 939 940 // Test with non-ASCII set strings - test proper handling of surrogate pairs 941 // and UTF-8 trail bytes. 942 // Copies of above test sets and strings, but transliterated to have 943 // different code points with similar trail units. 944 // Previous: a b c d 945 // Unicode: 042B 30AB 200AB 204AB 946 // UTF-16: 042B 30AB D840 DCAB D841 DCAB 947 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB 948 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]", 949 "-cl", 950 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB", 951 952 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]", 953 "-cl", 954 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB", 955 956 // Stress bookkeeping and recursion. 957 // The following strings are barely doable with the recursive 958 // reference implementation. 959 // The not-contained character at the end prevents an early exit from the span(). 960 "[b{bb}]", 961 "-c", 962 // test_string 0x33 963 "bbbbbbbbbbbbbbbbbbbbbbbb-", 964 // On complement sets, span() and spanBack() get different results 965 // because b is not in the complement set and there is an odd number of b's 966 // in the test string. 967 "-bc", 968 "bbbbbbbbbbbbbbbbbbbbbbbbb-", 969 970 // Test with set strings with an initial or final code point span 971 // longer than 254. 972 longPattern, 973 "-c", 974 _64_a + _64_a + _64_a + _63_a + "b", 975 _64_a + _64_a + _64_a + _64_a + "b", 976 _64_a + _64_a + _64_a + _64_a + "aaaabbbb", 977 "a" + _64_b + _64_b + _64_b + _63_b, 978 "a" + _64_b + _64_b + _64_b + _64_b, 979 "aaaabbbb" + _64_b + _64_b + _64_b + _64_b, 980 981 // Test with strings containing unpaired surrogates. 982 patternWithUnpairedSurrogate, "-8cl", 983 stringWithUnpairedSurrogate }; 984 int i, j; 985 int whichSpansCount = 1; 986 int[] whichSpans = new int[96]; 987 for (i = whichSpans.length; i-- > 0;) { 988 whichSpans[i] = SPAN_ALL; 989 } 990 991 UnicodeSet[] sets = new UnicodeSet[SET_COUNT]; 992 UnicodeSetWithStrings[] sets_with_str = new UnicodeSetWithStrings[SET_COUNT]; 993 994 String testName = null; 995 @SuppressWarnings("unused") 996 String testNameLimit; 997 998 for (i = 0; i < testdata.length; ++i) { 999 final String s = testdata[i]; 1000 if (s.charAt(0) == '[') { 1001 // Create new test sets from this pattern. 1002 for (j = 0; j < SET_COUNT; ++j) { 1003 sets_with_str[j] = null; 1004 sets[j] = null; 1005 } 1006 sets[SLOW] = new UnicodeSet(Utility.unescape(s)); 1007 sets[SLOW_NOT] = new UnicodeSet(sets[SLOW]); 1008 sets[SLOW_NOT].complement(); 1009 // Intermediate set: Test cloning of a frozen set. 1010 UnicodeSet fast = new UnicodeSet(sets[SLOW]); 1011 fast.freeze(); 1012 sets[FAST] = (UnicodeSet) fast.clone(); 1013 fast = null; 1014 UnicodeSet fastNot = new UnicodeSet(sets[SLOW_NOT]); 1015 fastNot.freeze(); 1016 sets[FAST_NOT] = (UnicodeSet) fastNot.clone(); 1017 fastNot = null; 1018 1019 for (j = 0; j < SET_COUNT; ++j) { 1020 sets_with_str[j] = new UnicodeSetWithStrings(sets[j]); 1021 } 1022 1023 testName = s + ':'; 1024 whichSpans[0] = SPAN_ALL; 1025 whichSpansCount = 1; 1026 } else if (s.charAt(0) == '-') { 1027 whichSpans[0] = SPAN_ALL; 1028 whichSpansCount = 1; 1029 1030 for (j = 1; j < s.length(); j++) { 1031 switch (s.charAt(j)) { 1032 case 'c': 1033 whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_POLARITY, SPAN_SET, 1034 SPAN_COMPLEMENT, 0); 1035 break; 1036 case 'b': 1037 whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_DIRS, SPAN_FWD, SPAN_BACK, 1038 0); 1039 break; 1040 case 'l': 1041 // test CONTAINED FWD & BACK, and separately 1042 // SIMPLE only FWD, and separately 1043 // SIMPLE only BACK 1044 whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~(SPAN_DIRS | SPAN_CONDITION), 1045 SPAN_DIRS | SPAN_CONTAINED, SPAN_FWD | SPAN_SIMPLE, SPAN_BACK | SPAN_SIMPLE); 1046 break; 1047 case '8': 1048 whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_UTFS, SPAN_UTF16, 1049 SPAN_UTF8, 0); 1050 break; 1051 default: 1052 errln(String.format("FAIL: unrecognized span set option in \"%s\"", testdata[i])); 1053 break; 1054 } 1055 } 1056 } else if (s.equals("*")) { 1057 testNameLimit = "bad_string"; 1058 for (j = 0; j < whichSpansCount; ++j) { 1059 if (whichSpansCount > 1) { 1060 testNameLimit += String.format("%%0x%3x", whichSpans[j]); 1061 } 1062 verifySpanUTF16String(sets_with_str, whichSpans[j], testName); 1063 } 1064 1065 testNameLimit = "contents"; 1066 for (j = 0; j < whichSpansCount; ++j) { 1067 if (whichSpansCount > 1) { 1068 testNameLimit += String.format("%%0x%3x", whichSpans[j]); 1069 } 1070 verifySpanContents(sets_with_str, whichSpans[j], testName); 1071 } 1072 } else { 1073 String string = Utility.unescape(s); 1074 testNameLimit = "test_string"; 1075 for (j = 0; j < whichSpansCount; ++j) { 1076 if (whichSpansCount > 1) { 1077 testNameLimit += String.format("%%0x%3x", whichSpans[j]); 1078 } 1079 verifySpanBothUTFs(sets_with_str, string, whichSpans[j], testName, i); 1080 } 1081 } 1082 } 1083 } 1084 1085 @Test 1086 public void TestSpanAndCount() { 1087 // a set with no strings 1088 UnicodeSet abc = new UnicodeSet('a', 'c'); 1089 // a set with an "irrelevant" string (fully contained in the code point set) 1090 UnicodeSet crlf = new UnicodeSet().add('\n').add('\r').add("\r\n"); 1091 // a set with no "irrelevant" string but some interesting overlaps 1092 UnicodeSet ab_cd = new UnicodeSet().add('a').add("ab").add("abc").add("cd"); 1093 String s = "ab\n\r\r\n" + UTF16.valueOf(0x50000) + "abcde"; 1094 OutputInt count = new OutputInt(); 1095 assertEquals("abc span[8, 11[", 11, 1096 abc.spanAndCount(s, 8, SpanCondition.SIMPLE, count)); 1097 assertEquals("abc count=3", 3, count.value); 1098 assertEquals("no abc span[2, 8[", 8, 1099 abc.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count)); 1100 assertEquals("no abc count=5", 5, count.value); 1101 assertEquals("line endings span[2, 6[", 6, 1102 crlf.spanAndCount(s, 2, SpanCondition.CONTAINED, count)); 1103 assertEquals("line endings count=3", 3, count.value); 1104 assertEquals("no ab+cd span[2, 8[", 8, 1105 ab_cd.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count)); 1106 assertEquals("no ab+cd count=5", 5, count.value); 1107 assertEquals("ab+cd span[8, 12[", 12, 1108 ab_cd.spanAndCount(s, 8, SpanCondition.CONTAINED, count)); 1109 assertEquals("ab+cd count=2", 2, count.value); 1110 assertEquals("1x abc span[8, 11[", 11, 1111 ab_cd.spanAndCount(s, 8, SpanCondition.SIMPLE, count)); 1112 assertEquals("1x abc count=1", 1, count.value); 1113 1114 abc.freeze(); 1115 crlf.freeze(); 1116 ab_cd.freeze(); 1117 assertEquals("abc span[8, 11[ (frozen)", 11, 1118 abc.spanAndCount(s, 8, SpanCondition.SIMPLE, count)); 1119 assertEquals("abc count=3 (frozen)", 3, count.value); 1120 assertEquals("no abc span[2, 8[ (frozen)", 8, 1121 abc.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count)); 1122 assertEquals("no abc count=5 (frozen)", 5, count.value); 1123 assertEquals("line endings span[2, 6[ (frozen)", 6, 1124 crlf.spanAndCount(s, 2, SpanCondition.CONTAINED, count)); 1125 assertEquals("line endings count=3 (frozen)", 3, count.value); 1126 assertEquals("no ab+cd span[2, 8[ (frozen)", 8, 1127 ab_cd.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count)); 1128 assertEquals("no ab+cd count=5 (frozen)", 5, count.value); 1129 assertEquals("ab+cd span[8, 12[ (frozen)", 12, 1130 ab_cd.spanAndCount(s, 8, SpanCondition.CONTAINED, count)); 1131 assertEquals("ab+cd count=2 (frozen)", 2, count.value); 1132 assertEquals("1x abc span[8, 11[ (frozen)", 11, 1133 ab_cd.spanAndCount(s, 8, SpanCondition.SIMPLE, count)); 1134 assertEquals("1x abc count=1 (frozen)", 1, count.value); 1135 } 1136} 1137