1/* 2 * Copyright (C) 2008 The Guava Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.google.common.base; 18 19import static com.google.common.base.Preconditions.checkArgument; 20import static com.google.common.base.Preconditions.checkNotNull; 21 22import com.google.common.annotations.Beta; 23import com.google.common.annotations.GwtCompatible; 24import com.google.common.annotations.GwtIncompatible; 25 26import java.util.Arrays; 27import java.util.BitSet; 28 29import javax.annotation.CheckReturnValue; 30 31/** 32 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 33 * for any {@link Object}. Also offers basic text processing methods based on this function. 34 * Implementations are strongly encouraged to be side-effect-free and immutable. 35 * 36 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 37 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}". 38 * 39 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand 40 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical 41 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher} 42 * treats these just as two separate characters. 43 * 44 * <p>Example usages: <pre> 45 * String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput); 46 * if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 47 * 48 * <p>See the Guava User Guide article on <a href= 49 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher"> 50 * {@code CharMatcher}</a>. 51 * 52 * @author Kevin Bourrillion 53 * @since 1.0 54 */ 55@Beta // Possibly change from chars to code points; decide constants vs. methods 56@GwtCompatible(emulated = true) 57public abstract class CharMatcher implements Predicate<Character> { 58 59 // Constants 60 /** 61 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 62 * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a 63 * discussion of that term. 64 * 65 * @since 2.0 66 */ 67 public static final CharMatcher BREAKING_WHITESPACE = new CharMatcher() { 68 @Override 69 public boolean matches(char c) { 70 switch (c) { 71 case '\t': 72 case '\n': 73 case '\013': 74 case '\f': 75 case '\r': 76 case ' ': 77 case '\u0085': 78 case '\u1680': 79 case '\u2028': 80 case '\u2029': 81 case '\u205f': 82 case '\u3000': 83 return true; 84 case '\u2007': 85 return false; 86 default: 87 return c >= '\u2000' && c <= '\u200a'; 88 } 89 } 90 91 @Override 92 public String toString() { 93 return "CharMatcher.BREAKING_WHITESPACE"; 94 } 95 }; 96 97 /** 98 * Determines whether a character is ASCII, meaning that its code point is less than 128. 99 */ 100 public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII"); 101 102 private static class RangesMatcher extends CharMatcher { 103 private final char[] rangeStarts; 104 private final char[] rangeEnds; 105 106 RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { 107 super(description); 108 this.rangeStarts = rangeStarts; 109 this.rangeEnds = rangeEnds; 110 checkArgument(rangeStarts.length == rangeEnds.length); 111 for (int i = 0; i < rangeStarts.length; i++) { 112 checkArgument(rangeStarts[i] <= rangeEnds[i]); 113 if (i + 1 < rangeStarts.length) { 114 checkArgument(rangeEnds[i] < rangeStarts[i + 1]); 115 } 116 } 117 } 118 119 @Override 120 public boolean matches(char c) { 121 int index = Arrays.binarySearch(rangeStarts, c); 122 if (index >= 0) { 123 return true; 124 } else { 125 index = ~index - 1; 126 return index >= 0 && c <= rangeEnds[index]; 127 } 128 } 129 } 130 131 // Must be in ascending order. 132 private static final String ZEROES = "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6" 133 + "\u0c66\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1b50\u1bb0" 134 + "\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10"; 135 136 private static final String NINES; 137 static { 138 StringBuilder builder = new StringBuilder(ZEROES.length()); 139 for (int i = 0; i < ZEROES.length(); i++) { 140 builder.append((char) (ZEROES.charAt(i) + 9)); 141 } 142 NINES = builder.toString(); 143 } 144 145 /** 146 * Determines whether a character is a digit according to 147 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. 148 * If you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 149 */ 150 public static final CharMatcher DIGIT = new RangesMatcher( 151 "CharMatcher.DIGIT", ZEROES.toCharArray(), NINES.toCharArray()); 152 153 /** 154 * Determines whether a character is a digit according to {@linkplain Character#isDigit(char) 155 * Java's definition}. If you only care to match ASCII digits, you can use {@code 156 * inRange('0', '9')}. 157 */ 158 public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") { 159 @Override public boolean matches(char c) { 160 return Character.isDigit(c); 161 } 162 }; 163 164 /** 165 * Determines whether a character is a letter according to {@linkplain Character#isLetter(char) 166 * Java's definition}. If you only care to match letters of the Latin alphabet, you can use {@code 167 * inRange('a', 'z').or(inRange('A', 'Z'))}. 168 */ 169 public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") { 170 @Override public boolean matches(char c) { 171 return Character.isLetter(c); 172 } 173 }; 174 175 /** 176 * Determines whether a character is a letter or digit according to {@linkplain 177 * Character#isLetterOrDigit(char) Java's definition}. 178 */ 179 public static final CharMatcher JAVA_LETTER_OR_DIGIT = 180 new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") { 181 @Override public boolean matches(char c) { 182 return Character.isLetterOrDigit(c); 183 } 184 }; 185 186 /** 187 * Determines whether a character is upper case according to {@linkplain 188 * Character#isUpperCase(char) Java's definition}. 189 */ 190 public static final CharMatcher JAVA_UPPER_CASE = 191 new CharMatcher("CharMatcher.JAVA_UPPER_CASE") { 192 @Override public boolean matches(char c) { 193 return Character.isUpperCase(c); 194 } 195 }; 196 197 /** 198 * Determines whether a character is lower case according to {@linkplain 199 * Character#isLowerCase(char) Java's definition}. 200 */ 201 public static final CharMatcher JAVA_LOWER_CASE = 202 new CharMatcher("CharMatcher.JAVA_LOWER_CASE") { 203 @Override public boolean matches(char c) { 204 return Character.isLowerCase(c); 205 } 206 }; 207 208 /** 209 * Determines whether a character is an ISO control character as specified by {@link 210 * Character#isISOControl(char)}. 211 */ 212 public static final CharMatcher JAVA_ISO_CONTROL = 213 inRange('\u0000', '\u001f') 214 .or(inRange('\u007f', '\u009f')) 215 .withToString("CharMatcher.JAVA_ISO_CONTROL"); 216 217 /** 218 * Determines whether a character is invisible; that is, if its Unicode category is any of 219 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 220 * PRIVATE_USE according to ICU4J. 221 */ 222 public static final CharMatcher INVISIBLE = new RangesMatcher("CharMatcher.INVISIBLE", ( 223 "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u1680\u180e\u2000\u2028\u205f\u2066\u2067\u2068" 224 + "\u2069\u206a\u3000\ud800\ufeff\ufff9\ufffa").toCharArray(), ( 225 "\u0020\u00a0\u00ad\u0604\u061c\u06dd\u070f\u1680\u180e\u200f\u202f\u2064\u2066\u2067\u2068" 226 + "\u2069\u206f\u3000\uf8ff\ufeff\ufff9\ufffb").toCharArray()); 227 228 private static String showCharacter(char c) { 229 String hex = "0123456789ABCDEF"; 230 char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; 231 for (int i = 0; i < 4; i++) { 232 tmp[5 - i] = hex.charAt(c & 0xF); 233 c >>= 4; 234 } 235 return String.copyValueOf(tmp); 236 237 } 238 239 /** 240 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 241 * errs on the side of returning {@code false} (that is, it tends to assume a character is 242 * double-width). 243 * 244 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to 245 * date. 246 */ 247 public static final CharMatcher SINGLE_WIDTH = new RangesMatcher("CharMatcher.SINGLE_WIDTH", 248 "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), 249 "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); 250 251 /** Matches any character. */ 252 public static final CharMatcher ANY = 253 new FastMatcher("CharMatcher.ANY") { 254 @Override public boolean matches(char c) { 255 return true; 256 } 257 258 @Override public int indexIn(CharSequence sequence) { 259 return (sequence.length() == 0) ? -1 : 0; 260 } 261 262 @Override public int indexIn(CharSequence sequence, int start) { 263 int length = sequence.length(); 264 Preconditions.checkPositionIndex(start, length); 265 return (start == length) ? -1 : start; 266 } 267 268 @Override public int lastIndexIn(CharSequence sequence) { 269 return sequence.length() - 1; 270 } 271 272 @Override public boolean matchesAllOf(CharSequence sequence) { 273 checkNotNull(sequence); 274 return true; 275 } 276 277 @Override public boolean matchesNoneOf(CharSequence sequence) { 278 return sequence.length() == 0; 279 } 280 281 @Override public String removeFrom(CharSequence sequence) { 282 checkNotNull(sequence); 283 return ""; 284 } 285 286 @Override public String replaceFrom(CharSequence sequence, char replacement) { 287 char[] array = new char[sequence.length()]; 288 Arrays.fill(array, replacement); 289 return new String(array); 290 } 291 292 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 293 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length()); 294 for (int i = 0; i < sequence.length(); i++) { 295 retval.append(replacement); 296 } 297 return retval.toString(); 298 } 299 300 @Override public String collapseFrom(CharSequence sequence, char replacement) { 301 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 302 } 303 304 @Override public String trimFrom(CharSequence sequence) { 305 checkNotNull(sequence); 306 return ""; 307 } 308 309 @Override public int countIn(CharSequence sequence) { 310 return sequence.length(); 311 } 312 313 @Override public CharMatcher and(CharMatcher other) { 314 return checkNotNull(other); 315 } 316 317 @Override public CharMatcher or(CharMatcher other) { 318 checkNotNull(other); 319 return this; 320 } 321 322 @Override public CharMatcher negate() { 323 return NONE; 324 } 325 }; 326 327 /** Matches no characters. */ 328 public static final CharMatcher NONE = 329 new FastMatcher("CharMatcher.NONE") { 330 @Override public boolean matches(char c) { 331 return false; 332 } 333 334 @Override public int indexIn(CharSequence sequence) { 335 checkNotNull(sequence); 336 return -1; 337 } 338 339 @Override public int indexIn(CharSequence sequence, int start) { 340 int length = sequence.length(); 341 Preconditions.checkPositionIndex(start, length); 342 return -1; 343 } 344 345 @Override public int lastIndexIn(CharSequence sequence) { 346 checkNotNull(sequence); 347 return -1; 348 } 349 350 @Override public boolean matchesAllOf(CharSequence sequence) { 351 return sequence.length() == 0; 352 } 353 354 @Override public boolean matchesNoneOf(CharSequence sequence) { 355 checkNotNull(sequence); 356 return true; 357 } 358 359 @Override public String removeFrom(CharSequence sequence) { 360 return sequence.toString(); 361 } 362 363 @Override public String replaceFrom(CharSequence sequence, char replacement) { 364 return sequence.toString(); 365 } 366 367 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 368 checkNotNull(replacement); 369 return sequence.toString(); 370 } 371 372 @Override public String collapseFrom(CharSequence sequence, char replacement) { 373 return sequence.toString(); 374 } 375 376 @Override public String trimFrom(CharSequence sequence) { 377 return sequence.toString(); 378 } 379 380 @Override 381 public String trimLeadingFrom(CharSequence sequence) { 382 return sequence.toString(); 383 } 384 385 @Override 386 public String trimTrailingFrom(CharSequence sequence) { 387 return sequence.toString(); 388 } 389 390 @Override public int countIn(CharSequence sequence) { 391 checkNotNull(sequence); 392 return 0; 393 } 394 395 @Override public CharMatcher and(CharMatcher other) { 396 checkNotNull(other); 397 return this; 398 } 399 400 @Override public CharMatcher or(CharMatcher other) { 401 return checkNotNull(other); 402 } 403 404 @Override public CharMatcher negate() { 405 return ANY; 406 } 407 }; 408 409 // Static factories 410 411 /** 412 * Returns a {@code char} matcher that matches only one specified character. 413 */ 414 public static CharMatcher is(final char match) { 415 String description = "CharMatcher.is('" + showCharacter(match) + "')"; 416 return new FastMatcher(description) { 417 @Override public boolean matches(char c) { 418 return c == match; 419 } 420 421 @Override public String replaceFrom(CharSequence sequence, char replacement) { 422 return sequence.toString().replace(match, replacement); 423 } 424 425 @Override public CharMatcher and(CharMatcher other) { 426 return other.matches(match) ? this : NONE; 427 } 428 429 @Override public CharMatcher or(CharMatcher other) { 430 return other.matches(match) ? other : super.or(other); 431 } 432 433 @Override public CharMatcher negate() { 434 return isNot(match); 435 } 436 437 @GwtIncompatible("java.util.BitSet") 438 @Override 439 void setBits(BitSet table) { 440 table.set(match); 441 } 442 }; 443 } 444 445 /** 446 * Returns a {@code char} matcher that matches any character except the one specified. 447 * 448 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 449 */ 450 public static CharMatcher isNot(final char match) { 451 String description = "CharMatcher.isNot('" + showCharacter(match) + "')"; 452 return new FastMatcher(description) { 453 @Override public boolean matches(char c) { 454 return c != match; 455 } 456 457 @Override public CharMatcher and(CharMatcher other) { 458 return other.matches(match) ? super.and(other) : other; 459 } 460 461 @Override public CharMatcher or(CharMatcher other) { 462 return other.matches(match) ? ANY : this; 463 } 464 465 @GwtIncompatible("java.util.BitSet") 466 @Override 467 void setBits(BitSet table) { 468 table.set(0, match); 469 table.set(match + 1, Character.MAX_VALUE + 1); 470 } 471 472 @Override public CharMatcher negate() { 473 return is(match); 474 } 475 }; 476 } 477 478 /** 479 * Returns a {@code char} matcher that matches any character present in the given character 480 * sequence. 481 */ 482 public static CharMatcher anyOf(final CharSequence sequence) { 483 switch (sequence.length()) { 484 case 0: 485 return NONE; 486 case 1: 487 return is(sequence.charAt(0)); 488 case 2: 489 return isEither(sequence.charAt(0), sequence.charAt(1)); 490 default: 491 // continue below to handle the general case 492 } 493 // TODO(user): is it potentially worth just going ahead and building a precomputed matcher? 494 final char[] chars = sequence.toString().toCharArray(); 495 Arrays.sort(chars); 496 StringBuilder description = new StringBuilder("CharMatcher.anyOf(\""); 497 for (char c : chars) { 498 description.append(showCharacter(c)); 499 } 500 description.append("\")"); 501 return new CharMatcher(description.toString()) { 502 @Override public boolean matches(char c) { 503 return Arrays.binarySearch(chars, c) >= 0; 504 } 505 506 @Override 507 @GwtIncompatible("java.util.BitSet") 508 void setBits(BitSet table) { 509 for (char c : chars) { 510 table.set(c); 511 } 512 } 513 }; 514 } 515 516 private static CharMatcher isEither( 517 final char match1, 518 final char match2) { 519 String description = "CharMatcher.anyOf(\"" + 520 showCharacter(match1) + showCharacter(match2) + "\")"; 521 return new FastMatcher(description) { 522 @Override public boolean matches(char c) { 523 return c == match1 || c == match2; 524 } 525 526 @GwtIncompatible("java.util.BitSet") 527 @Override void setBits(BitSet table) { 528 table.set(match1); 529 table.set(match2); 530 } 531 }; 532 } 533 534 /** 535 * Returns a {@code char} matcher that matches any character not present in the given character 536 * sequence. 537 */ 538 public static CharMatcher noneOf(CharSequence sequence) { 539 return anyOf(sequence).negate(); 540 } 541 542 /** 543 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are 544 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 545 * CharMatcher.inRange('a', 'z')}. 546 * 547 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 548 */ 549 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 550 checkArgument(endInclusive >= startInclusive); 551 String description = "CharMatcher.inRange('" + 552 showCharacter(startInclusive) + "', '" + 553 showCharacter(endInclusive) + "')"; 554 return inRange(startInclusive, endInclusive, description); 555 } 556 557 static CharMatcher inRange(final char startInclusive, final char endInclusive, 558 String description) { 559 return new FastMatcher(description) { 560 @Override public boolean matches(char c) { 561 return startInclusive <= c && c <= endInclusive; 562 } 563 564 @GwtIncompatible("java.util.BitSet") 565 @Override void setBits(BitSet table) { 566 table.set(startInclusive, endInclusive + 1); 567 } 568 }; 569 } 570 571 /** 572 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 573 * which operates on primitive {@code char} instances instead. 574 */ 575 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 576 checkNotNull(predicate); 577 if (predicate instanceof CharMatcher) { 578 return (CharMatcher) predicate; 579 } 580 String description = "CharMatcher.forPredicate(" + predicate + ")"; 581 return new CharMatcher(description) { 582 @Override public boolean matches(char c) { 583 return predicate.apply(c); 584 } 585 586 @Override public boolean apply(Character character) { 587 return predicate.apply(checkNotNull(character)); 588 } 589 }; 590 } 591 592 // State 593 final String description; 594 595 // Constructors 596 597 /** 598 * Sets the {@code toString()} from the given description. 599 */ 600 CharMatcher(String description) { 601 this.description = description; 602 } 603 604 /** 605 * Constructor for use by subclasses. When subclassing, you may want to override 606 * {@code toString()} to provide a useful description. 607 */ 608 protected CharMatcher() { 609 description = super.toString(); 610 } 611 612 // Abstract methods 613 614 /** Determines a true or false value for the given character. */ 615 public abstract boolean matches(char c); 616 617 // Non-static factories 618 619 /** 620 * Returns a matcher that matches any character not matched by this matcher. 621 */ 622 public CharMatcher negate() { 623 return new NegatedMatcher(this); 624 } 625 626 private static class NegatedMatcher extends CharMatcher { 627 final CharMatcher original; 628 629 NegatedMatcher(String toString, CharMatcher original) { 630 super(toString); 631 this.original = original; 632 } 633 634 NegatedMatcher(CharMatcher original) { 635 this(original + ".negate()", original); 636 } 637 638 @Override public boolean matches(char c) { 639 return !original.matches(c); 640 } 641 642 @Override public boolean matchesAllOf(CharSequence sequence) { 643 return original.matchesNoneOf(sequence); 644 } 645 646 @Override public boolean matchesNoneOf(CharSequence sequence) { 647 return original.matchesAllOf(sequence); 648 } 649 650 @Override public int countIn(CharSequence sequence) { 651 return sequence.length() - original.countIn(sequence); 652 } 653 654 @GwtIncompatible("java.util.BitSet") 655 @Override 656 void setBits(BitSet table) { 657 BitSet tmp = new BitSet(); 658 original.setBits(tmp); 659 tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 660 table.or(tmp); 661 } 662 663 @Override public CharMatcher negate() { 664 return original; 665 } 666 667 @Override 668 CharMatcher withToString(String description) { 669 return new NegatedMatcher(description, original); 670 } 671 } 672 673 /** 674 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 675 */ 676 public CharMatcher and(CharMatcher other) { 677 return new And(this, checkNotNull(other)); 678 } 679 680 private static class And extends CharMatcher { 681 final CharMatcher first; 682 final CharMatcher second; 683 684 And(CharMatcher a, CharMatcher b) { 685 this(a, b, "CharMatcher.and(" + a + ", " + b + ")"); 686 } 687 688 And(CharMatcher a, CharMatcher b, String description) { 689 super(description); 690 first = checkNotNull(a); 691 second = checkNotNull(b); 692 } 693 694 @Override 695 public boolean matches(char c) { 696 return first.matches(c) && second.matches(c); 697 } 698 699 @GwtIncompatible("java.util.BitSet") 700 @Override 701 void setBits(BitSet table) { 702 BitSet tmp1 = new BitSet(); 703 first.setBits(tmp1); 704 BitSet tmp2 = new BitSet(); 705 second.setBits(tmp2); 706 tmp1.and(tmp2); 707 table.or(tmp1); 708 } 709 710 @Override 711 CharMatcher withToString(String description) { 712 return new And(first, second, description); 713 } 714 } 715 716 /** 717 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 718 */ 719 public CharMatcher or(CharMatcher other) { 720 return new Or(this, checkNotNull(other)); 721 } 722 723 private static class Or extends CharMatcher { 724 final CharMatcher first; 725 final CharMatcher second; 726 727 Or(CharMatcher a, CharMatcher b, String description) { 728 super(description); 729 first = checkNotNull(a); 730 second = checkNotNull(b); 731 } 732 733 Or(CharMatcher a, CharMatcher b) { 734 this(a, b, "CharMatcher.or(" + a + ", " + b + ")"); 735 } 736 737 @GwtIncompatible("java.util.BitSet") 738 @Override 739 void setBits(BitSet table) { 740 first.setBits(table); 741 second.setBits(table); 742 } 743 744 @Override 745 public boolean matches(char c) { 746 return first.matches(c) || second.matches(c); 747 } 748 749 @Override 750 CharMatcher withToString(String description) { 751 return new Or(first, second, description); 752 } 753 } 754 755 /** 756 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 757 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 758 * worthwhile only if the precomputed matcher is queried many thousands of times. 759 * 760 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 761 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 762 * worthwhile tradeoff in a browser. 763 */ 764 public CharMatcher precomputed() { 765 return Platform.precomputeCharMatcher(this); 766 } 767 768 /** 769 * Subclasses should provide a new CharMatcher with the same characteristics as {@code this}, 770 * but with their {@code toString} method overridden with the new description. 771 * 772 * <p>This is unsupported by default. 773 */ 774 CharMatcher withToString(String description) { 775 throw new UnsupportedOperationException(); 776 } 777 778 private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; 779 780 /** 781 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a 782 * method on {@link Platform} so that we can have different behavior in GWT. 783 * 784 * <p>This implementation tries to be smart in a number of ways. It recognizes cases where 785 * the negation is cheaper to precompute than the matcher itself; it tries to build small 786 * hash tables for matchers that only match a few characters, and so on. In the worst-case 787 * scenario, it constructs an eight-kilobyte bit array and queries that. 788 * In many situations this produces a matcher which is faster to query than the original. 789 */ 790 @GwtIncompatible("java.util.BitSet") 791 CharMatcher precomputedInternal() { 792 final BitSet table = new BitSet(); 793 setBits(table); 794 int totalCharacters = table.cardinality(); 795 if (totalCharacters * 2 <= DISTINCT_CHARS) { 796 return precomputedPositive(totalCharacters, table, description); 797 } else { 798 // TODO(user): is it worth it to worry about the last character of large matchers? 799 table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 800 int negatedCharacters = DISTINCT_CHARS - totalCharacters; 801 String suffix = ".negate()"; 802 String negatedDescription = description.endsWith(suffix) 803 ? description.substring(0, description.length() - suffix.length()) 804 : description + suffix; 805 return new NegatedFastMatcher(toString(), 806 precomputedPositive(negatedCharacters, table, negatedDescription)); 807 } 808 } 809 810 /** 811 * A matcher for which precomputation will not yield any significant benefit. 812 */ 813 abstract static class FastMatcher extends CharMatcher { 814 FastMatcher() { 815 super(); 816 } 817 818 FastMatcher(String description) { 819 super(description); 820 } 821 822 @Override 823 public final CharMatcher precomputed() { 824 return this; 825 } 826 827 @Override 828 public CharMatcher negate() { 829 return new NegatedFastMatcher(this); 830 } 831 } 832 833 static final class NegatedFastMatcher extends NegatedMatcher { 834 NegatedFastMatcher(CharMatcher original) { 835 super(original); 836 } 837 838 NegatedFastMatcher(String toString, CharMatcher original) { 839 super(toString, original); 840 } 841 842 @Override 843 public final CharMatcher precomputed() { 844 return this; 845 } 846 847 @Override 848 CharMatcher withToString(String description) { 849 return new NegatedFastMatcher(description, original); 850 } 851 } 852 853 /** 854 * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. 855 */ 856 @GwtIncompatible("java.util.BitSet") 857 private static CharMatcher precomputedPositive( 858 int totalCharacters, 859 BitSet table, 860 String description) { 861 switch (totalCharacters) { 862 case 0: 863 return NONE; 864 case 1: 865 return is((char) table.nextSetBit(0)); 866 case 2: 867 char c1 = (char) table.nextSetBit(0); 868 char c2 = (char) table.nextSetBit(c1 + 1); 869 return isEither(c1, c2); 870 default: 871 return isSmall(totalCharacters, table.length()) 872 ? SmallCharMatcher.from(table, description) 873 : new BitSetMatcher(table, description); 874 } 875 } 876 877 @GwtIncompatible("SmallCharMatcher") 878 private static boolean isSmall(int totalCharacters, int tableLength) { 879 return totalCharacters <= SmallCharMatcher.MAX_SIZE 880 && tableLength > (totalCharacters * 4 * Character.SIZE); 881 // err on the side of BitSetMatcher 882 } 883 884 @GwtIncompatible("java.util.BitSet") 885 private static class BitSetMatcher extends FastMatcher { 886 private final BitSet table; 887 888 private BitSetMatcher(BitSet table, String description) { 889 super(description); 890 if (table.length() + Long.SIZE < table.size()) { 891 table = (BitSet) table.clone(); 892 // If only we could actually call BitSet.trimToSize() ourselves... 893 } 894 this.table = table; 895 } 896 897 @Override public boolean matches(char c) { 898 return table.get(c); 899 } 900 901 @Override 902 void setBits(BitSet bitSet) { 903 bitSet.or(table); 904 } 905 } 906 907 /** 908 * Sets bits in {@code table} matched by this matcher. 909 */ 910 @GwtIncompatible("java.util.BitSet") 911 void setBits(BitSet table) { 912 for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { 913 if (matches((char) c)) { 914 table.set(c); 915 } 916 } 917 } 918 919 // Text processing routines 920 921 /** 922 * Returns {@code true} if a character sequence contains at least one matching character. 923 * Equivalent to {@code !matchesNoneOf(sequence)}. 924 * 925 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 926 * character, until this returns {@code true} or the end is reached. 927 * 928 * @param sequence the character sequence to examine, possibly empty 929 * @return {@code true} if this matcher matches at least one character in the sequence 930 * @since 8.0 931 */ 932 public boolean matchesAnyOf(CharSequence sequence) { 933 return !matchesNoneOf(sequence); 934 } 935 936 /** 937 * Returns {@code true} if a character sequence contains only matching characters. 938 * 939 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 940 * character, until this returns {@code false} or the end is reached. 941 * 942 * @param sequence the character sequence to examine, possibly empty 943 * @return {@code true} if this matcher matches every character in the sequence, including when 944 * the sequence is empty 945 */ 946 public boolean matchesAllOf(CharSequence sequence) { 947 for (int i = sequence.length() - 1; i >= 0; i--) { 948 if (!matches(sequence.charAt(i))) { 949 return false; 950 } 951 } 952 return true; 953 } 954 955 /** 956 * Returns {@code true} if a character sequence contains no matching characters. Equivalent to 957 * {@code !matchesAnyOf(sequence)}. 958 * 959 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 960 * character, until this returns {@code false} or the end is reached. 961 * 962 * @param sequence the character sequence to examine, possibly empty 963 * @return {@code true} if this matcher matches every character in the sequence, including when 964 * the sequence is empty 965 */ 966 public boolean matchesNoneOf(CharSequence sequence) { 967 return indexIn(sequence) == -1; 968 } 969 970 /** 971 * Returns the index of the first matching character in a character sequence, or {@code -1} if no 972 * matching character is present. 973 * 974 * <p>The default implementation iterates over the sequence in forward order calling {@link 975 * #matches} for each character. 976 * 977 * @param sequence the character sequence to examine from the beginning 978 * @return an index, or {@code -1} if no character matches 979 */ 980 public int indexIn(CharSequence sequence) { 981 int length = sequence.length(); 982 for (int i = 0; i < length; i++) { 983 if (matches(sequence.charAt(i))) { 984 return i; 985 } 986 } 987 return -1; 988 } 989 990 /** 991 * Returns the index of the first matching character in a character sequence, starting from a 992 * given position, or {@code -1} if no character matches after that position. 993 * 994 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 995 * start}, calling {@link #matches} for each character. 996 * 997 * @param sequence the character sequence to examine 998 * @param start the first index to examine; must be nonnegative and no greater than {@code 999 * sequence.length()} 1000 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 1001 * or {@code -1} if no character matches 1002 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 1003 * sequence.length()} 1004 */ 1005 public int indexIn(CharSequence sequence, int start) { 1006 int length = sequence.length(); 1007 Preconditions.checkPositionIndex(start, length); 1008 for (int i = start; i < length; i++) { 1009 if (matches(sequence.charAt(i))) { 1010 return i; 1011 } 1012 } 1013 return -1; 1014 } 1015 1016 /** 1017 * Returns the index of the last matching character in a character sequence, or {@code -1} if no 1018 * matching character is present. 1019 * 1020 * <p>The default implementation iterates over the sequence in reverse order calling {@link 1021 * #matches} for each character. 1022 * 1023 * @param sequence the character sequence to examine from the end 1024 * @return an index, or {@code -1} if no character matches 1025 */ 1026 public int lastIndexIn(CharSequence sequence) { 1027 for (int i = sequence.length() - 1; i >= 0; i--) { 1028 if (matches(sequence.charAt(i))) { 1029 return i; 1030 } 1031 } 1032 return -1; 1033 } 1034 1035 /** 1036 * Returns the number of matching characters found in a character sequence. 1037 */ 1038 public int countIn(CharSequence sequence) { 1039 int count = 0; 1040 for (int i = 0; i < sequence.length(); i++) { 1041 if (matches(sequence.charAt(i))) { 1042 count++; 1043 } 1044 } 1045 return count; 1046 } 1047 1048 /** 1049 * Returns a string containing all non-matching characters of a character sequence, in order. For 1050 * example: <pre> {@code 1051 * 1052 * CharMatcher.is('a').removeFrom("bazaar")}</pre> 1053 * 1054 * ... returns {@code "bzr"}. 1055 */ 1056 @CheckReturnValue 1057 public String removeFrom(CharSequence sequence) { 1058 String string = sequence.toString(); 1059 int pos = indexIn(string); 1060 if (pos == -1) { 1061 return string; 1062 } 1063 1064 char[] chars = string.toCharArray(); 1065 int spread = 1; 1066 1067 // This unusual loop comes from extensive benchmarking 1068 OUT: while (true) { 1069 pos++; 1070 while (true) { 1071 if (pos == chars.length) { 1072 break OUT; 1073 } 1074 if (matches(chars[pos])) { 1075 break; 1076 } 1077 chars[pos - spread] = chars[pos]; 1078 pos++; 1079 } 1080 spread++; 1081 } 1082 return new String(chars, 0, pos - spread); 1083 } 1084 1085 /** 1086 * Returns a string containing all matching characters of a character sequence, in order. For 1087 * example: <pre> {@code 1088 * 1089 * CharMatcher.is('a').retainFrom("bazaar")}</pre> 1090 * 1091 * ... returns {@code "aaa"}. 1092 */ 1093 @CheckReturnValue 1094 public String retainFrom(CharSequence sequence) { 1095 return negate().removeFrom(sequence); 1096 } 1097 1098 /** 1099 * Returns a string copy of the input character sequence, with each character that matches this 1100 * matcher replaced by a given replacement character. For example: <pre> {@code 1101 * 1102 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre> 1103 * 1104 * ... returns {@code "rodor"}. 1105 * 1106 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1107 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1108 * character. 1109 * 1110 * @param sequence the character sequence to replace matching characters in 1111 * @param replacement the character to append to the result string in place of each matching 1112 * character in {@code sequence} 1113 * @return the new string 1114 */ 1115 @CheckReturnValue 1116 public String replaceFrom(CharSequence sequence, char replacement) { 1117 String string = sequence.toString(); 1118 int pos = indexIn(string); 1119 if (pos == -1) { 1120 return string; 1121 } 1122 char[] chars = string.toCharArray(); 1123 chars[pos] = replacement; 1124 for (int i = pos + 1; i < chars.length; i++) { 1125 if (matches(chars[i])) { 1126 chars[i] = replacement; 1127 } 1128 } 1129 return new String(chars); 1130 } 1131 1132 /** 1133 * Returns a string copy of the input character sequence, with each character that matches this 1134 * matcher replaced by a given replacement sequence. For example: <pre> {@code 1135 * 1136 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre> 1137 * 1138 * ... returns {@code "yoohoo"}. 1139 * 1140 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 1141 * off calling {@link #replaceFrom(CharSequence, char)} directly. 1142 * 1143 * @param sequence the character sequence to replace matching characters in 1144 * @param replacement the characters to append to the result string in place of each matching 1145 * character in {@code sequence} 1146 * @return the new string 1147 */ 1148 @CheckReturnValue 1149 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1150 int replacementLen = replacement.length(); 1151 if (replacementLen == 0) { 1152 return removeFrom(sequence); 1153 } 1154 if (replacementLen == 1) { 1155 return replaceFrom(sequence, replacement.charAt(0)); 1156 } 1157 1158 String string = sequence.toString(); 1159 int pos = indexIn(string); 1160 if (pos == -1) { 1161 return string; 1162 } 1163 1164 int len = string.length(); 1165 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 1166 1167 int oldpos = 0; 1168 do { 1169 buf.append(string, oldpos, pos); 1170 buf.append(replacement); 1171 oldpos = pos + 1; 1172 pos = indexIn(string, oldpos); 1173 } while (pos != -1); 1174 1175 buf.append(string, oldpos, len); 1176 return buf.toString(); 1177 } 1178 1179 /** 1180 * Returns a substring of the input character sequence that omits all characters this matcher 1181 * matches from the beginning and from the end of the string. For example: <pre> {@code 1182 * 1183 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre> 1184 * 1185 * ... returns {@code "cat"}. 1186 * 1187 * <p>Note that: <pre> {@code 1188 * 1189 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre> 1190 * 1191 * ... is equivalent to {@link String#trim()}. 1192 */ 1193 @CheckReturnValue 1194 public String trimFrom(CharSequence sequence) { 1195 int len = sequence.length(); 1196 int first; 1197 int last; 1198 1199 for (first = 0; first < len; first++) { 1200 if (!matches(sequence.charAt(first))) { 1201 break; 1202 } 1203 } 1204 for (last = len - 1; last > first; last--) { 1205 if (!matches(sequence.charAt(last))) { 1206 break; 1207 } 1208 } 1209 1210 return sequence.subSequence(first, last + 1).toString(); 1211 } 1212 1213 /** 1214 * Returns a substring of the input character sequence that omits all characters this matcher 1215 * matches from the beginning of the string. For example: <pre> {@code 1216 * 1217 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre> 1218 * 1219 * ... returns {@code "catbab"}. 1220 */ 1221 @CheckReturnValue 1222 public String trimLeadingFrom(CharSequence sequence) { 1223 int len = sequence.length(); 1224 for (int first = 0; first < len; first++) { 1225 if (!matches(sequence.charAt(first))) { 1226 return sequence.subSequence(first, len).toString(); 1227 } 1228 } 1229 return ""; 1230 } 1231 1232 /** 1233 * Returns a substring of the input character sequence that omits all characters this matcher 1234 * matches from the end of the string. For example: <pre> {@code 1235 * 1236 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre> 1237 * 1238 * ... returns {@code "abacat"}. 1239 */ 1240 @CheckReturnValue 1241 public String trimTrailingFrom(CharSequence sequence) { 1242 int len = sequence.length(); 1243 for (int last = len - 1; last >= 0; last--) { 1244 if (!matches(sequence.charAt(last))) { 1245 return sequence.subSequence(0, last + 1).toString(); 1246 } 1247 } 1248 return ""; 1249 } 1250 1251 /** 1252 * Returns a string copy of the input character sequence, with each group of consecutive 1253 * characters that match this matcher replaced by a single replacement character. For example: 1254 * <pre> {@code 1255 * 1256 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre> 1257 * 1258 * ... returns {@code "b-p-r"}. 1259 * 1260 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1261 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1262 * character. 1263 * 1264 * @param sequence the character sequence to replace matching groups of characters in 1265 * @param replacement the character to append to the result string in place of each group of 1266 * matching characters in {@code sequence} 1267 * @return the new string 1268 */ 1269 @CheckReturnValue 1270 public String collapseFrom(CharSequence sequence, char replacement) { 1271 // This implementation avoids unnecessary allocation. 1272 int len = sequence.length(); 1273 for (int i = 0; i < len; i++) { 1274 char c = sequence.charAt(i); 1275 if (matches(c)) { 1276 if (c == replacement 1277 && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { 1278 // a no-op replacement 1279 i++; 1280 } else { 1281 StringBuilder builder = new StringBuilder(len) 1282 .append(sequence.subSequence(0, i)) 1283 .append(replacement); 1284 return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); 1285 } 1286 } 1287 } 1288 // no replacement needed 1289 return sequence.toString(); 1290 } 1291 1292 /** 1293 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 1294 * groups of matching characters at the start or end of the sequence are removed without 1295 * replacement. 1296 */ 1297 @CheckReturnValue 1298 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 1299 // This implementation avoids unnecessary allocation. 1300 int len = sequence.length(); 1301 int first; 1302 int last; 1303 1304 for (first = 0; first < len && matches(sequence.charAt(first)); first++) {} 1305 for (last = len - 1; last > first && matches(sequence.charAt(last)); last--) {} 1306 1307 return (first == 0 && last == len - 1) 1308 ? collapseFrom(sequence, replacement) 1309 : finishCollapseFrom( 1310 sequence, first, last + 1, replacement, 1311 new StringBuilder(last + 1 - first), 1312 false); 1313 } 1314 1315 private String finishCollapseFrom( 1316 CharSequence sequence, int start, int end, char replacement, 1317 StringBuilder builder, boolean inMatchingGroup) { 1318 for (int i = start; i < end; i++) { 1319 char c = sequence.charAt(i); 1320 if (matches(c)) { 1321 if (!inMatchingGroup) { 1322 builder.append(replacement); 1323 inMatchingGroup = true; 1324 } 1325 } else { 1326 builder.append(c); 1327 inMatchingGroup = false; 1328 } 1329 } 1330 return builder.toString(); 1331 } 1332 1333 /** 1334 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} 1335 * instead. 1336 */ 1337 @Deprecated 1338 @Override 1339 public boolean apply(Character character) { 1340 return matches(character); 1341 } 1342 1343 /** 1344 * Returns a string representation of this {@code CharMatcher}, such as 1345 * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 1346 */ 1347 @Override 1348 public String toString() { 1349 return description; 1350 } 1351 1352 static final String WHITESPACE_TABLE = "" 1353 + "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" 1354 + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" 1355 + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" 1356 + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; 1357 static final int WHITESPACE_MULTIPLIER = 1682554634; 1358 static final int WHITESPACE_SHIFT = Integer.numberOfLeadingZeros(WHITESPACE_TABLE.length() - 1); 1359 1360 /** 1361 * Determines whether a character is whitespace according to the latest Unicode standard, as 1362 * illustrated 1363 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 1364 * This is not the same definition used by other Java APIs. (See a 1365 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several 1366 * definitions of "whitespace"</a>.) 1367 * 1368 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up 1369 * to date. 1370 */ 1371 public static final CharMatcher WHITESPACE = new FastMatcher("WHITESPACE") { 1372 @Override 1373 public boolean matches(char c) { 1374 return WHITESPACE_TABLE.charAt((WHITESPACE_MULTIPLIER * c) >>> WHITESPACE_SHIFT) == c; 1375 } 1376 1377 @GwtIncompatible("java.util.BitSet") 1378 @Override 1379 void setBits(BitSet table) { 1380 for (int i = 0; i < WHITESPACE_TABLE.length(); i++) { 1381 table.set(WHITESPACE_TABLE.charAt(i)); 1382 } 1383 } 1384 }; 1385} 1386