1/* 2 * Copyright (C) 2008 The Guava Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.google.common.base; 18 19import static com.google.common.base.Preconditions.checkArgument; 20import static com.google.common.base.Preconditions.checkNotNull; 21 22import com.google.common.annotations.Beta; 23import com.google.common.annotations.GwtCompatible; 24 25import java.util.Arrays; 26 27import javax.annotation.CheckReturnValue; 28 29/** 30 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 31 * for any {@link Object}. Also offers basic text processing methods based on this function. 32 * Implementations are strongly encouraged to be side-effect-free and immutable. 33 * 34 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 35 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}". 36 * 37 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand 38 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical 39 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher} 40 * treats these just as two separate characters. 41 * 42 * <p>Example usages: <pre> 43 * String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput); 44 * if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 45 * 46 * <p>See the Guava User Guide article on <a href= 47 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher"> 48 * {@code CharMatcher}</a>. 49 * 50 * @author Kevin Bourrillion 51 * @since 1.0 52 */ 53@Beta // Possibly change from chars to code points; decide constants vs. methods 54@GwtCompatible(emulated = true) 55public abstract class CharMatcher implements Predicate<Character> { 56 57 // Constants 58 /** 59 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 60 * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a 61 * discussion of that term. 62 * 63 * @since 2.0 64 */ 65 public static final CharMatcher BREAKING_WHITESPACE = new CharMatcher() { 66 @Override 67 public boolean matches(char c) { 68 switch (c) { 69 case '\t': 70 case '\n': 71 case '\013': 72 case '\f': 73 case '\r': 74 case ' ': 75 case '\u0085': 76 case '\u1680': 77 case '\u2028': 78 case '\u2029': 79 case '\u205f': 80 case '\u3000': 81 return true; 82 case '\u2007': 83 return false; 84 default: 85 return c >= '\u2000' && c <= '\u200a'; 86 } 87 } 88 89 @Override 90 public String toString() { 91 return "CharMatcher.BREAKING_WHITESPACE"; 92 } 93 }; 94 95 /** 96 * Determines whether a character is ASCII, meaning that its code point is less than 128. 97 */ 98 public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII"); 99 100 private static class RangesMatcher extends CharMatcher { 101 private final char[] rangeStarts; 102 private final char[] rangeEnds; 103 104 RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { 105 super(description); 106 this.rangeStarts = rangeStarts; 107 this.rangeEnds = rangeEnds; 108 checkArgument(rangeStarts.length == rangeEnds.length); 109 for (int i = 0; i < rangeStarts.length; i++) { 110 checkArgument(rangeStarts[i] <= rangeEnds[i]); 111 if (i + 1 < rangeStarts.length) { 112 checkArgument(rangeEnds[i] < rangeStarts[i + 1]); 113 } 114 } 115 } 116 117 @Override 118 public boolean matches(char c) { 119 int index = Arrays.binarySearch(rangeStarts, c); 120 if (index >= 0) { 121 return true; 122 } else { 123 index = ~index - 1; 124 return index >= 0 && c <= rangeEnds[index]; 125 } 126 } 127 } 128 129 // Must be in ascending order. 130 private static final String ZEROES = "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6" 131 + "\u0c66\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1b50\u1bb0" 132 + "\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10"; 133 134 private static final String NINES; 135 static { 136 StringBuilder builder = new StringBuilder(ZEROES.length()); 137 for (int i = 0; i < ZEROES.length(); i++) { 138 builder.append((char) (ZEROES.charAt(i) + 9)); 139 } 140 NINES = builder.toString(); 141 } 142 143 /** 144 * Determines whether a character is a digit according to 145 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. 146 * If you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 147 */ 148 public static final CharMatcher DIGIT = new RangesMatcher( 149 "CharMatcher.DIGIT", ZEROES.toCharArray(), NINES.toCharArray()); 150 151 /** 152 * Determines whether a character is a digit according to {@linkplain Character#isDigit(char) 153 * Java's definition}. If you only care to match ASCII digits, you can use {@code 154 * inRange('0', '9')}. 155 */ 156 public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") { 157 @Override public boolean matches(char c) { 158 return Character.isDigit(c); 159 } 160 }; 161 162 /** 163 * Determines whether a character is a letter according to {@linkplain Character#isLetter(char) 164 * Java's definition}. If you only care to match letters of the Latin alphabet, you can use {@code 165 * inRange('a', 'z').or(inRange('A', 'Z'))}. 166 */ 167 public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") { 168 @Override public boolean matches(char c) { 169 return Character.isLetter(c); 170 } 171 }; 172 173 /** 174 * Determines whether a character is a letter or digit according to {@linkplain 175 * Character#isLetterOrDigit(char) Java's definition}. 176 */ 177 public static final CharMatcher JAVA_LETTER_OR_DIGIT = 178 new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") { 179 @Override public boolean matches(char c) { 180 return Character.isLetterOrDigit(c); 181 } 182 }; 183 184 /** 185 * Determines whether a character is upper case according to {@linkplain 186 * Character#isUpperCase(char) Java's definition}. 187 */ 188 public static final CharMatcher JAVA_UPPER_CASE = 189 new CharMatcher("CharMatcher.JAVA_UPPER_CASE") { 190 @Override public boolean matches(char c) { 191 return Character.isUpperCase(c); 192 } 193 }; 194 195 /** 196 * Determines whether a character is lower case according to {@linkplain 197 * Character#isLowerCase(char) Java's definition}. 198 */ 199 public static final CharMatcher JAVA_LOWER_CASE = 200 new CharMatcher("CharMatcher.JAVA_LOWER_CASE") { 201 @Override public boolean matches(char c) { 202 return Character.isLowerCase(c); 203 } 204 }; 205 206 /** 207 * Determines whether a character is an ISO control character as specified by {@link 208 * Character#isISOControl(char)}. 209 */ 210 public static final CharMatcher JAVA_ISO_CONTROL = 211 inRange('\u0000', '\u001f') 212 .or(inRange('\u007f', '\u009f')) 213 .withToString("CharMatcher.JAVA_ISO_CONTROL"); 214 215 /** 216 * Determines whether a character is invisible; that is, if its Unicode category is any of 217 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 218 * PRIVATE_USE according to ICU4J. 219 */ 220 public static final CharMatcher INVISIBLE = new RangesMatcher("CharMatcher.INVISIBLE", ( 221 "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u1680\u180e\u2000\u2028\u205f\u2066\u2067\u2068" 222 + "\u2069\u206a\u3000\ud800\ufeff\ufff9\ufffa").toCharArray(), ( 223 "\u0020\u00a0\u00ad\u0604\u061c\u06dd\u070f\u1680\u180e\u200f\u202f\u2064\u2066\u2067\u2068" 224 + "\u2069\u206f\u3000\uf8ff\ufeff\ufff9\ufffb").toCharArray()); 225 226 private static String showCharacter(char c) { 227 String hex = "0123456789ABCDEF"; 228 char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; 229 for (int i = 0; i < 4; i++) { 230 tmp[5 - i] = hex.charAt(c & 0xF); 231 c >>= 4; 232 } 233 return String.copyValueOf(tmp); 234 235 } 236 237 /** 238 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 239 * errs on the side of returning {@code false} (that is, it tends to assume a character is 240 * double-width). 241 * 242 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to 243 * date. 244 */ 245 public static final CharMatcher SINGLE_WIDTH = new RangesMatcher("CharMatcher.SINGLE_WIDTH", 246 "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), 247 "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); 248 249 /** Matches any character. */ 250 public static final CharMatcher ANY = 251 new FastMatcher("CharMatcher.ANY") { 252 @Override public boolean matches(char c) { 253 return true; 254 } 255 256 @Override public int indexIn(CharSequence sequence) { 257 return (sequence.length() == 0) ? -1 : 0; 258 } 259 260 @Override public int indexIn(CharSequence sequence, int start) { 261 int length = sequence.length(); 262 Preconditions.checkPositionIndex(start, length); 263 return (start == length) ? -1 : start; 264 } 265 266 @Override public int lastIndexIn(CharSequence sequence) { 267 return sequence.length() - 1; 268 } 269 270 @Override public boolean matchesAllOf(CharSequence sequence) { 271 checkNotNull(sequence); 272 return true; 273 } 274 275 @Override public boolean matchesNoneOf(CharSequence sequence) { 276 return sequence.length() == 0; 277 } 278 279 @Override public String removeFrom(CharSequence sequence) { 280 checkNotNull(sequence); 281 return ""; 282 } 283 284 @Override public String replaceFrom(CharSequence sequence, char replacement) { 285 char[] array = new char[sequence.length()]; 286 Arrays.fill(array, replacement); 287 return new String(array); 288 } 289 290 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 291 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length()); 292 for (int i = 0; i < sequence.length(); i++) { 293 retval.append(replacement); 294 } 295 return retval.toString(); 296 } 297 298 @Override public String collapseFrom(CharSequence sequence, char replacement) { 299 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 300 } 301 302 @Override public String trimFrom(CharSequence sequence) { 303 checkNotNull(sequence); 304 return ""; 305 } 306 307 @Override public int countIn(CharSequence sequence) { 308 return sequence.length(); 309 } 310 311 @Override public CharMatcher and(CharMatcher other) { 312 return checkNotNull(other); 313 } 314 315 @Override public CharMatcher or(CharMatcher other) { 316 checkNotNull(other); 317 return this; 318 } 319 320 @Override public CharMatcher negate() { 321 return NONE; 322 } 323 }; 324 325 /** Matches no characters. */ 326 public static final CharMatcher NONE = 327 new FastMatcher("CharMatcher.NONE") { 328 @Override public boolean matches(char c) { 329 return false; 330 } 331 332 @Override public int indexIn(CharSequence sequence) { 333 checkNotNull(sequence); 334 return -1; 335 } 336 337 @Override public int indexIn(CharSequence sequence, int start) { 338 int length = sequence.length(); 339 Preconditions.checkPositionIndex(start, length); 340 return -1; 341 } 342 343 @Override public int lastIndexIn(CharSequence sequence) { 344 checkNotNull(sequence); 345 return -1; 346 } 347 348 @Override public boolean matchesAllOf(CharSequence sequence) { 349 return sequence.length() == 0; 350 } 351 352 @Override public boolean matchesNoneOf(CharSequence sequence) { 353 checkNotNull(sequence); 354 return true; 355 } 356 357 @Override public String removeFrom(CharSequence sequence) { 358 return sequence.toString(); 359 } 360 361 @Override public String replaceFrom(CharSequence sequence, char replacement) { 362 return sequence.toString(); 363 } 364 365 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 366 checkNotNull(replacement); 367 return sequence.toString(); 368 } 369 370 @Override public String collapseFrom(CharSequence sequence, char replacement) { 371 return sequence.toString(); 372 } 373 374 @Override public String trimFrom(CharSequence sequence) { 375 return sequence.toString(); 376 } 377 378 @Override 379 public String trimLeadingFrom(CharSequence sequence) { 380 return sequence.toString(); 381 } 382 383 @Override 384 public String trimTrailingFrom(CharSequence sequence) { 385 return sequence.toString(); 386 } 387 388 @Override public int countIn(CharSequence sequence) { 389 checkNotNull(sequence); 390 return 0; 391 } 392 393 @Override public CharMatcher and(CharMatcher other) { 394 checkNotNull(other); 395 return this; 396 } 397 398 @Override public CharMatcher or(CharMatcher other) { 399 return checkNotNull(other); 400 } 401 402 @Override public CharMatcher negate() { 403 return ANY; 404 } 405 }; 406 407 // Static factories 408 409 /** 410 * Returns a {@code char} matcher that matches only one specified character. 411 */ 412 public static CharMatcher is(final char match) { 413 String description = "CharMatcher.is('" + showCharacter(match) + "')"; 414 return new FastMatcher(description) { 415 @Override public boolean matches(char c) { 416 return c == match; 417 } 418 419 @Override public String replaceFrom(CharSequence sequence, char replacement) { 420 return sequence.toString().replace(match, replacement); 421 } 422 423 @Override public CharMatcher and(CharMatcher other) { 424 return other.matches(match) ? this : NONE; 425 } 426 427 @Override public CharMatcher or(CharMatcher other) { 428 return other.matches(match) ? other : super.or(other); 429 } 430 431 @Override public CharMatcher negate() { 432 return isNot(match); 433 } 434 }; 435 } 436 437 /** 438 * Returns a {@code char} matcher that matches any character except the one specified. 439 * 440 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 441 */ 442 public static CharMatcher isNot(final char match) { 443 String description = "CharMatcher.isNot('" + showCharacter(match) + "')"; 444 return new FastMatcher(description) { 445 @Override public boolean matches(char c) { 446 return c != match; 447 } 448 449 @Override public CharMatcher and(CharMatcher other) { 450 return other.matches(match) ? super.and(other) : other; 451 } 452 453 @Override public CharMatcher or(CharMatcher other) { 454 return other.matches(match) ? ANY : this; 455 } 456 457 @Override public CharMatcher negate() { 458 return is(match); 459 } 460 }; 461 } 462 463 /** 464 * Returns a {@code char} matcher that matches any character present in the given character 465 * sequence. 466 */ 467 public static CharMatcher anyOf(final CharSequence sequence) { 468 switch (sequence.length()) { 469 case 0: 470 return NONE; 471 case 1: 472 return is(sequence.charAt(0)); 473 case 2: 474 return isEither(sequence.charAt(0), sequence.charAt(1)); 475 default: 476 // continue below to handle the general case 477 } 478 // TODO(user): is it potentially worth just going ahead and building a precomputed matcher? 479 final char[] chars = sequence.toString().toCharArray(); 480 Arrays.sort(chars); 481 StringBuilder description = new StringBuilder("CharMatcher.anyOf(\""); 482 for (char c : chars) { 483 description.append(showCharacter(c)); 484 } 485 description.append("\")"); 486 return new CharMatcher(description.toString()) { 487 @Override public boolean matches(char c) { 488 return Arrays.binarySearch(chars, c) >= 0; 489 } 490 }; 491 } 492 493 private static CharMatcher isEither( 494 final char match1, 495 final char match2) { 496 String description = "CharMatcher.anyOf(\"" + 497 showCharacter(match1) + showCharacter(match2) + "\")"; 498 return new FastMatcher(description) { 499 @Override public boolean matches(char c) { 500 return c == match1 || c == match2; 501 } 502 }; 503 } 504 505 /** 506 * Returns a {@code char} matcher that matches any character not present in the given character 507 * sequence. 508 */ 509 public static CharMatcher noneOf(CharSequence sequence) { 510 return anyOf(sequence).negate(); 511 } 512 513 /** 514 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are 515 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 516 * CharMatcher.inRange('a', 'z')}. 517 * 518 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 519 */ 520 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 521 checkArgument(endInclusive >= startInclusive); 522 String description = "CharMatcher.inRange('" + 523 showCharacter(startInclusive) + "', '" + 524 showCharacter(endInclusive) + "')"; 525 return inRange(startInclusive, endInclusive, description); 526 } 527 528 static CharMatcher inRange(final char startInclusive, final char endInclusive, 529 String description) { 530 return new FastMatcher(description) { 531 @Override public boolean matches(char c) { 532 return startInclusive <= c && c <= endInclusive; 533 } 534 }; 535 } 536 537 /** 538 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 539 * which operates on primitive {@code char} instances instead. 540 */ 541 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 542 checkNotNull(predicate); 543 if (predicate instanceof CharMatcher) { 544 return (CharMatcher) predicate; 545 } 546 String description = "CharMatcher.forPredicate(" + predicate + ")"; 547 return new CharMatcher(description) { 548 @Override public boolean matches(char c) { 549 return predicate.apply(c); 550 } 551 552 @Override public boolean apply(Character character) { 553 return predicate.apply(checkNotNull(character)); 554 } 555 }; 556 } 557 558 // State 559 final String description; 560 561 // Constructors 562 563 /** 564 * Sets the {@code toString()} from the given description. 565 */ 566 CharMatcher(String description) { 567 this.description = description; 568 } 569 570 /** 571 * Constructor for use by subclasses. When subclassing, you may want to override 572 * {@code toString()} to provide a useful description. 573 */ 574 protected CharMatcher() { 575 description = super.toString(); 576 } 577 578 // Abstract methods 579 580 /** Determines a true or false value for the given character. */ 581 public abstract boolean matches(char c); 582 583 // Non-static factories 584 585 /** 586 * Returns a matcher that matches any character not matched by this matcher. 587 */ 588 public CharMatcher negate() { 589 return new NegatedMatcher(this); 590 } 591 592 private static class NegatedMatcher extends CharMatcher { 593 final CharMatcher original; 594 595 NegatedMatcher(String toString, CharMatcher original) { 596 super(toString); 597 this.original = original; 598 } 599 600 NegatedMatcher(CharMatcher original) { 601 this(original + ".negate()", original); 602 } 603 604 @Override public boolean matches(char c) { 605 return !original.matches(c); 606 } 607 608 @Override public boolean matchesAllOf(CharSequence sequence) { 609 return original.matchesNoneOf(sequence); 610 } 611 612 @Override public boolean matchesNoneOf(CharSequence sequence) { 613 return original.matchesAllOf(sequence); 614 } 615 616 @Override public int countIn(CharSequence sequence) { 617 return sequence.length() - original.countIn(sequence); 618 } 619 620 @Override public CharMatcher negate() { 621 return original; 622 } 623 624 @Override 625 CharMatcher withToString(String description) { 626 return new NegatedMatcher(description, original); 627 } 628 } 629 630 /** 631 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 632 */ 633 public CharMatcher and(CharMatcher other) { 634 return new And(this, checkNotNull(other)); 635 } 636 637 private static class And extends CharMatcher { 638 final CharMatcher first; 639 final CharMatcher second; 640 641 And(CharMatcher a, CharMatcher b) { 642 this(a, b, "CharMatcher.and(" + a + ", " + b + ")"); 643 } 644 645 And(CharMatcher a, CharMatcher b, String description) { 646 super(description); 647 first = checkNotNull(a); 648 second = checkNotNull(b); 649 } 650 651 @Override 652 public boolean matches(char c) { 653 return first.matches(c) && second.matches(c); 654 } 655 656 @Override 657 CharMatcher withToString(String description) { 658 return new And(first, second, description); 659 } 660 } 661 662 /** 663 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 664 */ 665 public CharMatcher or(CharMatcher other) { 666 return new Or(this, checkNotNull(other)); 667 } 668 669 private static class Or extends CharMatcher { 670 final CharMatcher first; 671 final CharMatcher second; 672 673 Or(CharMatcher a, CharMatcher b, String description) { 674 super(description); 675 first = checkNotNull(a); 676 second = checkNotNull(b); 677 } 678 679 Or(CharMatcher a, CharMatcher b) { 680 this(a, b, "CharMatcher.or(" + a + ", " + b + ")"); 681 } 682 683 @Override 684 public boolean matches(char c) { 685 return first.matches(c) || second.matches(c); 686 } 687 688 @Override 689 CharMatcher withToString(String description) { 690 return new Or(first, second, description); 691 } 692 } 693 694 /** 695 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 696 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 697 * worthwhile only if the precomputed matcher is queried many thousands of times. 698 * 699 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 700 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 701 * worthwhile tradeoff in a browser. 702 */ 703 public CharMatcher precomputed() { 704 return Platform.precomputeCharMatcher(this); 705 } 706 707 /** 708 * Subclasses should provide a new CharMatcher with the same characteristics as {@code this}, 709 * but with their {@code toString} method overridden with the new description. 710 * 711 * <p>This is unsupported by default. 712 */ 713 CharMatcher withToString(String description) { 714 throw new UnsupportedOperationException(); 715 } 716 717 private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; 718 719 /** 720 * A matcher for which precomputation will not yield any significant benefit. 721 */ 722 abstract static class FastMatcher extends CharMatcher { 723 FastMatcher() { 724 super(); 725 } 726 727 FastMatcher(String description) { 728 super(description); 729 } 730 731 @Override 732 public final CharMatcher precomputed() { 733 return this; 734 } 735 736 @Override 737 public CharMatcher negate() { 738 return new NegatedFastMatcher(this); 739 } 740 } 741 742 static final class NegatedFastMatcher extends NegatedMatcher { 743 NegatedFastMatcher(CharMatcher original) { 744 super(original); 745 } 746 747 NegatedFastMatcher(String toString, CharMatcher original) { 748 super(toString, original); 749 } 750 751 @Override 752 public final CharMatcher precomputed() { 753 return this; 754 } 755 756 @Override 757 CharMatcher withToString(String description) { 758 return new NegatedFastMatcher(description, original); 759 } 760 } 761 762 // Text processing routines 763 764 /** 765 * Returns {@code true} if a character sequence contains at least one matching character. 766 * Equivalent to {@code !matchesNoneOf(sequence)}. 767 * 768 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 769 * character, until this returns {@code true} or the end is reached. 770 * 771 * @param sequence the character sequence to examine, possibly empty 772 * @return {@code true} if this matcher matches at least one character in the sequence 773 * @since 8.0 774 */ 775 public boolean matchesAnyOf(CharSequence sequence) { 776 return !matchesNoneOf(sequence); 777 } 778 779 /** 780 * Returns {@code true} if a character sequence contains only matching characters. 781 * 782 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 783 * character, until this returns {@code false} or the end is reached. 784 * 785 * @param sequence the character sequence to examine, possibly empty 786 * @return {@code true} if this matcher matches every character in the sequence, including when 787 * the sequence is empty 788 */ 789 public boolean matchesAllOf(CharSequence sequence) { 790 for (int i = sequence.length() - 1; i >= 0; i--) { 791 if (!matches(sequence.charAt(i))) { 792 return false; 793 } 794 } 795 return true; 796 } 797 798 /** 799 * Returns {@code true} if a character sequence contains no matching characters. Equivalent to 800 * {@code !matchesAnyOf(sequence)}. 801 * 802 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 803 * character, until this returns {@code false} or the end is reached. 804 * 805 * @param sequence the character sequence to examine, possibly empty 806 * @return {@code true} if this matcher matches every character in the sequence, including when 807 * the sequence is empty 808 */ 809 public boolean matchesNoneOf(CharSequence sequence) { 810 return indexIn(sequence) == -1; 811 } 812 813 /** 814 * Returns the index of the first matching character in a character sequence, or {@code -1} if no 815 * matching character is present. 816 * 817 * <p>The default implementation iterates over the sequence in forward order calling {@link 818 * #matches} for each character. 819 * 820 * @param sequence the character sequence to examine from the beginning 821 * @return an index, or {@code -1} if no character matches 822 */ 823 public int indexIn(CharSequence sequence) { 824 int length = sequence.length(); 825 for (int i = 0; i < length; i++) { 826 if (matches(sequence.charAt(i))) { 827 return i; 828 } 829 } 830 return -1; 831 } 832 833 /** 834 * Returns the index of the first matching character in a character sequence, starting from a 835 * given position, or {@code -1} if no character matches after that position. 836 * 837 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 838 * start}, calling {@link #matches} for each character. 839 * 840 * @param sequence the character sequence to examine 841 * @param start the first index to examine; must be nonnegative and no greater than {@code 842 * sequence.length()} 843 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 844 * or {@code -1} if no character matches 845 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 846 * sequence.length()} 847 */ 848 public int indexIn(CharSequence sequence, int start) { 849 int length = sequence.length(); 850 Preconditions.checkPositionIndex(start, length); 851 for (int i = start; i < length; i++) { 852 if (matches(sequence.charAt(i))) { 853 return i; 854 } 855 } 856 return -1; 857 } 858 859 /** 860 * Returns the index of the last matching character in a character sequence, or {@code -1} if no 861 * matching character is present. 862 * 863 * <p>The default implementation iterates over the sequence in reverse order calling {@link 864 * #matches} for each character. 865 * 866 * @param sequence the character sequence to examine from the end 867 * @return an index, or {@code -1} if no character matches 868 */ 869 public int lastIndexIn(CharSequence sequence) { 870 for (int i = sequence.length() - 1; i >= 0; i--) { 871 if (matches(sequence.charAt(i))) { 872 return i; 873 } 874 } 875 return -1; 876 } 877 878 /** 879 * Returns the number of matching characters found in a character sequence. 880 */ 881 public int countIn(CharSequence sequence) { 882 int count = 0; 883 for (int i = 0; i < sequence.length(); i++) { 884 if (matches(sequence.charAt(i))) { 885 count++; 886 } 887 } 888 return count; 889 } 890 891 /** 892 * Returns a string containing all non-matching characters of a character sequence, in order. For 893 * example: <pre> {@code 894 * 895 * CharMatcher.is('a').removeFrom("bazaar")}</pre> 896 * 897 * ... returns {@code "bzr"}. 898 */ 899 @CheckReturnValue 900 public String removeFrom(CharSequence sequence) { 901 String string = sequence.toString(); 902 int pos = indexIn(string); 903 if (pos == -1) { 904 return string; 905 } 906 907 char[] chars = string.toCharArray(); 908 int spread = 1; 909 910 // This unusual loop comes from extensive benchmarking 911 OUT: while (true) { 912 pos++; 913 while (true) { 914 if (pos == chars.length) { 915 break OUT; 916 } 917 if (matches(chars[pos])) { 918 break; 919 } 920 chars[pos - spread] = chars[pos]; 921 pos++; 922 } 923 spread++; 924 } 925 return new String(chars, 0, pos - spread); 926 } 927 928 /** 929 * Returns a string containing all matching characters of a character sequence, in order. For 930 * example: <pre> {@code 931 * 932 * CharMatcher.is('a').retainFrom("bazaar")}</pre> 933 * 934 * ... returns {@code "aaa"}. 935 */ 936 @CheckReturnValue 937 public String retainFrom(CharSequence sequence) { 938 return negate().removeFrom(sequence); 939 } 940 941 /** 942 * Returns a string copy of the input character sequence, with each character that matches this 943 * matcher replaced by a given replacement character. For example: <pre> {@code 944 * 945 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre> 946 * 947 * ... returns {@code "rodor"}. 948 * 949 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 950 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 951 * character. 952 * 953 * @param sequence the character sequence to replace matching characters in 954 * @param replacement the character to append to the result string in place of each matching 955 * character in {@code sequence} 956 * @return the new string 957 */ 958 @CheckReturnValue 959 public String replaceFrom(CharSequence sequence, char replacement) { 960 String string = sequence.toString(); 961 int pos = indexIn(string); 962 if (pos == -1) { 963 return string; 964 } 965 char[] chars = string.toCharArray(); 966 chars[pos] = replacement; 967 for (int i = pos + 1; i < chars.length; i++) { 968 if (matches(chars[i])) { 969 chars[i] = replacement; 970 } 971 } 972 return new String(chars); 973 } 974 975 /** 976 * Returns a string copy of the input character sequence, with each character that matches this 977 * matcher replaced by a given replacement sequence. For example: <pre> {@code 978 * 979 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre> 980 * 981 * ... returns {@code "yoohoo"}. 982 * 983 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 984 * off calling {@link #replaceFrom(CharSequence, char)} directly. 985 * 986 * @param sequence the character sequence to replace matching characters in 987 * @param replacement the characters to append to the result string in place of each matching 988 * character in {@code sequence} 989 * @return the new string 990 */ 991 @CheckReturnValue 992 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 993 int replacementLen = replacement.length(); 994 if (replacementLen == 0) { 995 return removeFrom(sequence); 996 } 997 if (replacementLen == 1) { 998 return replaceFrom(sequence, replacement.charAt(0)); 999 } 1000 1001 String string = sequence.toString(); 1002 int pos = indexIn(string); 1003 if (pos == -1) { 1004 return string; 1005 } 1006 1007 int len = string.length(); 1008 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 1009 1010 int oldpos = 0; 1011 do { 1012 buf.append(string, oldpos, pos); 1013 buf.append(replacement); 1014 oldpos = pos + 1; 1015 pos = indexIn(string, oldpos); 1016 } while (pos != -1); 1017 1018 buf.append(string, oldpos, len); 1019 return buf.toString(); 1020 } 1021 1022 /** 1023 * Returns a substring of the input character sequence that omits all characters this matcher 1024 * matches from the beginning and from the end of the string. For example: <pre> {@code 1025 * 1026 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre> 1027 * 1028 * ... returns {@code "cat"}. 1029 * 1030 * <p>Note that: <pre> {@code 1031 * 1032 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre> 1033 * 1034 * ... is equivalent to {@link String#trim()}. 1035 */ 1036 @CheckReturnValue 1037 public String trimFrom(CharSequence sequence) { 1038 int len = sequence.length(); 1039 int first; 1040 int last; 1041 1042 for (first = 0; first < len; first++) { 1043 if (!matches(sequence.charAt(first))) { 1044 break; 1045 } 1046 } 1047 for (last = len - 1; last > first; last--) { 1048 if (!matches(sequence.charAt(last))) { 1049 break; 1050 } 1051 } 1052 1053 return sequence.subSequence(first, last + 1).toString(); 1054 } 1055 1056 /** 1057 * Returns a substring of the input character sequence that omits all characters this matcher 1058 * matches from the beginning of the string. For example: <pre> {@code 1059 * 1060 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre> 1061 * 1062 * ... returns {@code "catbab"}. 1063 */ 1064 @CheckReturnValue 1065 public String trimLeadingFrom(CharSequence sequence) { 1066 int len = sequence.length(); 1067 for (int first = 0; first < len; first++) { 1068 if (!matches(sequence.charAt(first))) { 1069 return sequence.subSequence(first, len).toString(); 1070 } 1071 } 1072 return ""; 1073 } 1074 1075 /** 1076 * Returns a substring of the input character sequence that omits all characters this matcher 1077 * matches from the end of the string. For example: <pre> {@code 1078 * 1079 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre> 1080 * 1081 * ... returns {@code "abacat"}. 1082 */ 1083 @CheckReturnValue 1084 public String trimTrailingFrom(CharSequence sequence) { 1085 int len = sequence.length(); 1086 for (int last = len - 1; last >= 0; last--) { 1087 if (!matches(sequence.charAt(last))) { 1088 return sequence.subSequence(0, last + 1).toString(); 1089 } 1090 } 1091 return ""; 1092 } 1093 1094 /** 1095 * Returns a string copy of the input character sequence, with each group of consecutive 1096 * characters that match this matcher replaced by a single replacement character. For example: 1097 * <pre> {@code 1098 * 1099 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre> 1100 * 1101 * ... returns {@code "b-p-r"}. 1102 * 1103 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1104 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1105 * character. 1106 * 1107 * @param sequence the character sequence to replace matching groups of characters in 1108 * @param replacement the character to append to the result string in place of each group of 1109 * matching characters in {@code sequence} 1110 * @return the new string 1111 */ 1112 @CheckReturnValue 1113 public String collapseFrom(CharSequence sequence, char replacement) { 1114 // This implementation avoids unnecessary allocation. 1115 int len = sequence.length(); 1116 for (int i = 0; i < len; i++) { 1117 char c = sequence.charAt(i); 1118 if (matches(c)) { 1119 if (c == replacement 1120 && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { 1121 // a no-op replacement 1122 i++; 1123 } else { 1124 StringBuilder builder = new StringBuilder(len) 1125 .append(sequence.subSequence(0, i)) 1126 .append(replacement); 1127 return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); 1128 } 1129 } 1130 } 1131 // no replacement needed 1132 return sequence.toString(); 1133 } 1134 1135 /** 1136 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 1137 * groups of matching characters at the start or end of the sequence are removed without 1138 * replacement. 1139 */ 1140 @CheckReturnValue 1141 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 1142 // This implementation avoids unnecessary allocation. 1143 int len = sequence.length(); 1144 int first; 1145 int last; 1146 1147 for (first = 0; first < len && matches(sequence.charAt(first)); first++) {} 1148 for (last = len - 1; last > first && matches(sequence.charAt(last)); last--) {} 1149 1150 return (first == 0 && last == len - 1) 1151 ? collapseFrom(sequence, replacement) 1152 : finishCollapseFrom( 1153 sequence, first, last + 1, replacement, 1154 new StringBuilder(last + 1 - first), 1155 false); 1156 } 1157 1158 private String finishCollapseFrom( 1159 CharSequence sequence, int start, int end, char replacement, 1160 StringBuilder builder, boolean inMatchingGroup) { 1161 for (int i = start; i < end; i++) { 1162 char c = sequence.charAt(i); 1163 if (matches(c)) { 1164 if (!inMatchingGroup) { 1165 builder.append(replacement); 1166 inMatchingGroup = true; 1167 } 1168 } else { 1169 builder.append(c); 1170 inMatchingGroup = false; 1171 } 1172 } 1173 return builder.toString(); 1174 } 1175 1176 /** 1177 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} 1178 * instead. 1179 */ 1180 @Deprecated 1181 @Override 1182 public boolean apply(Character character) { 1183 return matches(character); 1184 } 1185 1186 /** 1187 * Returns a string representation of this {@code CharMatcher}, such as 1188 * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 1189 */ 1190 @Override 1191 public String toString() { 1192 return description; 1193 } 1194 1195 static final String WHITESPACE_TABLE = "" 1196 + "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" 1197 + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" 1198 + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" 1199 + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; 1200 static final int WHITESPACE_MULTIPLIER = 1682554634; 1201 static final int WHITESPACE_SHIFT = Integer.numberOfLeadingZeros(WHITESPACE_TABLE.length() - 1); 1202 1203 /** 1204 * Determines whether a character is whitespace according to the latest Unicode standard, as 1205 * illustrated 1206 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 1207 * This is not the same definition used by other Java APIs. (See a 1208 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several 1209 * definitions of "whitespace"</a>.) 1210 * 1211 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up 1212 * to date. 1213 */ 1214 public static final CharMatcher WHITESPACE = new FastMatcher("WHITESPACE") { 1215 @Override 1216 public boolean matches(char c) { 1217 return WHITESPACE_TABLE.charAt((WHITESPACE_MULTIPLIER * c) >>> WHITESPACE_SHIFT) == c; 1218 } 1219 }; 1220} 1221