CharMatcher.java revision 1bdbfefe4b144c7b031a1d9242a0fa061a0ae6b5
1/*
2 * Copyright (C) 2008 Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.android.mail.common.base;
18
19import static com.google.android.mail.common.base.Preconditions.checkArgument;
20import static com.google.android.mail.common.base.Preconditions.checkNotNull;
21
22import java.util.ArrayList;
23import java.util.Arrays;
24import java.util.List;
25
26/**
27 * Determines a true or false value for any Java {@code char} value, just as
28 * {@link Predicate} does for any {@link Object}. Also offers basic text
29 * processing methods based on this function. Implementations are strongly
30 * encouraged to be side-effect-free and immutable.
31 *
32 * <p>Throughout the documentation of this class, the phrase "matching
33 * character" is used to mean "any character {@code c} for which {@code
34 * this.matches(c)} returns {@code true}".
35 *
36 * <p><b>Note:</b> This class deals only with {@code char} values; it does not
37 * understand supplementary Unicode code points in the range {@code 0x10000} to
38 * {@code 0x10FFFF}. Such logical characters are encoded into a {@code String}
39 * using surrogate pairs, and a {@code CharMatcher} treats these just as two
40 * separate characters.
41 *
42 * @author Kevin Bourrillion
43 * @since 2009.09.15 <b>tentative</b>
44 */
45public abstract class CharMatcher implements Predicate<Character> {
46
47  // Constants
48
49  // Excludes 2000-2000a, which is handled as a range
50  private static final String BREAKING_WHITESPACE_CHARS =
51      "\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000";
52
53  // Excludes 2007, which is handled as a gap in a pair of ranges
54  private static final String NON_BREAKING_WHITESPACE_CHARS =
55      "\u00a0\u180e\u202f";
56
57  /**
58   * Determines whether a character is whitespace according to the latest
59   * Unicode standard, as illustrated
60   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
61   * This is not the same definition used by other Java APIs. See a comparison
62   * of several definitions of "whitespace" at
63   * <a href="TODO">(TODO)</a>.
64   *
65   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this
66   * constant to keep it up to date.
67   */
68  public static final CharMatcher WHITESPACE =
69      anyOf(BREAKING_WHITESPACE_CHARS + NON_BREAKING_WHITESPACE_CHARS)
70          .or(inRange('\u2000', '\u200a'));
71
72  /**
73   * Determines whether a character is a breaking whitespace (that is,
74   * a whitespace which can be interpreted as a break between words
75   * for formatting purposes).  See {@link #WHITESPACE} for a discussion
76   * of that term.
77   *
78   * @since 2010.01.04 <b>tentative</b>
79   */
80  public static final CharMatcher BREAKING_WHITESPACE =
81      anyOf(BREAKING_WHITESPACE_CHARS)
82          .or(inRange('\u2000', '\u2006'))
83          .or(inRange('\u2008', '\u200a'));
84
85  /**
86   * Determines whether a character is ASCII, meaning that its code point is
87   * less than 128.
88   */
89  public static final CharMatcher ASCII = inRange('\0', '\u007f');
90
91  /**
92   * Determines whether a character is a digit according to
93   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
94   */
95  public static final CharMatcher DIGIT;
96
97  static {
98    CharMatcher digit = inRange('0', '9');
99    String zeroes =
100        "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
101            + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
102            + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
103    for (char base : zeroes.toCharArray()) {
104      digit = digit.or(inRange(base, (char) (base + 9)));
105    }
106    DIGIT = digit;
107  }
108
109  /**
110   * Determines whether a character is whitespace according to {@link
111   * Character#isWhitespace(char) Java's definition}; it is usually preferable
112   * to use {@link #WHITESPACE}. See a comparison of several definitions of
113   * "whitespace" at <a href="http://go/white+space">go/white+space</a>.
114   */
115  public static final CharMatcher JAVA_WHITESPACE
116      = inRange('\u0009', (char) 13)  // \\u000d doesn't work as a char literal
117      .or(inRange('\u001c', '\u0020'))
118      .or(is('\u1680'))
119      .or(is('\u180e'))
120      .or(inRange('\u2000', '\u2006'))
121      .or(inRange('\u2008', '\u200b'))
122      .or(inRange('\u2028', '\u2029'))
123      .or(is('\u205f'))
124      .or(is('\u3000'));
125
126  /**
127   * Determines whether a character is a digit according to {@link
128   * Character#isDigit(char) Java's definition}. If you only care to match
129   * ASCII digits, you can use {@code inRange('0', '9')}.
130   */
131  public static final CharMatcher JAVA_DIGIT = new CharMatcher() {
132    @Override public boolean matches(char c) {
133      return Character.isDigit(c);
134    }
135  };
136
137  /**
138   * Determines whether a character is a letter according to {@link
139   * Character#isLetter(char) Java's definition}. If you only care to match
140   * letters of the Latin alphabet, you can use {@code
141   * inRange('a', 'z').or(inRange('A', 'Z'))}.
142   */
143  public static final CharMatcher JAVA_LETTER = new CharMatcher() {
144    @Override public boolean matches(char c) {
145      return Character.isLetter(c);
146    }
147  };
148
149  /**
150   * Determines whether a character is a letter or digit according to {@link
151   * Character#isLetterOrDigit(char) Java's definition}.
152   */
153  public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() {
154    @Override public boolean matches(char c) {
155      return Character.isLetterOrDigit(c);
156    }
157  };
158
159  /**
160   * Determines whether a character is upper case according to {@link
161   * Character#isUpperCase(char) Java's definition}.
162   */
163  public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() {
164    @Override public boolean matches(char c) {
165      return Character.isUpperCase(c);
166    }
167  };
168
169  /**
170   * Determines whether a character is lower case according to {@link
171   * Character#isLowerCase(char) Java's definition}.
172   */
173  public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() {
174    @Override public boolean matches(char c) {
175      return Character.isLowerCase(c);
176    }
177  };
178
179  /**
180   * Determines whether a character is an ISO control character according to
181   * {@link Character#isISOControl(char)}.
182   */
183  public static final CharMatcher JAVA_ISO_CONTROL = inRange('\u0000', '\u001f')
184      .or(inRange('\u007f', '\u009f'));
185
186  /**
187   * Determines whether a character is invisible; that is, if its Unicode
188   * category is any of SPACE_SEPARATOR, LINE_SEPARATOR,
189   * PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and PRIVATE_USE according
190   * to ICU4J.
191   */
192  public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
193      .or(inRange('\u007f', '\u00a0'))
194      .or(is('\u00ad'))
195      .or(inRange('\u0600', '\u0603'))
196      .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e"))
197      .or(inRange('\u2000', '\u200f'))
198      .or(inRange('\u2028', '\u202f'))
199      .or(inRange('\u205f', '\u2064'))
200      .or(inRange('\u206a', '\u206f'))
201      .or(is('\u3000'))
202      .or(inRange('\ud800', '\uf8ff'))
203      .or(anyOf("\ufeff\ufff9\ufffa\ufffb"));
204
205  /**
206   * Determines whether a character is single-width (not double-width).  When
207   * in doubt, this matcher errs on the side of returning {@code false} (that
208   * is, it tends to assume a character is double-width).
209   *
210   * <b>Note:</b> as the reference file evolves, we will modify this constant
211   * to keep it up to date.
212   */
213  public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
214      .or(is('\u05be'))
215      .or(inRange('\u05d0', '\u05ea'))
216      .or(is('\u05f3'))
217      .or(is('\u05f4'))
218      .or(inRange('\u0600', '\u06ff'))
219      .or(inRange('\u0750', '\u077f'))
220      .or(inRange('\u0e00', '\u0e7f'))
221      .or(inRange('\u1e00', '\u20af'))
222      .or(inRange('\u2100', '\u213a'))
223      .or(inRange('\ufb50', '\ufdff'))
224      .or(inRange('\ufe70', '\ufeff'))
225      .or(inRange('\uff61', '\uffdc'));
226
227  /**
228   * Determines whether a character is whitespace according to an arbitrary definition used by
229   * {@link StringUtil} for years. Most likely you don't want to use this. See a comparison of
230   * several definitions of "whitespace" at <a href="http://goto/white space">goto/white space</a>.
231   *
232   * <p><b>To be deprecated.</b> use {@link #WHITESPACE} to switch to the Unicode definition, or
233   * create a matcher for the specific characters you want. Not deprecated yet because it is a
234   * stepping stone for getting off of many deprecated {@link StringUtil} methods.
235   */
236  @Deprecated
237  public static final CharMatcher LEGACY_WHITESPACE =
238      anyOf(" \r\n\t\u3000\u00A0\u2007\u202F").precomputed();
239
240
241  /** Matches any character. */
242  public static final CharMatcher ANY = new CharMatcher() {
243    @Override public boolean matches(char c) {
244      return true;
245    }
246
247    @Override public int indexIn(CharSequence sequence) {
248      return (sequence.length() == 0) ? -1 : 0;
249    }
250    @Override public int indexIn(CharSequence sequence, int start) {
251      int length = sequence.length();
252      Preconditions.checkPositionIndex(start, length);
253      return (start == length) ? -1 : start;
254    }
255    @Override public int lastIndexIn(CharSequence sequence) {
256      return sequence.length() - 1;
257    }
258    @Override public boolean matchesAllOf(CharSequence sequence) {
259      checkNotNull(sequence);
260      return true;
261    }
262    @Override public boolean matchesNoneOf(CharSequence sequence) {
263      return sequence.length() == 0;
264    }
265    @Override public String removeFrom(CharSequence sequence) {
266      checkNotNull(sequence);
267      return "";
268    }
269    @Override public String replaceFrom(
270        CharSequence sequence, char replacement) {
271      char[] array = new char[sequence.length()];
272      Arrays.fill(array, replacement);
273      return new String(array);
274    }
275    @Override public String replaceFrom(
276        CharSequence sequence, CharSequence replacement) {
277      StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
278      for (int i = 0; i < sequence.length(); i++) {
279        retval.append(replacement);
280      }
281      return retval.toString();
282    }
283    @Override public String collapseFrom(CharSequence sequence, char replacement) {
284      return (sequence.length() == 0) ? "" : String.valueOf(replacement);
285    }
286    @Override public String trimFrom(CharSequence sequence) {
287      checkNotNull(sequence);
288      return "";
289    }
290    @Override public int countIn(CharSequence sequence) {
291      return sequence.length();
292    }
293    @Override public CharMatcher and(CharMatcher other) {
294      return checkNotNull(other);
295    }
296    @Override public CharMatcher or(CharMatcher other) {
297      checkNotNull(other);
298      return this;
299    }
300    @Override public CharMatcher negate() {
301      return NONE;
302    }
303    @Override public CharMatcher precomputed() {
304      return this;
305    }
306  };
307
308  /** Matches no characters. */
309  public static final CharMatcher NONE = new CharMatcher() {
310    @Override public boolean matches(char c) {
311      return false;
312    }
313
314    @Override public int indexIn(CharSequence sequence) {
315      checkNotNull(sequence);
316      return -1;
317    }
318    @Override public int indexIn(CharSequence sequence, int start) {
319      int length = sequence.length();
320      Preconditions.checkPositionIndex(start, length);
321      return -1;
322    }
323    @Override public int lastIndexIn(CharSequence sequence) {
324      checkNotNull(sequence);
325      return -1;
326    }
327    @Override public boolean matchesAllOf(CharSequence sequence) {
328      return sequence.length() == 0;
329    }
330    @Override public boolean matchesNoneOf(CharSequence sequence) {
331      checkNotNull(sequence);
332      return true;
333    }
334    @Override public String removeFrom(CharSequence sequence) {
335      return sequence.toString();
336    }
337    @Override public String replaceFrom(
338        CharSequence sequence, char replacement) {
339      return sequence.toString();
340    }
341    @Override public String replaceFrom(
342        CharSequence sequence, CharSequence replacement) {
343      checkNotNull(replacement);
344      return sequence.toString();
345    }
346    @Override public String collapseFrom(
347        CharSequence sequence, char replacement) {
348      return sequence.toString();
349    }
350    @Override public String trimFrom(CharSequence sequence) {
351      return sequence.toString();
352    }
353    @Override public int countIn(CharSequence sequence) {
354      checkNotNull(sequence);
355      return 0;
356    }
357    @Override public CharMatcher and(CharMatcher other) {
358      checkNotNull(other);
359      return this;
360    }
361    @Override public CharMatcher or(CharMatcher other) {
362      return checkNotNull(other);
363    }
364    @Override public CharMatcher negate() {
365      return ANY;
366    }
367    @Override protected void setBits(LookupTable table) {
368    }
369    @Override public CharMatcher precomputed() {
370      return this;
371    }
372  };
373
374  // Static factories
375
376  /**
377   * Returns a {@code char} matcher that matches only one specified character.
378   */
379  public static CharMatcher is(final char match) {
380    return new CharMatcher() {
381      @Override public boolean matches(char c) {
382        return c == match;
383      }
384
385      @Override public String replaceFrom(
386          CharSequence sequence, char replacement) {
387        return sequence.toString().replace(match, replacement);
388      }
389      @Override public CharMatcher and(CharMatcher other) {
390        return other.matches(match) ? this : NONE;
391      }
392      @Override public CharMatcher or(CharMatcher other) {
393        return other.matches(match) ? other : super.or(other);
394      }
395      @Override public CharMatcher negate() {
396        return isNot(match);
397      }
398      @Override protected void setBits(LookupTable table) {
399        table.set(match);
400      }
401      @Override public CharMatcher precomputed() {
402        return this;
403      }
404    };
405  }
406
407  /**
408   * Returns a {@code char} matcher that matches any character except the one
409   * specified.
410   *
411   * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
412   */
413  public static CharMatcher isNot(final char match) {
414    return new CharMatcher() {
415      @Override public boolean matches(char c) {
416        return c != match;
417      }
418
419      @Override public CharMatcher and(CharMatcher other) {
420        return other.matches(match) ? super.and(other) : other;
421      }
422      @Override public CharMatcher or(CharMatcher other) {
423        return other.matches(match) ? ANY : this;
424      }
425      @Override public CharMatcher negate() {
426        return is(match);
427      }
428    };
429  }
430
431  /**
432   * Returns a {@code char} matcher that matches any character present in the
433   * given character sequence.
434   */
435  public static CharMatcher anyOf(final CharSequence sequence) {
436    switch (sequence.length()) {
437      case 0:
438        return NONE;
439      case 1:
440        return is(sequence.charAt(0));
441      case 2:
442        final char match1 = sequence.charAt(0);
443        final char match2 = sequence.charAt(1);
444        return new CharMatcher() {
445          @Override public boolean matches(char c) {
446            return c == match1 || c == match2;
447          }
448          @Override protected void setBits(LookupTable table) {
449            table.set(match1);
450            table.set(match2);
451          }
452          @Override public CharMatcher precomputed() {
453            return this;
454          }
455        };
456    }
457
458    final char[] chars = sequence.toString().toCharArray();
459    Arrays.sort(chars); // not worth collapsing duplicates
460
461    return new CharMatcher() {
462      @Override public boolean matches(char c) {
463        return Arrays.binarySearch(chars, c) >= 0;
464      }
465      @Override protected void setBits(LookupTable table) {
466        for (char c : chars) {
467          table.set(c);
468        }
469      }
470    };
471  }
472
473  /**
474   * Returns a {@code char} matcher that matches any character not present in
475   * the given character sequence.
476   */
477  public static CharMatcher noneOf(CharSequence sequence) {
478    return anyOf(sequence).negate();
479  }
480
481  /**
482   * Returns a {@code char} matcher that matches any character in a given range
483   * (both endpoints are inclusive). For example, to match any lowercase letter
484   * of the English alphabet, use {@code CharMatcher.inRange('a', 'z')}.
485   *
486   * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
487   */
488  public static CharMatcher inRange(
489      final char startInclusive, final char endInclusive) {
490    checkArgument(endInclusive >= startInclusive);
491    return new CharMatcher() {
492      @Override public boolean matches(char c) {
493        return startInclusive <= c && c <= endInclusive;
494      }
495      @Override protected void setBits(LookupTable table) {
496        char c = startInclusive;
497        while (true) {
498          table.set(c);
499          if (c++ == endInclusive) {
500            break;
501          }
502        }
503      }
504      @Override public CharMatcher precomputed() {
505        return this;
506      }
507    };
508  }
509
510  /**
511   * Returns a matcher with identical behavior to the given {@link
512   * Character}-based predicate, but which operates on primitive {@code char}
513   * instances instead.
514   */
515  public static CharMatcher forPredicate(
516      final Predicate<? super Character> predicate) {
517    checkNotNull(predicate);
518    if (predicate instanceof CharMatcher) {
519      return (CharMatcher) predicate;
520    }
521    return new CharMatcher() {
522      @Override public boolean matches(char c) {
523        return predicate.apply(c);
524      }
525      @Override public boolean apply(Character character) {
526        return predicate.apply(checkNotNull(character));
527      }
528    };
529  }
530
531  // Abstract methods
532
533  /** Determines a true or false value for the given character. */
534  public abstract boolean matches(char c);
535
536  // Non-static factories
537
538  /**
539   * Returns a matcher that matches any character not matched by this matcher.
540   */
541  public CharMatcher negate() {
542    final CharMatcher original = this;
543    return new CharMatcher() {
544      @Override public boolean matches(char c) {
545        return !original.matches(c);
546      }
547
548      @Override public boolean matchesAllOf(CharSequence sequence) {
549        return original.matchesNoneOf(sequence);
550      }
551      @Override public boolean matchesNoneOf(CharSequence sequence) {
552        return original.matchesAllOf(sequence);
553      }
554      @Override public int countIn(CharSequence sequence) {
555        return sequence.length() - original.countIn(sequence);
556      }
557      @Override public CharMatcher negate() {
558        return original;
559      }
560    };
561  }
562
563  /**
564   * Returns a matcher that matches any character matched by both this matcher
565   * and {@code other}.
566   */
567  public CharMatcher and(CharMatcher other) {
568    return new And(Arrays.asList(this, checkNotNull(other)));
569  }
570
571  private static class And extends CharMatcher {
572    List<CharMatcher> components;
573
574    And(List<CharMatcher> components) {
575      this.components = components; // Skip defensive copy (private)
576    }
577
578    @Override public boolean matches(char c) {
579      for (CharMatcher matcher : components) {
580        if (!matcher.matches(c)) {
581          return false;
582        }
583      }
584      return true;
585    }
586
587    @Override public CharMatcher and(CharMatcher other) {
588      List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
589      newComponents.add(checkNotNull(other));
590      return new And(newComponents);
591    }
592  }
593
594  /**
595   * Returns a matcher that matches any character matched by either this matcher
596   * or {@code other}.
597   */
598  public CharMatcher or(CharMatcher other) {
599    return new Or(Arrays.asList(this, checkNotNull(other)));
600  }
601
602  private static class Or extends CharMatcher {
603    List<CharMatcher> components;
604
605    Or(List<CharMatcher> components) {
606      this.components = components; // Skip defensive copy (private)
607    }
608
609    @Override public boolean matches(char c) {
610      for (CharMatcher matcher : components) {
611        if (matcher.matches(c)) {
612          return true;
613        }
614      }
615      return false;
616    }
617
618    @Override public CharMatcher or(CharMatcher other) {
619      List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
620      newComponents.add(checkNotNull(other));
621      return new Or(newComponents);
622    }
623
624    @Override protected void setBits(LookupTable table) {
625      for (CharMatcher matcher : components) {
626        matcher.setBits(table);
627      }
628    }
629  }
630
631  /**
632   * Returns a {@code char} matcher functionally equivalent to this one, but
633   * which may be faster to query than the original; your mileage may vary.
634   * Precomputation takes time and is likely to be worthwhile only if the
635   * precomputed matcher is queried many thousands of times.
636   *
637   * <p>This method has no effect (returns {@code this}) when called in GWT:
638   * it's unclear whether a precomputed matcher is faster, but it certainly
639   * consumes more memory, which doesn't seem like a worthwhile tradeoff in a
640   * browser.
641   */
642  public CharMatcher precomputed() {
643    return Platform.precomputeCharMatcher(this);
644  }
645
646  /**
647   * This is the actual implementation of {@link #precomputed}, but we bounce
648   * calls through a method on {@link Platform} so that we can have different
649   * behavior in GWT.
650   *
651   * <p>The default precomputation is to cache the configuration of the original
652   * matcher in an eight-kilobyte bit array. In some situations this produces a
653   * matcher which is faster to query than the original.
654   *
655   * <p>The default implementation creates a new bit array and passes it to
656   * {@link #setBits(LookupTable)}.
657   */
658  CharMatcher precomputedInternal() {
659    final LookupTable table = new LookupTable();
660    setBits(table);
661
662    return new CharMatcher() {
663      @Override public boolean matches(char c) {
664        return table.get(c);
665      }
666
667      // TODO: make methods like negate() smart
668
669      @Override public CharMatcher precomputed() {
670        return this;
671      }
672    };
673  }
674
675  /**
676   * For use by implementors; sets the bit corresponding to each character ('\0'
677   * to '{@literal \}uFFFF') that matches this matcher in the given bit array,
678   * leaving all other bits untouched.
679   *
680   * <p>The default implementation loops over every possible character value,
681   * invoking {@link #matches} for each one.
682   */
683  protected void setBits(LookupTable table) {
684    char c = Character.MIN_VALUE;
685    while (true) {
686      if (matches(c)) {
687        table.set(c);
688      }
689      if (c++ == Character.MAX_VALUE) {
690        break;
691      }
692    }
693  }
694
695  /**
696   * A bit array with one bit per {@code char} value, used by {@link
697   * CharMatcher#precomputed}.
698   *
699   * <p>TODO: possibly share a common BitArray class with BloomFilter
700   * and others... a simpler java.util.BitSet.
701   */
702  protected static class LookupTable {
703    int[] data = new int[2048];
704
705    void set(char index) {
706      data[index >> 5] |= (1 << index);
707    }
708    boolean get(char index) {
709      return (data[index >> 5] & (1 << index)) != 0;
710    }
711  }
712
713  // Text processing routines
714
715  /**
716   * Returns {@code true} if a character sequence contains only matching
717   * characters.
718   *
719   * <p>The default implementation iterates over the sequence, invoking {@link
720   * #matches} for each character, until this returns {@code false} or the end
721   * is reached.
722   *
723   * @param sequence the character sequence to examine, possibly empty
724   * @return {@code true} if this matcher matches every character in the
725   *     sequence, including when the sequence is empty
726   */
727  public boolean matchesAllOf(CharSequence sequence) {
728    for (int i = sequence.length() - 1; i >= 0; i--) {
729      if (!matches(sequence.charAt(i))) {
730        return false;
731      }
732    }
733    return true;
734  }
735
736  /**
737   * Returns {@code true} if a character sequence contains no matching
738   * characters.
739   *
740   * <p>The default implementation iterates over the sequence, invoking {@link
741   * #matches} for each character, until this returns {@code false} or the end is
742   * reached.
743   *
744   * @param sequence the character sequence to examine, possibly empty
745   * @return {@code true} if this matcher matches every character in the
746   *     sequence, including when the sequence is empty
747   */
748  public boolean matchesNoneOf(CharSequence sequence) {
749    return indexIn(sequence) == -1;
750  }
751
752  // TODO: perhaps add matchesAnyOf()
753
754  /**
755   * Returns the index of the first matching character in a character sequence,
756   * or {@code -1} if no matching character is present.
757   *
758   * <p>The default implementation iterates over the sequence in forward order
759   * calling {@link #matches} for each character.
760   *
761   * @param sequence the character sequence to examine from the beginning
762   * @return an index, or {@code -1} if no character matches
763   */
764  public int indexIn(CharSequence sequence) {
765    int length = sequence.length();
766    for (int i = 0; i < length; i++) {
767      if (matches(sequence.charAt(i))) {
768        return i;
769      }
770    }
771    return -1;
772  }
773
774  /**
775   * Returns the index of the first matching character in a character sequence,
776   * starting from a given position, or {@code -1} if no character matches after
777   * that position.
778   *
779   * <p>The default implementation iterates over the sequence in forward order,
780   * beginning at {@code start}, calling {@link #matches} for each character.
781   *
782   * @param sequence the character sequence to examine
783   * @param start the first index to examine; must be nonnegative and no
784   *     greater than {@code sequence.length()}
785   * @return the index of the first matching character, guaranteed to be no less
786   *     than {@code start}, or {@code -1} if no character matches
787   * @throws IndexOutOfBoundsException if start is negative or greater than
788   *     {@code sequence.length()}
789   */
790  public int indexIn(CharSequence sequence, int start) {
791    int length = sequence.length();
792    Preconditions.checkPositionIndex(start, length);
793    for (int i = start; i < length; i++) {
794      if (matches(sequence.charAt(i))) {
795        return i;
796      }
797    }
798    return -1;
799  }
800
801  /**
802   * Returns the index of the last matching character in a character sequence,
803   * or {@code -1} if no matching character is present.
804   *
805   * <p>The default implementation iterates over the sequence in reverse order
806   * calling {@link #matches} for each character.
807   *
808   * @param sequence the character sequence to examine from the end
809   * @return an index, or {@code -1} if no character matches
810   */
811  public int lastIndexIn(CharSequence sequence) {
812    for (int i = sequence.length() - 1; i >= 0; i--) {
813      if (matches(sequence.charAt(i))) {
814        return i;
815      }
816    }
817    return -1;
818  }
819
820  /**
821   * Returns the number of matching characters found in a character sequence.
822   */
823  public int countIn(CharSequence sequence) {
824    int count = 0;
825    for (int i = 0; i < sequence.length(); i++) {
826      if (matches(sequence.charAt(i))) {
827        count++;
828      }
829    }
830    return count;
831  }
832
833  /**
834   * Returns a string containing all non-matching characters of a character
835   * sequence, in order. For example: <pre>   {@code
836   *
837   *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
838   *
839   * ... returns {@code "bzr"}.
840   */
841  public String removeFrom(CharSequence sequence) {
842    String string = sequence.toString();
843    int pos = indexIn(string);
844    if (pos == -1) {
845      return string;
846    }
847
848    char[] chars = string.toCharArray();
849    int spread = 1;
850
851    // This unusual loop comes from extensive benchmarking
852    OUT:
853    while (true) {
854      pos++;
855      while (true) {
856        if (pos == chars.length) {
857          break OUT;
858        }
859        if (matches(chars[pos])) {
860          break;
861        }
862        chars[pos - spread] = chars[pos];
863        pos++;
864      }
865      spread++;
866    }
867    return new String(chars, 0, pos - spread);
868  }
869
870  /**
871   * Returns a string containing all matching characters of a character
872   * sequence, in order. For example: <pre>   {@code
873   *
874   *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
875   *
876   * ... returns {@code "aaa"}.
877   */
878  public String retainFrom(CharSequence sequence) {
879    return negate().removeFrom(sequence);
880  }
881
882  /**
883   * Returns a string copy of the input character sequence, with each character
884   * that matches this matcher replaced by a given replacement character. For
885   * example: <pre>   {@code
886   *
887   *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
888   *
889   * ... returns {@code "rodor"}.
890   *
891   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find
892   * the first matching character, then iterates the remainder of the sequence
893   * calling {@link #matches(char)} for each character.
894   *
895   * @param sequence the character sequence to replace matching characters in
896   * @param replacement the character to append to the result string in place of
897   *     each matching character in {@code sequence}
898   * @return the new string
899   */
900  public String replaceFrom(CharSequence sequence, char replacement) {
901    String string = sequence.toString();
902    int pos = indexIn(string);
903    if (pos == -1) {
904      return string;
905    }
906    char[] chars = string.toCharArray();
907    chars[pos] = replacement;
908    for (int i = pos + 1; i < chars.length; i++) {
909      if (matches(chars[i])) {
910        chars[i] = replacement;
911      }
912    }
913    return new String(chars);
914  }
915
916  /**
917   * Returns a string copy of the input character sequence, with each character
918   * that matches this matcher replaced by a given replacement sequence. For
919   * example: <pre>   {@code
920   *
921   *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
922   *
923   * ... returns {@code "yoohoo"}.
924   *
925   * <p><b>Note:</b> If the replacement is a fixed string with only one character,
926   * you are better off calling {@link #replaceFrom(CharSequence, char)} directly.
927   *
928   * @param sequence the character sequence to replace matching characters in
929   * @param replacement the characters to append to the result string in place
930   *     of each matching character in {@code sequence}
931   * @return the new string
932   */
933  public String replaceFrom(CharSequence sequence, CharSequence replacement) {
934    int replacementLen = replacement.length();
935    if (replacementLen == 0) {
936      return removeFrom(sequence);
937    }
938    if (replacementLen == 1) {
939      return replaceFrom(sequence, replacement.charAt(0));
940    }
941
942    String string = sequence.toString();
943    int pos = indexIn(string);
944    if (pos == -1) {
945      return string;
946    }
947
948    int len = string.length();
949    StringBuilder buf = new StringBuilder((int) (len * 1.5) + 16);
950
951    int oldpos = 0;
952    do {
953      buf.append(string, oldpos, pos);
954      buf.append(replacement);
955      oldpos = pos + 1;
956      pos = indexIn(string, oldpos);
957    } while (pos != -1);
958
959    buf.append(string, oldpos, len);
960    return buf.toString();
961  }
962
963  /**
964   * Returns a substring of the input character sequence that omits all
965   * characters this matcher matches from the beginning and from the end of the
966   * string. For example: <pre> {@code
967   *
968   *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
969   *
970   * ... returns {@code "cat"}.
971   *
972   * <p>Note that<pre>   {@code
973   *
974   *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
975   *
976   * ... is equivalent to {@link String#trim()}.
977   */
978  public String trimFrom(CharSequence sequence) {
979    int len = sequence.length();
980    int first;
981    int last;
982
983    for (first = 0; first < len; first++) {
984      if (!matches(sequence.charAt(first))) {
985        break;
986      }
987    }
988    for (last = len - 1; last > first; last--) {
989      if (!matches(sequence.charAt(last))) {
990        break;
991      }
992    }
993
994    return sequence.subSequence(first, last + 1).toString();
995  }
996
997  /**
998   * Returns a substring of the input character sequence that omits all
999   * characters this matcher matches from the beginning of the
1000   * string. For example: <pre> {@code
1001   *
1002   *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1003   *
1004   * ... returns {@code "catbab"}.
1005   */
1006  public String trimLeadingFrom(CharSequence sequence) {
1007    int len = sequence.length();
1008    int first;
1009
1010    for (first = 0; first < len; first++) {
1011      if (!matches(sequence.charAt(first))) {
1012        break;
1013      }
1014    }
1015
1016    return sequence.subSequence(first, len).toString();
1017  }
1018
1019  /**
1020   * Returns a substring of the input character sequence that omits all
1021   * characters this matcher matches from the end of the
1022   * string. For example: <pre> {@code
1023   *
1024   *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1025   *
1026   * ... returns {@code "abacat"}.
1027   */
1028  public String trimTrailingFrom(CharSequence sequence) {
1029    int len = sequence.length();
1030    int last;
1031
1032    for (last = len - 1; last >= 0; last--) {
1033      if (!matches(sequence.charAt(last))) {
1034        break;
1035      }
1036    }
1037
1038    return sequence.subSequence(0, last + 1).toString();
1039  }
1040
1041  /**
1042   * Returns a string copy of the input character sequence, with each group of
1043   * consecutive characters that match this matcher replaced by a single
1044   * replacement character. For example: <pre>   {@code
1045   *
1046   *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1047   *
1048   * ... returns {@code "b-p-r"}.
1049   *
1050   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find
1051   * the first matching character, then iterates the remainder of the sequence
1052   * calling {@link #matches(char)} for each character.
1053   *
1054   * @param sequence the character sequence to replace matching groups of
1055   *     characters in
1056   * @param replacement the character to append to the result string in place of
1057   *     each group of matching characters in {@code sequence}
1058   * @return the new string
1059   */
1060  public String collapseFrom(CharSequence sequence, char replacement) {
1061    int first = indexIn(sequence);
1062    if (first == -1) {
1063      return sequence.toString();
1064    }
1065
1066    // TODO: this implementation can probably be made faster.
1067
1068    StringBuilder builder = new StringBuilder(sequence.length())
1069        .append(sequence.subSequence(0, first))
1070        .append(replacement);
1071    boolean in = true;
1072    for (int i = first + 1; i < sequence.length(); i++) {
1073      char c = sequence.charAt(i);
1074      if (apply(c)) {
1075        if (!in) {
1076          builder.append(replacement);
1077          in = true;
1078        }
1079      } else {
1080        builder.append(c);
1081        in = false;
1082      }
1083    }
1084    return builder.toString();
1085  }
1086
1087  /**
1088   * Collapses groups of matching characters exactly as {@link #collapseFrom}
1089   * does, except that groups of matching characters at the start or end of the
1090   * sequence are removed without replacement.
1091   */
1092  public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1093    int first = negate().indexIn(sequence);
1094    if (first == -1) {
1095      return ""; // everything matches. nothing's left.
1096    }
1097    StringBuilder builder = new StringBuilder(sequence.length());
1098    boolean inMatchingGroup = false;
1099    for (int i = first; i < sequence.length(); i++) {
1100      char c = sequence.charAt(i);
1101      if (apply(c)) {
1102        inMatchingGroup = true;
1103      } else {
1104        if (inMatchingGroup) {
1105          builder.append(replacement);
1106          inMatchingGroup = false;
1107        }
1108        builder.append(c);
1109      }
1110    }
1111    return builder.toString();
1112  }
1113
1114  // Predicate interface
1115
1116  /**
1117   * Returns {@code true} if this matcher matches the given character.
1118   *
1119   * @throws NullPointerException if {@code character} is null
1120   */
1121  /*@Override*/ public boolean apply(Character character) {
1122    return matches(character);
1123  }
1124}