1/**
2 * Copyright (c) 2000, Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.android.mail.common.base;
18
19import static com.google.android.mail.common.base.Preconditions.checkArgument;
20
21import com.google.common.base.Joiner;
22import com.google.common.base.Joiner.MapJoiner;
23
24import java.io.IOException;
25import java.io.InputStream;
26import java.io.StringWriter;
27import java.util.ArrayList;
28import java.util.Collection;
29import java.util.Collections;
30import java.util.HashMap;
31import java.util.HashSet;
32import java.util.Iterator;
33import java.util.LinkedHashMap;
34import java.util.LinkedList;
35import java.util.List;
36import java.util.Map;
37import java.util.Set;
38import java.util.StringTokenizer;
39import java.util.regex.Matcher;
40import java.util.regex.Pattern;
41
42/**
43 * Static utility methods and constants pertaining to {@code String} or {@code
44 * CharSequence} instances.
45 */
46public final class StringUtil {
  /** Private constructor: this class only hosts static members and is not meant to be instantiated. */
  private StringUtil() {} // COV_NF_LINE
48
  /**
   * A completely arbitrary selection of eight whitespace characters: space,
   * carriage return, line feed, tab, U+3000 (ideographic space), U+00A0
   * (no-break space), U+2007 (figure space) and U+202F (narrow no-break
   * space). See <a href="http://go/white+space">this spreadsheet</a> for more
   * details about whitespace characters.
   *
   * @deprecated Rewrite your code to use {@link CharMatcher#WHITESPACE}, or
   *     consider the precise set of characters you want to match and construct
   *     the right explicit {@link CharMatcher} or {@link String} for your own
   *     purposes.
   */
  @Deprecated
  public static final String WHITE_SPACES = " \r\n\t\u3000\u00A0\u2007\u202F";

  /**
   * A string containing the carriage return and linefeed characters, in that
   * order ({@code "\r\n"}).
   */
  public static final String LINE_BREAKS = "\r\n";
64
65  /**
66   * Old location of {@link Strings#isNullOrEmpty}; this method will be
67   * deprecated soon.
68   */
69  public static boolean isEmpty(String string) {
70    return Strings.isNullOrEmpty(string);
71  }
72
73  /**
74   * Returns {@code true} if the given string is null, empty, or comprises only
75   * whitespace characters, as defined by {@link CharMatcher#WHITESPACE}.
76   *
77   * <p><b>Warning:</b> there are many competing definitions of "whitespace";
78   * please see <a href="http://go/white+space">this spreadsheet</a> for
79   * details.
80   *
81   * @param string the string reference to check
82   * @return {@code true} if {@code string} is null, empty, or consists of
83   *     whitespace characters only
84   */
85  public static boolean isEmptyOrWhitespace(String string) {
86    return string == null || CharMatcher.WHITESPACE.matchesAllOf(string);
87  }
88
89  /**
90   * Old location of {@link Strings#nullToEmpty}; this method will be
91   * deprecated soon.
92   */
93  public static String makeSafe(String string) {
94    return Strings.nullToEmpty(string);
95  }
96
97  /**
98   * Old location of {@link Strings#emptyToNull}; this method will be
99   * deprecated soon.
100   */
101  public static String toNullIfEmpty(String string) {
102    return Strings.emptyToNull(string);
103  }
104
105  /**
106   * Returns the given string if it is nonempty and contains at least one
107   * non-whitespace character; {@code null} otherwise. See comment in {@link
108   * #isEmptyOrWhitespace} on the definition of whitespace.
109   *
110   * @param string the string to test and possibly return
111   * @return {@code null} if {@code string} is null, empty, or contains only
112   *     whitespace characters; {@code string} itself otherwise
113   */
114  public static String toNullIfEmptyOrWhitespace(
115      String string) {
116    return isEmptyOrWhitespace(string) ? null : string;
117  }
118
119  /**
120   * Old location of {@link Strings#repeat}; this method will be deprecated
121   * soon.
122   */
123  public static String repeat(String string, int count) {
124    return Strings.repeat(string, count);
125  }
126
127  /**
128   * Return the first index in the string of any of the specified characters,
129   * starting at a given index, or {@code -1} if none of the characters is
130   * present.
131   *
132   * @param string the non-null character sequence to look in
133   * @param chars a non-null character sequence containing the set of characters
134   *     to look for. If empty, this method will find no matches and return
135   *     {@code -1}
136   * @param fromIndex the index of the first character to examine in the input
137   *     string. If negative, the entire string will be searched. If greater
138   *     than or equal to the string length, no characters will be searched and
139   *     {@code -1} will be returned.
140   * @return the index of the first match, or {@code -1} if no match was found.
141   *     Guaranteed to be either {@code -1} or a number greater than or equal to
142   *     {@code fromIndex}
143   * @throws NullPointerException if any argument is null
144   */
145  // author: pault
146  public static int indexOfChars(
147      CharSequence string, CharSequence chars, int fromIndex) {
148    if (fromIndex >= string.length()) {
149      return -1;
150    }
151
152    /*
153     * Prepare lookup structures for the characters. TODO(pault): This loop
154     * could be factored into another method to allow caching of the resulting
155     * struct if a use-case of very large character sets exists.
156     */
157    Set<Character> charSet = Collections.emptySet();
158    boolean[] charArray = new boolean[128];
159    for (int i = 0; i < chars.length(); i++) {
160      char c = chars.charAt(i);
161      if (c < 128) {
162        charArray[c] = true;
163      } else {
164        if (charSet.isEmpty()) {
165          charSet = new HashSet<Character>();
166        }
167        charSet.add(c);
168      }
169    }
170
171    // Scan the string for matches
172    for (int i = Math.max(fromIndex, 0); i < string.length(); i++) {
173      char c = string.charAt(i);
174      if (c < 128) {
175        if (charArray[c]) {
176          return i;
177        }
178      } else if (charSet.contains(c)) {
179        return i;
180      }
181    }
182    return -1;
183  }
184
185/*
186 * -------------------------------------------------------------------
187 * This marks the end of the code that has been written or rewritten
188 * in 2008 to the quality standards of the Java core libraries group.
189 * Code below this point is still awaiting cleanup (you can help!).
190 * See http://wiki/Nonconf/JavaCoreLibrariesStandards.
191 * -------------------------------------------------------------------
192 */
193
194
195  /**
196   * @param str the string to split.  Must not be null.
197   * @param delims the delimiter characters. Each character in the
198   *        string is individually treated as a delimiter.
199   * @return an array of tokens. Will not return null. Individual tokens
200   *        do not have leading/trailing whitespace removed.
201   * @deprecated see the detailed instructions under
202   *     {@link #split(String, String, boolean)}
203   */
204  @Deprecated
205  public static String[] split(String str, String delims) {
206    return split(str, delims, false);
207  }
208
209  /**
210   * This method is deprecated because it is too inflexible, providing
211   * only a very specific set of behaviors that almost never matches exactly
212   * what you intend. Prefer using a {@link Splitter}, which is more flexible
213   * and consistent in the way it handles trimming and empty tokens.
214   *
215   * <ul>
216   * <li>Create a {@link Splitter} using {@link Splitter#on(CharMatcher)} such
217   *     as {@code Splitter.on(CharMatcher.anyOf(delims))}.
218   * <li><i>If</i> you need whitespace trimmed from the ends of each segment,
219   *     adding {@code .trimResults()} to your splitter definition should work
220   *     in most cases. To match the exact behavior of this method, use
221   *     {@code .trimResults(CharMatcher.inRange('\0', ' '))}.
222   * <li>This method silently ignores empty tokens in the input, but allows
223   *     empty tokens to appear in the output if {@code trimTokens} is
224   *     {@code true}. Adding {@code .omitEmptyStrings()} to your splitter
225   *     definition will filter empty tokens out but will do so <i>after</i>
226   *     having performed trimming. If you absolutely require this method's
227   *     behavior in this respect, Splitter is not able to match it.
228   * <li>If you need the result as an array, use {@link
229   *     com.google.common.collect.Iterables#toArray(Iterable, Class)} on the
230   *     {@code Iterable<String>} returned by {@link Splitter#split}.
231   * </ul>
232   *
233   * @param str the string to split.  Must not be null.
234   * @param delims the delimiter characters. Each character in the string
235   *        is individually treated as a delimiter.
236   * @param trimTokens if true, leading/trailing whitespace is removed
237   *        from the tokens
238   * @return an array of tokens. Will not return null.
239   * @deprecated
240   */
241  @Deprecated
242  public static String[] split(
243      String str, String delims, boolean trimTokens) {
244    StringTokenizer tokenizer = new StringTokenizer(str, delims);
245    int n = tokenizer.countTokens();
246    String[] list = new String[n];
247    for (int i = 0; i < n; i++) {
248      if (trimTokens) {
249        list[i] = tokenizer.nextToken().trim();
250      } else {
251        list[i] = tokenizer.nextToken();
252      }
253    }
254    return list;
255  }
256
257  /**
258   * Trim characters from only the beginning of a string.
259   * This is a convenience method, it simply calls trimStart(s, null).
260   *
261   * @param s String to be trimmed
262   * @return String with whitespace characters removed from the beginning
263   */
264  public static String trimStart(String s) {
265    return trimStart(s, null);
266  }
267
268  /**
269   * Trim characters from only the beginning of a string.
270   * This method will remove all whitespace characters
271   * (defined by Character.isWhitespace(char), in addition to the characters
272   * provided, from the end of the provided string.
273   *
274   * @param s String to be trimmed
275   * @param extraChars Characters in addition to whitespace characters that
276   *                   should be trimmed.  May be null.
277   * @return String with whitespace and characters in extraChars removed
278   *                   from the beginning
279   */
280  public static String trimStart(String s, String extraChars) {
281    int trimCount = 0;
282    while (trimCount < s.length()) {
283      char ch = s.charAt(trimCount);
284      if (Character.isWhitespace(ch)
285        || (extraChars != null && extraChars.indexOf(ch) >= 0)) {
286        trimCount++;
287      } else {
288        break;
289      }
290    }
291
292    if (trimCount == 0) {
293      return s;
294    }
295    return s.substring(trimCount);
296  }
297
298  /**
299   * Trim characters from only the end of a string.
300   * This is a convenience method, it simply calls trimEnd(s, null).
301   *
302   * @param s String to be trimmed
303   * @return String with whitespace characters removed from the end
304   */
305  public static String trimEnd(String s) {
306    return trimEnd(s, null);
307  }
308
309  /**
310   * Trim characters from only the end of a string.
311   * This method will remove all whitespace characters
312   * (defined by Character.isWhitespace(char), in addition to the characters
313   * provided, from the end of the provided string.
314   *
315   * @param s String to be trimmed
316   * @param extraChars Characters in addition to whitespace characters that
317   *                   should be trimmed.  May be null.
318   * @return String with whitespace and characters in extraChars removed
319   *                   from the end
320   */
321  public static String trimEnd(String s, String extraChars) {
322    int trimCount = 0;
323    while (trimCount < s.length()) {
324      char ch = s.charAt(s.length() - trimCount - 1);
325      if (Character.isWhitespace(ch)
326        || (extraChars != null && extraChars.indexOf(ch) >= 0)) {
327        trimCount++;
328      } else {
329        break;
330      }
331    }
332
333    if (trimCount == 0) {
334      return s;
335    }
336    return s.substring(0, s.length() - trimCount);
337  }
338
339  /**
340   * @param str the string to split.  Must not be null.
341   * @param delims the delimiter characters. Each character in the
342   *        string is individually treated as a delimiter.
343   * @return an array of tokens. Will not return null. Leading/trailing
344   *        whitespace is removed from the tokens.
345   * @deprecated see the detailed instructions under
346   *     {@link #split(String, String, boolean)}
347   */
348  @Deprecated
349  public static String[] splitAndTrim(String str, String delims) {
350    return split(str, delims, true);
351  }
352
353  /** Parse comma-separated list of ints and return as array. */
354  public static int[] splitInts(String str) throws IllegalArgumentException {
355    StringTokenizer tokenizer = new StringTokenizer(str, ",");
356    int n = tokenizer.countTokens();
357    int[] list = new int[n];
358    for (int i = 0; i < n; i++) {
359      String token = tokenizer.nextToken();
360      list[i] = Integer.parseInt(token);
361    }
362    return list;
363  }
364
365  /** Parse comma-separated list of longs and return as array. */
366  public static long[] splitLongs(String str) throws IllegalArgumentException {
367    StringTokenizer tokenizer = new StringTokenizer(str, ",");
368    int n = tokenizer.countTokens();
369    long[] list = new long[n];
370    for (int i = 0; i < n; i++) {
371      String token = tokenizer.nextToken();
372      list[i] = Long.parseLong(token);
373    }
374    return list;
375  }
376
377  /** This replaces the occurrences of 'what' in 'str' with 'with'
378   *
379   * @param str the string to process
380   * @param what to replace
381   * @param with replace with this
382   * @return String str where 'what' was replaced with 'with'
383   *
384   * @deprecated Please use {@link String#replace(CharSequence, CharSequence)}.
385   */
386  @Deprecated
387  public static String replace(
388      String str, CharSequence what, CharSequence with) {
389    // Have to check this argument, for compatibility with the old impl.
390    // For the record, String.replace() is capable of handling an empty target
391    // string... but it does something kind of weird in that case.
392    checkArgument(what.length() > 0);
393    return str.replace(what, with);
394  }
395
  /**
   * Splitter used by {@link #fixedWidth(String, int)}: splits on {@code '\n'}
   * only and is configured with {@code omitEmptyStrings()}.
   */
  private static final Splitter NEWLINE_SPLITTER =
      Splitter.on('\n').omitEmptyStrings();
398
399  /**
400   * Reformats the given string to a fixed width by inserting carriage returns
401   * and trimming unnecessary whitespace. See
402   * {@link #fixedWidth(String[], int)} for details. The {@code str} argument
403   * to this method will be split on newline characters ({@code '\n'}) only
404   * (regardless of platform).  An array of resulting non-empty strings is
405   * then passed to {@link #fixedWidth(String[], int)} as the {@code lines}
406   * parameter.
407   *
408   * @param str the string to format
409   * @param width the fixed width (in characters)
410   */
411  public static String fixedWidth(String str, int width) {
412    List<String> lines = new ArrayList<String>();
413
414    for (String line : NEWLINE_SPLITTER.split(str)) {
415      lines.add(line);
416    }
417
418    String[] lineArray = lines.toArray(new String[0]);
419    return fixedWidth(lineArray, width);
420  }
421
422  /**
423   * Reformats the given array of lines to a fixed width by inserting
424   * newlines and trimming unnecessary whitespace.  This uses simple
425   * whitespace-based splitting, not sophisticated internationalized
426   * line breaking.  Newlines within a line are treated like any other
427   * whitespace.  Lines which are already short enough will be passed
428   * through unmodified.
429   *
430   * <p>Only breaking whitespace characters (those which match
431   * {@link CharMatcher#BREAKING_WHITESPACE}) are treated as whitespace by
432   * this method. Non-breaking whitespace characters will be considered as
433   * ordinary characters which are connected to any other adjacent
434   * non-whitespace characters, and will therefore appear in the returned
435   * string in their original context.
436   *
437   * @param lines array of lines to format
438   * @param width the fixed width (in characters)
439   */
440  public static String fixedWidth(String[] lines, int width) {
441    List<String> formattedLines = new ArrayList<String>();
442
443    for (String line : lines) {
444      formattedLines.add(formatLineToFixedWidth(line, width));
445    }
446
447    return Joiner.on('\n').join(formattedLines);
448  }
449
  /**
   * Splitter used by {@link #formatLineToFixedWidth} to break a line into
   * words: splits on {@link CharMatcher#BREAKING_WHITESPACE} and is
   * configured with {@code omitEmptyStrings()}.
   */
  private static final Splitter TO_WORDS =
      Splitter.on(CharMatcher.BREAKING_WHITESPACE).omitEmptyStrings();
452
453  /**
454   * Helper method for {@link #fixedWidth(String[], int)}
455   */
456  private static String formatLineToFixedWidth(String line, int width) {
457    if (line.length() <= width) {
458      return line;
459    }
460
461    StringBuilder builder = new StringBuilder();
462    int col = 0;
463
464    for (String word : TO_WORDS.split(line)) {
465      if (col == 0) {
466        col = word.length();
467      } else {
468        int newCol = col + word.length() + 1;  // +1 for the space
469
470        if (newCol <= width) {
471          builder.append(' ');
472          col = newCol;
473        } else {
474          builder.append('\n');
475          col = word.length();
476        }
477      }
478
479      builder.append(word);
480    }
481
482    return builder.toString();
483  }
484
485  /**
486   * Splits the argument original into a list of substrings.  All the
487   * substrings in the returned list (except possibly the last) will
488   * have length lineLen.
489   *
490   * @param lineLen  the length of the substrings to put in the list
491   * @param original the original string
492   *
493   * @return a list of strings of length lineLen that together make up the
494   *     original string
495   * @deprecated use {@code Splitter.fixedLength(lineLen).split(original))}
496   *     (note that it returns an {@code Iterable}, not a {@code List})
497   */
498  @Deprecated
499  public static List<String> fixedSplit(String original, int lineLen) {
500    List<String> output = new ArrayList<String>();
501    for (String elem : Splitter.fixedLength(lineLen).split(original)) {
502      output.add(elem);
503    }
504    return output;
505  }
506
507  /**
508   * Indents the given String per line.
509   * @param iString the string to indent
510   * @param iIndentDepth the depth of the indentation
511   * @return the indented string
512   */
513  public static String indent(String iString, int iIndentDepth) {
514    StringBuilder spacer = new StringBuilder();
515    spacer.append("\n");
516    for (int i = 0; i < iIndentDepth; i++) {
517      spacer.append("  ");
518    }
519    return iString.replace("\n", spacer.toString());
520  }
521
522  /**
523   * This is a both way strip.
524   *
525   * @param str the string to strip
526   * @param left strip from left
527   * @param right strip from right
528   * @param what character(s) to strip
529   * @return the stripped string
530   * @deprecated ensure the string is not null and use
531   *  <ul>
532   *    <li> {@code CharMatcher.anyOf(what).trimFrom(str)}
533   *        if {@code left == true} and {@code right == true}
534   *    <li> {@code CharMatcher.anyOf(what).trimLeadingFrom(str)}
535   *        if {@code left == true} and {@code right == false}
536   *    <li> {@code CharMatcher.anyOf(what).trimTrailingFrom(str)}
537   *        if {@code left == false} and {@code right == true}
538   *  </ul>
539   */
540  @Deprecated
541  public static String megastrip(String str,
542                                 boolean left, boolean right,
543                                 String what) {
544    if (str == null) {
545      return null;
546    }
547
548    CharMatcher matcher = CharMatcher.anyOf(what);
549    if (left) {
550      if (right) {
551        return matcher.trimFrom(str);
552      }
553      return matcher.trimLeadingFrom(str);
554    }
555    if (right) {
556      return matcher.trimTrailingFrom(str);
557    }
558    return str;
559  }
560
561  /** strip - strips both ways
562   *
563   * @param str what to strip
564   * @return String the striped string
565   * @deprecated ensure the string is not null and use {@code
566   *     CharMatcher.LEGACY_WHITESPACE.trimFrom(str)}; also consider whether you
567   *     really want the legacy whitespace definition, or something more
568   *     standard like {@link CharMatcher#WHITESPACE}.
569   */
570  @SuppressWarnings("deprecation") // this is deprecated itself
571  @Deprecated public static String strip(String str) {
572    return (str == null) ? null : CharMatcher.LEGACY_WHITESPACE.trimFrom(str);
573  }
574
575  /** Strip white spaces from both end, and collapse white spaces
576   * in the middle.
577   *
578   * @param str what to strip
579   * @return String the striped and collapsed string
580   * @deprecated ensure the string is not null and use {@code
581   *     CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' ')}; also
582   *     consider whether you really want the legacy whitespace definition, or
583   *     something more standard like {@link CharMatcher#WHITESPACE}.
584   */
585  @SuppressWarnings("deprecation") // this is deprecated itself
586  @Deprecated public static String stripAndCollapse(String str) {
587    return (str == null) ? null
588        : CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' ');
589  }
590
591  /**
592   * Give me a string and a potential prefix, and I return the string
593   * following the prefix if the prefix matches, else null.
594   * Analogous to the c++ functions strprefix and var_strprefix.
595   *
596   * @param str the string to strip
597   * @param prefix the expected prefix
598   * @return the stripped string or <code>null</code> if the string
599   * does not start with the prefix
600   */
601  public static String stripPrefix(String str, String prefix) {
602    return str.startsWith(prefix)
603        ? str.substring(prefix.length())
604        : null;
605  }
606
607  /**
608   * Case insensitive version of stripPrefix. Strings are compared in
609   * the same way as in {@link String#equalsIgnoreCase}.
610   * Analogous to the c++ functions strcaseprefix and var_strcaseprefix.
611   *
612   * @param str the string to strip
613   * @param prefix the expected prefix
614   * @return the stripped string or <code>null</code> if the string
615   * does not start with the prefix
616   */
617  public static String stripPrefixIgnoreCase(String str, String prefix) {
618    return startsWithIgnoreCase(str, prefix)
619        ? str.substring(prefix.length())
620        : null;
621  }
622
623  /**
624   * Give me a string and a potential suffix, and I return the string
625   * before the suffix if the suffix matches, else null.
626   * Analogous to the c++ function strsuffix.
627   *
628   * @param str the string to strip
629   * @param suffix the expected suffix
630   * @return the stripped string or <code>null</code> if the string
631   * does not end with the suffix
632   */
633  public static String stripSuffix(String str, String suffix) {
634    return str.endsWith(suffix)
635        ? str.substring(0, str.length() - suffix.length())
636        : null;
637  }
638
639  /**
640   * Case insensitive version of stripSuffix. Strings are compared in
641   * the same way as in {@link String#equalsIgnoreCase}.
642   * Analogous to the c++ function strcasesuffix.
643   *
644   * @param str the string to strip
645   * @param suffix the expected suffix
646   * @return the stripped string or <code>null</code> if the string
647   * does not end with the suffix
648   */
649  public static String stripSuffixIgnoreCase(
650      String str, String suffix) {
651    return endsWithIgnoreCase(str, suffix)
652        ? str.substring(0, str.length() - suffix.length())
653        : null;
654  }
655
656  /**
657   * Strips all non-digit characters from a string.
658   *
659   * The resulting string will only contain characters for which isDigit()
660   * returns true.
661   *
662   * @param str the string to strip
663   * @return a string consisting of digits only, or an empty string
664   * @deprecated use {@code CharMatcher.JAVA_DIGIT.retainFrom(str)} (also
665   *     consider whether this is really the definition of "digit" you wish to
666   *     use)
667   */
668  @Deprecated public static String stripNonDigits(String str) {
669    return CharMatcher.JAVA_DIGIT.retainFrom(str);
670  }
671
672  /**
673   * Finds the last index in str of a character not in the characters
674   * in 'chars' (similar to ANSI string.find_last_not_of).
675   *
676   * Returns -1 if no such character can be found.
677   *
678   * <p><b>Note:</b> If {@code fromIndex} is zero, use {@link CharMatcher}
679   * instead for this: {@code CharMatcher.noneOf(chars).lastIndexIn(str)}.
680   */
681  // TODO(kevinb): after adding fromIndex versions of (last)IndexOf to
682  // CharMatcher, deprecate this
683  public static int lastIndexNotOf(String str, String chars, int fromIndex) {
684    fromIndex = Math.min(fromIndex, str.length() - 1);
685
686    for (int pos = fromIndex; pos >= 0; pos--) {
687      if (chars.indexOf(str.charAt(pos)) < 0) {
688        return pos;
689      }
690    }
691
692    return -1;
693  }
694
695  /**
696   * Like String.replace() except that it accepts any number of old chars.
697   * Replaces any occurrances of 'oldchars' in 'str' with 'newchar'.
698   * Example: replaceChars("Hello, world!", "H,!", ' ') returns " ello  world "
699   *
700   * @deprecated use {@code CharMatcher#replaceFrom(String, char)}, for example
701   *     {@code CharMatcher.anyOf(oldchars).replaceFrom(str, newchar)}
702   */
703  @Deprecated public static String replaceChars(
704      String str, CharSequence oldchars, char newchar) {
705    return CharMatcher.anyOf(oldchars).replaceFrom(str, newchar);
706  }
707
708  /**
709   * Remove any occurrances of 'oldchars' in 'str'.
710   * Example: removeChars("Hello, world!", ",!") returns "Hello world"
711   *
712   * @deprecated use {@link CharMatcher#removeFrom(CharSequence)}, for example
713   *     {@code CharMatcher.anyOf(oldchars).removeFrom(str)}
714   */
715  @Deprecated public static String removeChars(
716      String str, CharSequence oldchars) {
717    return CharMatcher.anyOf(oldchars).removeFrom(str);
718  }
719
  // Matchers for the "fancy" quote characters that replaceSmartQuotes()
  // rewrites: U+0091, U+0092, U+2018, U+2019 for single quotes and
  // U+0093, U+0094, U+201C, U+201D for double quotes.
  // See http://www.microsoft.com/typography/unicode/1252.htm
  private static final CharMatcher FANCY_SINGLE_QUOTE
      = CharMatcher.anyOf("\u0091\u0092\u2018\u2019");
  private static final CharMatcher FANCY_DOUBLE_QUOTE
      = CharMatcher.anyOf("\u0093\u0094\u201c\u201d");
725
726  /**
727   * Replaces microsoft "smart quotes" (curly " and ') with their
728   * ascii counterparts.
729   */
730  public static String replaceSmartQuotes(String str) {
731    String tmp = FANCY_SINGLE_QUOTE.replaceFrom(str, '\'');
732    return FANCY_DOUBLE_QUOTE.replaceFrom(tmp, '"');
733  }
734
735  /**
736   * Convert a string of hex digits to a byte array, with the first
737   * byte in the array being the MSB. The string passed in should be
738   * just the raw digits (upper or lower case), with no leading
739   * or trailing characters (like '0x' or 'h').
740   * An odd number of characters is supported.
741   * If the string is empty, an empty array will be returned.
742   *
743   * This is significantly faster than using
744   *   new BigInteger(str, 16).toByteArray();
745   * especially with larger strings. Here are the results of some
746   * microbenchmarks done on a P4 2.8GHz 2GB RAM running
747   * linux 2.4.22-gg11 and JDK 1.5 with an optimized build:
748   *
749   * String length        hexToBytes (usec)   BigInteger
750   * -----------------------------------------------------
751   * 16                       0.570                 1.43
752   * 256                      8.21                 44.4
753   * 1024                    32.8                 526
754   * 16384                  546                121000
755   */
756  public static byte[] hexToBytes(CharSequence str) {
757    byte[] bytes = new byte[(str.length() + 1) / 2];
758    if (str.length() == 0) {
759      return bytes;
760    }
761    bytes[0] = 0;
762    int nibbleIdx = (str.length() % 2);
763    for (int i = 0; i < str.length(); i++) {
764      char c = str.charAt(i);
765      if (!isHex(c)) {
766        throw new IllegalArgumentException("string contains non-hex chars");
767      }
768      if ((nibbleIdx % 2) == 0) {
769        bytes[nibbleIdx >> 1] = (byte) (hexValue(c) << 4);
770      } else {
771        bytes[nibbleIdx >> 1] += (byte) hexValue(c);
772      }
773      nibbleIdx++;
774    }
775    return bytes;
776  }
777
778  /**
779   * Converts any instances of "\r" or "\r\n" style EOLs into "\n" (Line Feed).
780   */
781  public static String convertEOLToLF(String input) {
782    StringBuilder res = new StringBuilder(input.length());
783    char[] s = input.toCharArray();
784    int from = 0;
785    final int end = s.length;
786    for (int i = 0; i < end; i++) {
787      if (s[i] == '\r') {
788        res.append(s, from, i - from);
789        res.append('\n');
790        if (i + 1 < end && s[i + 1] == '\n') {
791          i++;
792        }
793
794        from = i + 1;
795      }
796    }
797
798    if (from == 0) {   // no \r!
799      return input;
800    }
801
802    res.append(s, from, end - from);
803    return res.toString();
804  }
805
806  /**
807   * Old location of {@link Strings#padStart}; this method will be deprecated
808   * soon.
809   */
810  public static String padLeft(String s, int len, char padChar) {
811    return Strings.padStart(s, len, padChar);
812  }
813
814  /**
815   * Old location of {@link Strings#padEnd}; this method will be deprecated
816   * soon.
817   */
818  public static String padRight(String s, int len, char padChar) {
819    return Strings.padEnd(s, len, padChar);
820  }
821
822  /**
823   * Returns a string consisting of "s", with each of the first "len" characters
824   * replaced by "maskChar" character.
825   */
826  public static String maskLeft(String s, int len, char maskChar) {
827    if (len <= 0) {
828      return s;
829    }
830    len = Math.min(len, s.length());
831    StringBuilder sb = new StringBuilder();
832    for (int i = 0; i < len; i++) {
833      sb.append(maskChar);
834    }
835    sb.append(s.substring(len));
836    return sb.toString();
837  }
838
839  private static boolean isOctal(char c) {
840    return (c >= '0') && (c <= '7');
841  }
842
843  private static boolean isHex(char c) {
844    return ((c >= '0') && (c <= '9')) ||
845           ((c >= 'a') && (c <= 'f')) ||
846           ((c >= 'A') && (c <= 'F'));
847  }
848
849  private static int hexValue(char c) {
850    if ((c >= '0') && (c <= '9')) {
851      return (c - '0');
852    } else if ((c >= 'a') && (c <= 'f')) {
853      return (c - 'a') + 10;
854    } else {
855      return (c - 'A') + 10;
856    }
857  }
858
859  /**
860   * Unescape any C escape sequences (\n, \r, \\, \ooo, etc) and return the
861   * resulting string.
862   */
863  public static String unescapeCString(String s) {
864    if (s.indexOf('\\') < 0) {
865      // Fast path: nothing to unescape
866      return s;
867    }
868
869    StringBuilder sb = new StringBuilder();
870    int len = s.length();
871    for (int i = 0; i < len;) {
872      char c = s.charAt(i++);
873      if (c == '\\' && (i < len)) {
874        c = s.charAt(i++);
875        switch (c) {
876          case 'a':  c = '\007';  break;
877          case 'b':  c = '\b';    break;
878          case 'f':  c = '\f';    break;
879          case 'n':  c = '\n';    break;
880          case 'r':  c = '\r';    break;
881          case 't':  c = '\t';    break;
882          case 'v':  c = '\013';  break;
883          case '\\': c = '\\';    break;
884          case '?':  c = '?';     break;
885          case '\'': c = '\'';    break;
886          case '"':  c = '\"';    break;
887
888          default: {
889            if ((c == 'x') && (i < len) && isHex(s.charAt(i))) {
890              // "\xXX"
891              int v = hexValue(s.charAt(i++));
892              if ((i < len) && isHex(s.charAt(i))) {
893                v = v * 16 + hexValue(s.charAt(i++));
894              }
895              c = (char) v;
896            } else if (isOctal(c)) {
897              // "\OOO"
898              int v = (c - '0');
899              if ((i < len) && isOctal(s.charAt(i))) {
900                v = v * 8 + (s.charAt(i++) - '0');
901              }
902              if ((i < len) && isOctal(s.charAt(i))) {
903                v = v * 8 + (s.charAt(i++) - '0');
904              }
905              c = (char) v;
906            } else {
907              // Propagate unknown escape sequences.
908              sb.append('\\');
909            }
910            break;
911          }
912        }
913      }
914      sb.append(c);
915    }
916    return sb.toString();
917  }
918
919  /**
920   * Unescape any MySQL escape sequences.
921   * See MySQL language reference Chapter 6 at
922   * <a href="http://www.mysql.com/doc/">http://www.mysql.com/doc/</a>.
923   * This function will <strong>not</strong> work for other SQL-like
924   * dialects.
925   * @param s string to unescape, with the surrounding quotes.
926   * @return unescaped string, without the surrounding quotes.
927   * @exception IllegalArgumentException if s is not a valid MySQL string.
928   */
929  public static String unescapeMySQLString(String s)
930      throws IllegalArgumentException {
931    // note: the same buffer is used for both reading and writing
932    // it works because the writer can never outrun the reader
933    char chars[] = s.toCharArray();
934
935    // the string must be quoted 'like this' or "like this"
936    if (chars.length < 2 || chars[0] != chars[chars.length - 1] ||
937        (chars[0] != '\'' && chars[0] != '"')) {
938      throw new IllegalArgumentException("not a valid MySQL string: " + s);
939    }
940
941    // parse the string and decode the backslash sequences; in addition,
942    // quotes can be escaped 'like this: ''', "like this: """, or 'like this: "'
943    int j = 1;  // write position in the string (never exceeds read position)
944    int f = 0;  // state: 0 (normal), 1 (backslash), 2 (quote)
945    for (int i = 1; i < chars.length - 1; i++) {
946      if (f == 0) {             // previous character was normal
947        if (chars[i] == '\\') {
948          f = 1;  // backslash
949        } else if (chars[i] == chars[0]) {
950          f = 2;  // quoting character
951        } else {
952          chars[j++] = chars[i];
953        }
954      } else if (f == 1) {      // previous character was a backslash
955        switch (chars[i]) {
956          case '0':   chars[j++] = '\0';   break;
957          case '\'':  chars[j++] = '\'';   break;
958          case '"':   chars[j++] = '"';    break;
959          case 'b':   chars[j++] = '\b';   break;
960          case 'n':   chars[j++] = '\n';   break;
961          case 'r':   chars[j++] = '\r';   break;
962          case 't':   chars[j++] = '\t';   break;
963          case 'z':   chars[j++] = '\032'; break;
964          case '\\':  chars[j++] = '\\';   break;
965          default:
966            // if the character is not special, backslash disappears
967            chars[j++] = chars[i];
968            break;
969        }
970        f = 0;
971      } else {                  // previous character was a quote
972        // quoting characters must be doubled inside a string
973        if (chars[i] != chars[0]) {
974          throw new IllegalArgumentException("not a valid MySQL string: " + s);
975        }
976        chars[j++] = chars[0];
977        f = 0;
978      }
979    }
980    // string contents cannot end with a special character
981    if (f != 0) {
982      throw new IllegalArgumentException("not a valid MySQL string: " + s);
983    }
984
985    // done
986    return new String(chars, 1, j - 1);
987  }
988
989  // TODO(pbarry): move all HTML methods to common.html package
990
  /**
   * Lookup table from an HTML named character entity reference to the single
   * character it denotes. Each key carries the leading '&amp;' but no trailing
   * ';' (for example {@code "&amp"} maps to '&amp;'). The table is filled once
   * by the static initializer below and is backed by a plain, mutable
   * {@link HashMap}.
   */
  static final Map<String, Character> ESCAPE_STRINGS;

  /**
   * The letters a-f in both lower and upper case, i.e. the non-digit
   * characters that may appear in a hexadecimal numeric entity. Decimal
   * digits are not members of this set. Backed by a plain, mutable
   * {@link HashSet}.
   */
  static final Set<Character> HEX_LETTERS;

  static {
    // HTML character entity references as defined in HTML 4
    // see http://www.w3.org/TR/REC-html40/sgml/entities.html
    ESCAPE_STRINGS = new HashMap<String, Character>(252);

    // Entities for the code points U+00A0 through U+00FF.
    ESCAPE_STRINGS.put("&nbsp", '\u00A0');
    ESCAPE_STRINGS.put("&iexcl", '\u00A1');
    ESCAPE_STRINGS.put("&cent", '\u00A2');
    ESCAPE_STRINGS.put("&pound", '\u00A3');
    ESCAPE_STRINGS.put("&curren", '\u00A4');
    ESCAPE_STRINGS.put("&yen", '\u00A5');
    ESCAPE_STRINGS.put("&brvbar", '\u00A6');
    ESCAPE_STRINGS.put("&sect", '\u00A7');
    ESCAPE_STRINGS.put("&uml", '\u00A8');
    ESCAPE_STRINGS.put("&copy", '\u00A9');
    ESCAPE_STRINGS.put("&ordf", '\u00AA');
    ESCAPE_STRINGS.put("&laquo", '\u00AB');
    ESCAPE_STRINGS.put("&not", '\u00AC');
    ESCAPE_STRINGS.put("&shy", '\u00AD');
    ESCAPE_STRINGS.put("&reg", '\u00AE');
    ESCAPE_STRINGS.put("&macr", '\u00AF');
    ESCAPE_STRINGS.put("&deg", '\u00B0');
    ESCAPE_STRINGS.put("&plusmn", '\u00B1');
    ESCAPE_STRINGS.put("&sup2", '\u00B2');
    ESCAPE_STRINGS.put("&sup3", '\u00B3');
    ESCAPE_STRINGS.put("&acute", '\u00B4');
    ESCAPE_STRINGS.put("&micro", '\u00B5');
    ESCAPE_STRINGS.put("&para", '\u00B6');
    ESCAPE_STRINGS.put("&middot", '\u00B7');
    ESCAPE_STRINGS.put("&cedil", '\u00B8');
    ESCAPE_STRINGS.put("&sup1", '\u00B9');
    ESCAPE_STRINGS.put("&ordm", '\u00BA');
    ESCAPE_STRINGS.put("&raquo", '\u00BB');
    ESCAPE_STRINGS.put("&frac14", '\u00BC');
    ESCAPE_STRINGS.put("&frac12", '\u00BD');
    ESCAPE_STRINGS.put("&frac34", '\u00BE');
    ESCAPE_STRINGS.put("&iquest", '\u00BF');
    ESCAPE_STRINGS.put("&Agrave", '\u00C0');
    ESCAPE_STRINGS.put("&Aacute", '\u00C1');
    ESCAPE_STRINGS.put("&Acirc", '\u00C2');
    ESCAPE_STRINGS.put("&Atilde", '\u00C3');
    ESCAPE_STRINGS.put("&Auml", '\u00C4');
    ESCAPE_STRINGS.put("&Aring", '\u00C5');
    ESCAPE_STRINGS.put("&AElig", '\u00C6');
    ESCAPE_STRINGS.put("&Ccedil", '\u00C7');
    ESCAPE_STRINGS.put("&Egrave", '\u00C8');
    ESCAPE_STRINGS.put("&Eacute", '\u00C9');
    ESCAPE_STRINGS.put("&Ecirc", '\u00CA');
    ESCAPE_STRINGS.put("&Euml", '\u00CB');
    ESCAPE_STRINGS.put("&Igrave", '\u00CC');
    ESCAPE_STRINGS.put("&Iacute", '\u00CD');
    ESCAPE_STRINGS.put("&Icirc", '\u00CE');
    ESCAPE_STRINGS.put("&Iuml", '\u00CF');
    ESCAPE_STRINGS.put("&ETH", '\u00D0');
    ESCAPE_STRINGS.put("&Ntilde", '\u00D1');
    ESCAPE_STRINGS.put("&Ograve", '\u00D2');
    ESCAPE_STRINGS.put("&Oacute", '\u00D3');
    ESCAPE_STRINGS.put("&Ocirc", '\u00D4');
    ESCAPE_STRINGS.put("&Otilde", '\u00D5');
    ESCAPE_STRINGS.put("&Ouml", '\u00D6');
    ESCAPE_STRINGS.put("&times", '\u00D7');
    ESCAPE_STRINGS.put("&Oslash", '\u00D8');
    ESCAPE_STRINGS.put("&Ugrave", '\u00D9');
    ESCAPE_STRINGS.put("&Uacute", '\u00DA');
    ESCAPE_STRINGS.put("&Ucirc", '\u00DB');
    ESCAPE_STRINGS.put("&Uuml", '\u00DC');
    ESCAPE_STRINGS.put("&Yacute", '\u00DD');
    ESCAPE_STRINGS.put("&THORN", '\u00DE');
    ESCAPE_STRINGS.put("&szlig", '\u00DF');
    ESCAPE_STRINGS.put("&agrave", '\u00E0');
    ESCAPE_STRINGS.put("&aacute", '\u00E1');
    ESCAPE_STRINGS.put("&acirc", '\u00E2');
    ESCAPE_STRINGS.put("&atilde", '\u00E3');
    ESCAPE_STRINGS.put("&auml", '\u00E4');
    ESCAPE_STRINGS.put("&aring", '\u00E5');
    ESCAPE_STRINGS.put("&aelig", '\u00E6');
    ESCAPE_STRINGS.put("&ccedil", '\u00E7');
    ESCAPE_STRINGS.put("&egrave", '\u00E8');
    ESCAPE_STRINGS.put("&eacute", '\u00E9');
    ESCAPE_STRINGS.put("&ecirc", '\u00EA');
    ESCAPE_STRINGS.put("&euml", '\u00EB');
    ESCAPE_STRINGS.put("&igrave", '\u00EC');
    ESCAPE_STRINGS.put("&iacute", '\u00ED');
    ESCAPE_STRINGS.put("&icirc", '\u00EE');
    ESCAPE_STRINGS.put("&iuml", '\u00EF');
    ESCAPE_STRINGS.put("&eth", '\u00F0');
    ESCAPE_STRINGS.put("&ntilde", '\u00F1');
    ESCAPE_STRINGS.put("&ograve", '\u00F2');
    ESCAPE_STRINGS.put("&oacute", '\u00F3');
    ESCAPE_STRINGS.put("&ocirc", '\u00F4');
    ESCAPE_STRINGS.put("&otilde", '\u00F5');
    ESCAPE_STRINGS.put("&ouml", '\u00F6');
    ESCAPE_STRINGS.put("&divide", '\u00F7');
    ESCAPE_STRINGS.put("&oslash", '\u00F8');
    ESCAPE_STRINGS.put("&ugrave", '\u00F9');
    ESCAPE_STRINGS.put("&uacute", '\u00FA');
    ESCAPE_STRINGS.put("&ucirc", '\u00FB');
    ESCAPE_STRINGS.put("&uuml", '\u00FC');
    ESCAPE_STRINGS.put("&yacute", '\u00FD');
    ESCAPE_STRINGS.put("&thorn", '\u00FE');
    ESCAPE_STRINGS.put("&yuml", '\u00FF');
    // Symbol entities: the florin sign, Greek letters, punctuation, letterlike
    // symbols, arrows, mathematical operators, brackets and card suits.
    ESCAPE_STRINGS.put("&fnof", '\u0192');
    ESCAPE_STRINGS.put("&Alpha", '\u0391');
    ESCAPE_STRINGS.put("&Beta", '\u0392');
    ESCAPE_STRINGS.put("&Gamma", '\u0393');
    ESCAPE_STRINGS.put("&Delta", '\u0394');
    ESCAPE_STRINGS.put("&Epsilon", '\u0395');
    ESCAPE_STRINGS.put("&Zeta", '\u0396');
    ESCAPE_STRINGS.put("&Eta", '\u0397');
    ESCAPE_STRINGS.put("&Theta", '\u0398');
    ESCAPE_STRINGS.put("&Iota", '\u0399');
    ESCAPE_STRINGS.put("&Kappa", '\u039A');
    ESCAPE_STRINGS.put("&Lambda", '\u039B');
    ESCAPE_STRINGS.put("&Mu", '\u039C');
    ESCAPE_STRINGS.put("&Nu", '\u039D');
    ESCAPE_STRINGS.put("&Xi", '\u039E');
    ESCAPE_STRINGS.put("&Omicron", '\u039F');
    ESCAPE_STRINGS.put("&Pi", '\u03A0');
    ESCAPE_STRINGS.put("&Rho", '\u03A1');
    ESCAPE_STRINGS.put("&Sigma", '\u03A3');
    ESCAPE_STRINGS.put("&Tau", '\u03A4');
    ESCAPE_STRINGS.put("&Upsilon", '\u03A5');
    ESCAPE_STRINGS.put("&Phi", '\u03A6');
    ESCAPE_STRINGS.put("&Chi", '\u03A7');
    ESCAPE_STRINGS.put("&Psi", '\u03A8');
    ESCAPE_STRINGS.put("&Omega", '\u03A9');
    ESCAPE_STRINGS.put("&alpha", '\u03B1');
    ESCAPE_STRINGS.put("&beta", '\u03B2');
    ESCAPE_STRINGS.put("&gamma", '\u03B3');
    ESCAPE_STRINGS.put("&delta", '\u03B4');
    ESCAPE_STRINGS.put("&epsilon", '\u03B5');
    ESCAPE_STRINGS.put("&zeta", '\u03B6');
    ESCAPE_STRINGS.put("&eta", '\u03B7');
    ESCAPE_STRINGS.put("&theta", '\u03B8');
    ESCAPE_STRINGS.put("&iota", '\u03B9');
    ESCAPE_STRINGS.put("&kappa", '\u03BA');
    ESCAPE_STRINGS.put("&lambda", '\u03BB');
    ESCAPE_STRINGS.put("&mu", '\u03BC');
    ESCAPE_STRINGS.put("&nu", '\u03BD');
    ESCAPE_STRINGS.put("&xi", '\u03BE');
    ESCAPE_STRINGS.put("&omicron", '\u03BF');
    ESCAPE_STRINGS.put("&pi", '\u03C0');
    ESCAPE_STRINGS.put("&rho", '\u03C1');
    ESCAPE_STRINGS.put("&sigmaf", '\u03C2');
    ESCAPE_STRINGS.put("&sigma", '\u03C3');
    ESCAPE_STRINGS.put("&tau", '\u03C4');
    ESCAPE_STRINGS.put("&upsilon", '\u03C5');
    ESCAPE_STRINGS.put("&phi", '\u03C6');
    ESCAPE_STRINGS.put("&chi", '\u03C7');
    ESCAPE_STRINGS.put("&psi", '\u03C8');
    ESCAPE_STRINGS.put("&omega", '\u03C9');
    ESCAPE_STRINGS.put("&thetasym", '\u03D1');
    ESCAPE_STRINGS.put("&upsih", '\u03D2');
    ESCAPE_STRINGS.put("&piv", '\u03D6');
    ESCAPE_STRINGS.put("&bull", '\u2022');
    ESCAPE_STRINGS.put("&hellip", '\u2026');
    ESCAPE_STRINGS.put("&prime", '\u2032');
    ESCAPE_STRINGS.put("&Prime", '\u2033');
    ESCAPE_STRINGS.put("&oline", '\u203E');
    ESCAPE_STRINGS.put("&frasl", '\u2044');
    ESCAPE_STRINGS.put("&weierp", '\u2118');
    ESCAPE_STRINGS.put("&image", '\u2111');
    ESCAPE_STRINGS.put("&real", '\u211C');
    ESCAPE_STRINGS.put("&trade", '\u2122');
    ESCAPE_STRINGS.put("&alefsym", '\u2135');
    ESCAPE_STRINGS.put("&larr", '\u2190');
    ESCAPE_STRINGS.put("&uarr", '\u2191');
    ESCAPE_STRINGS.put("&rarr", '\u2192');
    ESCAPE_STRINGS.put("&darr", '\u2193');
    ESCAPE_STRINGS.put("&harr", '\u2194');
    ESCAPE_STRINGS.put("&crarr", '\u21B5');
    ESCAPE_STRINGS.put("&lArr", '\u21D0');
    ESCAPE_STRINGS.put("&uArr", '\u21D1');
    ESCAPE_STRINGS.put("&rArr", '\u21D2');
    ESCAPE_STRINGS.put("&dArr", '\u21D3');
    ESCAPE_STRINGS.put("&hArr", '\u21D4');
    ESCAPE_STRINGS.put("&forall", '\u2200');
    ESCAPE_STRINGS.put("&part", '\u2202');
    ESCAPE_STRINGS.put("&exist", '\u2203');
    ESCAPE_STRINGS.put("&empty", '\u2205');
    ESCAPE_STRINGS.put("&nabla", '\u2207');
    ESCAPE_STRINGS.put("&isin", '\u2208');
    ESCAPE_STRINGS.put("&notin", '\u2209');
    ESCAPE_STRINGS.put("&ni", '\u220B');
    ESCAPE_STRINGS.put("&prod", '\u220F');
    ESCAPE_STRINGS.put("&sum", '\u2211');
    ESCAPE_STRINGS.put("&minus", '\u2212');
    ESCAPE_STRINGS.put("&lowast", '\u2217');
    ESCAPE_STRINGS.put("&radic", '\u221A');
    ESCAPE_STRINGS.put("&prop", '\u221D');
    ESCAPE_STRINGS.put("&infin", '\u221E');
    ESCAPE_STRINGS.put("&ang", '\u2220');
    ESCAPE_STRINGS.put("&and", '\u2227');
    ESCAPE_STRINGS.put("&or", '\u2228');
    ESCAPE_STRINGS.put("&cap", '\u2229');
    ESCAPE_STRINGS.put("&cup", '\u222A');
    ESCAPE_STRINGS.put("&int", '\u222B');
    ESCAPE_STRINGS.put("&there4", '\u2234');
    ESCAPE_STRINGS.put("&sim", '\u223C');
    ESCAPE_STRINGS.put("&cong", '\u2245');
    ESCAPE_STRINGS.put("&asymp", '\u2248');
    ESCAPE_STRINGS.put("&ne", '\u2260');
    ESCAPE_STRINGS.put("&equiv", '\u2261');
    ESCAPE_STRINGS.put("&le", '\u2264');
    ESCAPE_STRINGS.put("&ge", '\u2265');
    ESCAPE_STRINGS.put("&sub", '\u2282');
    ESCAPE_STRINGS.put("&sup", '\u2283');
    ESCAPE_STRINGS.put("&nsub", '\u2284');
    ESCAPE_STRINGS.put("&sube", '\u2286');
    ESCAPE_STRINGS.put("&supe", '\u2287');
    ESCAPE_STRINGS.put("&oplus", '\u2295');
    ESCAPE_STRINGS.put("&otimes", '\u2297');
    ESCAPE_STRINGS.put("&perp", '\u22A5');
    ESCAPE_STRINGS.put("&sdot", '\u22C5');
    ESCAPE_STRINGS.put("&lceil", '\u2308');
    ESCAPE_STRINGS.put("&rceil", '\u2309');
    ESCAPE_STRINGS.put("&lfloor", '\u230A');
    ESCAPE_STRINGS.put("&rfloor", '\u230B');
    ESCAPE_STRINGS.put("&lang", '\u2329');
    ESCAPE_STRINGS.put("&rang", '\u232A');
    ESCAPE_STRINGS.put("&loz", '\u25CA');
    ESCAPE_STRINGS.put("&spades", '\u2660');
    ESCAPE_STRINGS.put("&clubs", '\u2663');
    ESCAPE_STRINGS.put("&hearts", '\u2665');
    ESCAPE_STRINGS.put("&diams", '\u2666');
    // Markup-significant characters (quote, ampersand, angle brackets),
    // followed by further Latin letters, modifier accents, spacing and
    // directional marks, dashes, quotation marks, daggers and the euro sign.
    ESCAPE_STRINGS.put("&quot", '\u0022');
    ESCAPE_STRINGS.put("&amp", '\u0026');
    ESCAPE_STRINGS.put("&lt", '\u003C');
    ESCAPE_STRINGS.put("&gt", '\u003E');
    ESCAPE_STRINGS.put("&OElig", '\u0152');
    ESCAPE_STRINGS.put("&oelig", '\u0153');
    ESCAPE_STRINGS.put("&Scaron", '\u0160');
    ESCAPE_STRINGS.put("&scaron", '\u0161');
    ESCAPE_STRINGS.put("&Yuml", '\u0178');
    ESCAPE_STRINGS.put("&circ", '\u02C6');
    ESCAPE_STRINGS.put("&tilde", '\u02DC');
    ESCAPE_STRINGS.put("&ensp", '\u2002');
    ESCAPE_STRINGS.put("&emsp", '\u2003');
    ESCAPE_STRINGS.put("&thinsp", '\u2009');
    ESCAPE_STRINGS.put("&zwnj", '\u200C');
    ESCAPE_STRINGS.put("&zwj", '\u200D');
    ESCAPE_STRINGS.put("&lrm", '\u200E');
    ESCAPE_STRINGS.put("&rlm", '\u200F');
    ESCAPE_STRINGS.put("&ndash", '\u2013');
    ESCAPE_STRINGS.put("&mdash", '\u2014');
    ESCAPE_STRINGS.put("&lsquo", '\u2018');
    ESCAPE_STRINGS.put("&rsquo", '\u2019');
    ESCAPE_STRINGS.put("&sbquo", '\u201A');
    ESCAPE_STRINGS.put("&ldquo", '\u201C');
    ESCAPE_STRINGS.put("&rdquo", '\u201D');
    ESCAPE_STRINGS.put("&bdquo", '\u201E');
    ESCAPE_STRINGS.put("&dagger", '\u2020');
    ESCAPE_STRINGS.put("&Dagger", '\u2021');
    ESCAPE_STRINGS.put("&permil", '\u2030');
    ESCAPE_STRINGS.put("&lsaquo", '\u2039');
    ESCAPE_STRINGS.put("&rsaquo", '\u203A');
    ESCAPE_STRINGS.put("&euro", '\u20AC');

    // The six hex letters in both cases; digits are deliberately absent.
    HEX_LETTERS = new HashSet<Character>(12);

    HEX_LETTERS.add('a');
    HEX_LETTERS.add('A');
    HEX_LETTERS.add('b');
    HEX_LETTERS.add('B');
    HEX_LETTERS.add('c');
    HEX_LETTERS.add('C');
    HEX_LETTERS.add('d');
    HEX_LETTERS.add('D');
    HEX_LETTERS.add('e');
    HEX_LETTERS.add('E');
    HEX_LETTERS.add('f');
    HEX_LETTERS.add('F');
  }
1267
1268  /**
1269   * <p>
1270   * Replace all the occurences of HTML escape strings with the
1271   * respective characters.
1272   * </p>
1273   * <p>
1274   * The default mode is strict (requiring semicolons).
1275   * </p>
1276   *
1277   * @param s a <code>String</code> value
1278   * @return a <code>String</code> value
1279   * @throws NullPointerException if the input string is null.
1280   */
1281  public static final String unescapeHTML(String s) {
1282    return unescapeHTML(s, false);
1283  }
1284
1285  /**
1286   * Replace all the occurences of HTML escape strings with the
1287   * respective characters.
1288   *
1289   * @param s a <code>String</code> value
1290   * @param emulateBrowsers a <code>Boolean</code> value that tells the method
1291   *     to allow entity refs not terminated with a semicolon to be unescaped.
1292   *     (a quirk of this feature, and some browsers, is that an explicit
1293   *     terminating character is needed - e.g., &lt$ would be unescaped, but
1294   *     not &ltab - see the tests for a more in-depth description of browsers)
1295   * @return a <code>String</code> value
1296   * @throws NullPointerException if the input string is null.
1297   */
1298  public static final String unescapeHTML(String s, boolean emulateBrowsers) {
1299
1300    // See if there are any '&' in the string since that is what we look
1301    // for to escape. If there isn't, then we don't need to escape this string
1302    // Based on similar technique used in the escape function.
1303    int index = s.indexOf('&');
1304    if (index == -1) {
1305      // Nothing to escape. Return the original string.
1306      return s;
1307    }
1308
1309    // We found an escaped character. Start slow escaping from there.
1310    char[] chars = s.toCharArray();
1311    char[] escaped = new char[chars.length];
1312    System.arraycopy(chars, 0, escaped, 0, index);
1313
1314    // Note: escaped[pos] = end of the escaped char array.
1315    int pos = index;
1316
1317    for (int i = index; i < chars.length;) {
1318      if (chars[i] != '&') {
1319        escaped[pos++] = chars[i++];
1320        continue;
1321      }
1322
1323      // Allow e.g. &#123;
1324      int j = i + 1;
1325      boolean isNumericEntity = false;
1326      if (j < chars.length && chars[j] == '#') {
1327        j++;
1328        isNumericEntity = true;
1329      }
1330
1331      // if it's numeric, also check for hex
1332      boolean isHexEntity = false;
1333      if (j < chars.length && (chars[j] == 'x' || chars[j] == 'X')) {
1334        j++;
1335        isHexEntity = true;
1336      }
1337
1338      // Scan until we find a char that is not valid for this sequence.
1339      for (; j < chars.length; j++) {
1340        char ch = chars[j];
1341        boolean isDigit = Character.isDigit(ch);
1342        if (isNumericEntity) {
1343          // non-hex numeric sequence end condition
1344          if (!isHexEntity && !isDigit) {
1345            break;
1346          }
1347          // hex sequence end contition
1348          if (isHexEntity && !isDigit && !HEX_LETTERS.contains(ch)) {
1349            break;
1350          }
1351        }
1352        // anything other than a digit or letter is always an end condition
1353        if (!isDigit && !Character.isLetter(ch)) {
1354          break;
1355        }
1356      }
1357
1358      boolean replaced = false;
1359      if ((j <= chars.length && emulateBrowsers) ||
1360          (j < chars.length && chars[j] == ';')) {
1361        // Check for &#D; and &#xD; pattern
1362        if (i + 2 < chars.length && s.charAt(i + 1) == '#') {
1363          try {
1364            long charcode = 0;
1365            char ch = s.charAt(i + 2);
1366            if (isHexEntity) {
1367              charcode = Long.parseLong(
1368                  new String(chars, i + 3, j - i - 3), 16);
1369            } else if (Character.isDigit(ch)) {
1370              charcode = Long.parseLong(
1371                  new String(chars, i + 2, j - i - 2));
1372            }
1373            // D800 to DFFF are for UTF16 surrogate pairs, and are not valid HTML entities
1374            // Code points 0xFFFE and 0xFFFF are unicode noncharacters
1375            if ((charcode > 0 && charcode < 0xD800) || (charcode > 0xDFFF && charcode < 0xFFFE)) {
1376              escaped[pos++] = (char) charcode;
1377              replaced = true;
1378            } else if (charcode >= 0x10000 && charcode < 0x110000) {
1379              // These characters are represented as surrogate pairs in UTF16
1380              escaped[pos++] = (char) ((charcode - 0x10000) / 0x400 + 0xD800);
1381              escaped[pos++] = (char) ((charcode - 0x10000) % 0x400 + 0xDC00);
1382              replaced = true;
1383            }
1384          } catch (NumberFormatException ex) {
1385            // Failed, not replaced.
1386          }
1387        } else {
1388          String key = new String(chars, i, j - i);
1389          Character repl = ESCAPE_STRINGS.get(key);
1390          if (repl != null) {
1391            escaped[pos++] = repl;
1392            replaced = true;
1393          }
1394        }
1395        // Skip over ';'
1396        if (j < chars.length && chars[j] == ';') {
1397          j++;
1398        }
1399      }
1400
1401      if (!replaced) {
1402        // Not a recognized escape sequence, leave as-is
1403        System.arraycopy(chars, i, escaped, pos, j - i);
1404        pos += j - i;
1405      }
1406      i = j;
1407    }
1408    return new String(escaped, 0, pos);
1409  }
1410
  // Escaper for < and > only: '<' becomes "&lt;" and '>' becomes "&gt;".
  // No other escapes are registered on the builder.
  private static final CharEscaper LT_GT_ESCAPE =
      new CharEscaperBuilder()
        .addEscape('<', "&lt;")
        .addEscape('>', "&gt;")
        .toEscaper();

  // Matches one opening or closing HTML tag: '<', an optional '/', an ASCII
  // letter, then everything up to and including the next '>'. Compiled once
  // and reused by stripHtmlTags.
  private static final Pattern htmlTagPattern =
      Pattern.compile("</?[a-zA-Z][^>]*>");
1420
1421  /**
1422   * Given a <code>String</code>, returns an equivalent <code>String</code> with
1423   * all HTML tags stripped. Note that HTML entities, such as "&amp;amp;" will
1424   * still be preserved.
1425   */
1426  public static String stripHtmlTags(String string) {
1427    if ((string == null) || "".equals(string)) {
1428      return string;
1429    }
1430    String stripped = htmlTagPattern.matcher(string).replaceAll("");
1431    /*
1432     * Certain inputs result in a well-formed HTML:
1433     * <<X>script>alert(0)<</X>/script> results in <script>alert(0)</script>
1434     * The following step ensures that no HTML can slip through by replacing all
1435     * < and > characters with &lt; and &gt; after HTML tags were stripped.
1436     */
1437    return LT_GT_ESCAPE.escape(stripped);
1438  }
1439
1440  /**
1441   * We escape some characters in s to be able to insert strings into JavaScript
1442   * code. Also, make sure that we don't write out {@code -->} or
1443   * {@code </script>}, which may close a script tag, or any char in ["'>] which
1444   * might close a tag or attribute if seen inside an attribute.
1445   */
1446  public static String javaScriptEscape(CharSequence s) {
1447    return javaScriptEscapeHelper(s, false);
1448  }
1449
1450  /**
1451   * We escape some characters in s to be able to insert strings into JavaScript
1452   * code. Also, make sure that we don't write out {@code -->} or
1453   * {@code </script>}, which may close a script tag, or any char in ["'>] which
1454   * might close a tag or attribute if seen inside an attribute.
1455   * Turns all non-ascii characters into ASCII javascript escape sequences
1456   * (eg \\uhhhh or \ooo).
1457   */
1458  public static String javaScriptEscapeToAscii(CharSequence s) {
1459    return javaScriptEscapeHelper(s, true);
1460  }
1461
  /**
   * Represents the type of javascript escaping to perform.  Each enum below
   * determines whether to use octal escapes and how to handle quotes.
   */
  public static enum JsEscapingMode {
    /** No octal escapes, pass-through ', and escape " as \". */
    JSON,

    /** Octal escapes, escapes " and ' to \42 and \47, respectively. */
    EMBEDDABLE_JS,

    /** Octal escapes, escapes ' and " to \' and \". */
    MINIMAL_JS
  }
1476
1477  /**
1478   * Helper for javaScriptEscape and javaScriptEscapeToAscii
1479   */
1480  private static String javaScriptEscapeHelper(CharSequence s,
1481                                               boolean escapeToAscii) {
1482    StringBuilder sb = new StringBuilder(s.length() * 9 / 8);
1483    try {
1484      escapeStringBody(s, escapeToAscii, JsEscapingMode.EMBEDDABLE_JS, sb);
1485    } catch (IOException ex) {
1486      // StringBuilder.append does not throw IOExceptions.
1487      throw new RuntimeException(ex);
1488    }
1489    return sb.toString();
1490  }
1491
1492  /**
1493   * Appends the javascript string literal equivalent of plainText to the given
1494   * out buffer.
1495   * @param plainText the string to escape.
1496   * @param escapeToAscii true to encode all characters not in ascii [\x20-\x7e]
1497   *   <br>
1498   *   Full escaping of unicode entites isn't required but this makes
1499   *   sure that unicode strings will survive regardless of the
1500   *   content-encoding of the javascript file which is important when
1501   *   we use this function to autogenerated javascript source files.
1502   *   This is disabled by default because it makes non-latin strings very long.
1503   *   <br>
1504   *   If you seem to have trouble with character-encodings, maybe
1505   *   turn this on to see if the problem goes away.  If so, you need
1506   *   to specify a character encoding for your javascript somewhere.
1507   * @param jsEscapingMode determines the type of escaping to perform.
1508   * @param out the buffer to append output to.
1509   */
1510  /*
1511   * To avoid fallthrough, we would have to either use a hybrid switch-case/if
1512   * approach (which would obscure our special handling for ' and "), duplicate
1513   * the content of the default case, or pass a half-dozen parameters to a
1514   * helper method containing the code from the default case.
1515   */
1516  @SuppressWarnings("fallthrough")
1517  public static void escapeStringBody(
1518      CharSequence plainText, boolean escapeToAscii,
1519      JsEscapingMode jsEscapingMode, Appendable out)
1520      throws IOException {
1521    int pos = 0;  // Index just past the last char in plainText written to out.
1522    int len = plainText.length();
1523    for (int codePoint, charCount, i = 0; i < len; i += charCount) {
1524      codePoint = Character.codePointAt(plainText, i);
1525      charCount = Character.charCount(codePoint);
1526
1527      if (!shouldEscapeChar(codePoint, escapeToAscii, jsEscapingMode)) {
1528        continue;
1529      }
1530
1531      out.append(plainText, pos, i);
1532      pos = i + charCount;
1533      switch (codePoint) {
1534        case '\b': out.append("\\b"); break;
1535        case '\t': out.append("\\t"); break;
1536        case '\n': out.append("\\n"); break;
1537        case '\f': out.append("\\f"); break;
1538        case '\r': out.append("\\r"); break;
1539        case '\\': out.append("\\\\"); break;
1540        case '"': case '\'':
1541          if (jsEscapingMode == JsEscapingMode.JSON && '\'' == codePoint) {
1542            // JSON does not escape a single quote (and it should be surrounded
1543            // by double quotes).
1544            out.append((char) codePoint);
1545            break;
1546          } else if (jsEscapingMode != JsEscapingMode.EMBEDDABLE_JS) {
1547            out.append('\\').append((char) codePoint);
1548            break;
1549          }
1550          // fall through
1551        default:
1552          if (codePoint >= 0x100 || jsEscapingMode == JsEscapingMode.JSON) {
1553            appendHexJavaScriptRepresentation(codePoint, out);
1554          } else {
1555            // Output the minimal octal encoding.  We can't use an encoding
1556            // shorter than three digits if the next digit is a valid octal
1557            // digit.
1558            boolean pad = i + charCount >= len
1559                || isOctal(plainText.charAt(i + charCount));
1560            appendOctalJavaScriptRepresentation((char) codePoint, pad, out);
1561          }
1562          break;
1563      }
1564    }
1565    out.append(plainText, pos, len);
1566  }
1567
1568  /**
1569   * Helper for escapeStringBody, which decides whether to escape a character.
1570   */
1571  private static boolean shouldEscapeChar(int codePoint,
1572      boolean escapeToAscii, JsEscapingMode jsEscapingMode) {
1573    // If non-ASCII chars should be escaped, identify non-ASCII code points.
1574    if (escapeToAscii && (codePoint < 0x20 || codePoint > 0x7e)) {
1575      return true;
1576    }
1577
1578    // If in JSON escaping mode, check JSON *and* JS escaping rules. The JS
1579    // escaping rules will escape more characters than needed for JSON,
1580    // but it is safe to escape any character in JSON.
1581    // TODO(bbavar): Remove unnecessary escaping for JSON, as long as it can be
1582    //               shown that this change in legacy behavior is safe.
1583    if (jsEscapingMode == JsEscapingMode.JSON) {
1584      return mustEscapeCharInJsonString(codePoint)
1585          || mustEscapeCharInJsString(codePoint);
1586    }
1587
1588    // Finally, just check the default JS escaping rules.
1589    return mustEscapeCharInJsString(codePoint);
1590  }
1591
1592  /**
1593   * Returns a javascript representation of the character in a hex escaped
1594   * format.
1595   *
1596   * @param codePoint The codepoint to append.
1597   * @param out The buffer to which the hex representation should be appended.
1598   */
1599  private static void appendHexJavaScriptRepresentation(
1600      int codePoint, Appendable out)
1601      throws IOException {
1602    if (Character.isSupplementaryCodePoint(codePoint)) {
1603      // Handle supplementary unicode values which are not representable in
1604      // javascript.  We deal with these by escaping them as two 4B sequences
1605      // so that they will round-trip properly when sent from java to javascript
1606      // and back.
1607      char[] surrogates = Character.toChars(codePoint);
1608      appendHexJavaScriptRepresentation(surrogates[0], out);
1609      appendHexJavaScriptRepresentation(surrogates[1], out);
1610      return;
1611    }
1612    out.append("\\u")
1613        .append(HEX_CHARS[(codePoint >>> 12) & 0xf])
1614        .append(HEX_CHARS[(codePoint >>> 8) & 0xf])
1615        .append(HEX_CHARS[(codePoint >>> 4) & 0xf])
1616        .append(HEX_CHARS[codePoint & 0xf]);
1617  }
1618
1619  /**
1620   * Returns a javascript representation of the character in a hex escaped
1621   * format. Although this is a rather specific method, it is made public
1622   * because it is also used by the JSCompiler.
1623   *
1624   * @param ch The character to append.
1625   * @param pad true to force use of the full 3 digit representation.
1626   * @param out The buffer to which the hex representation should be appended.
1627   */
1628  private static void appendOctalJavaScriptRepresentation(
1629      char ch, boolean pad, Appendable out) throws IOException {
1630    if (ch >= 0100
1631        // Be paranoid at the end of a string since someone might call
1632        // this method again with another string segment.
1633        || pad) {
1634      out.append('\\')
1635          .append(OCTAL_CHARS[(ch >>> 6) & 0x7])
1636          .append(OCTAL_CHARS[(ch >>> 3) & 0x7])
1637          .append(OCTAL_CHARS[ch & 0x7]);
1638    } else if (ch >= 010) {
1639      out.append('\\')
1640          .append(OCTAL_CHARS[(ch >>> 3) & 0x7])
1641          .append(OCTAL_CHARS[ch & 0x7]);
1642    } else {
1643      out.append('\\')
1644          .append(OCTAL_CHARS[ch & 0x7]);
1645    }
1646  }
1647
  /**
   * Appends the JavaScript hex escape of {@code c} to {@code sb} by
   * delegating to the {@code (int, Appendable)} overload.
   *
   * Although this is a rather specific method, it is made public
   * because it is also used by the JSCompiler.
   *
   * @param sb the builder that receives the escape sequence
   * @param c the character to escape
   * @see #appendHexJavaScriptRepresentation(int, Appendable)
   */
  public static void appendHexJavaScriptRepresentation(StringBuilder sb,
                                                       char c) {
    try {
      // The char argument widens to int, selecting the Appendable overload.
      appendHexJavaScriptRepresentation(c, sb);
    } catch (IOException ex) {
      // StringBuilder does not throw IOException.
      throw new RuntimeException(ex);
    }
  }
1663
1664  /**
1665   * Undo escaping as performed in javaScriptEscape(.)
1666   * Throws an IllegalArgumentException if the string contains
1667   * bad escaping.
1668   */
1669  public static String javaScriptUnescape(String s) {
1670    StringBuilder sb = new StringBuilder(s.length());
1671    for (int i = 0; i < s.length(); ) {
1672      char c = s.charAt(i);
1673      if (c == '\\') {
1674        i = javaScriptUnescapeHelper(s, i + 1, sb);
1675      } else {
1676        sb.append(c);
1677        i++;
1678      }
1679    }
1680    return sb.toString();
1681  }
1682
1683  /**
1684   * Looks for an escape code starting at index i of s,
1685   * and appends it to sb.
1686   * @return the index of the first character in s
1687   * after the escape code.
1688   * @throws IllegalArgumentException if the escape code
1689   * is invalid
1690   */
1691  private static int javaScriptUnescapeHelper(String s, int i,
1692                                              StringBuilder sb) {
1693    if (i >= s.length()) {
1694      throw new IllegalArgumentException(
1695          "End-of-string after escape character in [" + s + "]");
1696    }
1697
1698    char c = s.charAt(i++);
1699    switch (c) {
1700      case 'n': sb.append('\n'); break;
1701      case 'r': sb.append('\r'); break;
1702      case 't': sb.append('\t'); break;
1703      case 'b': sb.append('\b'); break;
1704      case 'f': sb.append('\f'); break;
1705      case '\\':
1706      case '\"':
1707      case '\'':
1708      case '>':
1709        sb.append(c);
1710        break;
1711      case '0': case '1': case '2': case '3':
1712      case '4': case '5': case '6': case '7':
1713        --i;  // backup to first octal digit
1714        int nOctalDigits = 1;
1715        int digitLimit = c < '4' ? 3 : 2;
1716        while (nOctalDigits < digitLimit && i + nOctalDigits < s.length()
1717               && isOctal(s.charAt(i + nOctalDigits))) {
1718          ++nOctalDigits;
1719        }
1720        sb.append(
1721            (char) Integer.parseInt(s.substring(i, i + nOctalDigits), 8));
1722        i += nOctalDigits;
1723        break;
1724      case 'x':
1725      case 'u':
1726        String hexCode;
1727        int nHexDigits = (c == 'u' ? 4 : 2);
1728        try {
1729          hexCode = s.substring(i, i + nHexDigits);
1730        } catch (IndexOutOfBoundsException ioobe) {
1731          throw new IllegalArgumentException(
1732              "Invalid unicode sequence [" + s.substring(i) + "] at index " + i
1733              + " in [" + s + "]");
1734        }
1735        int unicodeValue;
1736        try {
1737          unicodeValue = Integer.parseInt(hexCode, 16);
1738        } catch (NumberFormatException nfe) {
1739          throw new IllegalArgumentException(
1740              "Invalid unicode sequence [" + hexCode + "] at index " + i +
1741              " in [" + s + "]");
1742        }
1743        sb.append((char) unicodeValue);
1744        i += nHexDigits;
1745        break;
1746      default:
1747        throw new IllegalArgumentException(
1748            "Unknown escape code [" + c + "] at index " + i + " in [" + s + "]"
1749            );
1750    }
1751
1752    return i;
1753  }
1754
  // Characters removed by xmlCDataEscape: the C0 control characters other
  // than tab (0x09), line feed (0x0A) and carriage return (0x0D), plus the
  // two code units 0xFFFE and 0xFFFF.
  private static final CharMatcher CONTROL_MATCHER = CharMatcher.anyOf(
      "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
      "\u0008\u000B\u000C\u000E\u000F" +
      "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
      "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
      "\uFFFE\uFFFF");
1762
1763  /**
1764   * Escape a string that is meant to be embedded in a CDATA section.
1765   * The returned string is guaranteed to be valid CDATA content.
1766   * The syntax of CDATA sections is the following:
1767   * <blockquote>
1768   *   <code>&lt;[!CDATA[...]]&gt;</code>
1769   * </blockquote>
1770   * The only invalid character sequence in a CDATA tag is "]]&gt;".
1771   * If this sequence is present in the input string, we replace
1772   * it by closing the current CDATA field, then write ']]&amp;gt;',
1773   * then reopen a new CDATA section.
1774   */
1775  public static String xmlCDataEscape(String s) {
1776     // Make sure there are no illegal control characters.
1777     s = CONTROL_MATCHER.removeFrom(s);
1778    // Return the original reference if the string doesn't have a match.
1779    int found = s.indexOf("]]>");
1780    if (found == -1) {
1781      return s;
1782    }
1783
1784    // For each occurrence of "]]>", append a string that adds "]]&gt;" after
1785    // the end of the CDATA which has just been closed, then opens a new CDATA.
1786    StringBuilder sb = new StringBuilder();
1787    int prev = 0;
1788    do {
1789      sb.append(s.substring(prev, found + 3));
1790      sb.append("]]&gt;<![CDATA[");
1791      prev = found + 3;
1792    } while ((found = s.indexOf("]]>", prev)) != -1);
1793    sb.append(s.substring(prev));
1794    return sb.toString();
1795  }
1796
  /**
   * We escape some characters in s to be able to insert strings into Java
   * code. The work is delegated to the {@code JAVA_ESCAPE} escaper.
   *
   * @param s the string to escape
   * @return the result of {@code JAVA_ESCAPE.escape(s)}
   * @deprecated Use {@link CharEscapers#asciiHtmlEscaper()} and {@link
   * CharEscapers#javaCharEscaper()} or {@link CharEscapers#javaStringEscaper()}
   * instead. This method combines two forms of escaping in a way that's rarely
   * desired.
   */
  @Deprecated
  public static String javaEscape(String s) {
    return JAVA_ESCAPE.escape(s);
  }
1809
  // Escaper behind javaEscape. It backslash-escapes newline, carriage return,
  // tab, backslash, double quote and single quote, and it replaces '&', '<'
  // and '>' with the HTML entities &amp;, &lt; and &gt;.
  private static final CharEscaper JAVA_ESCAPE =
      new CharEscaperBuilder()
        .addEscape('\n', "\\n")
        .addEscape('\r', "\\r")
        .addEscape('\t', "\\t")
        .addEscape('\\', "\\\\")
        .addEscape('\"', "\\\"")
        .addEscape('&', "&amp;")
        .addEscape('<', "&lt;")
        .addEscape('>', "&gt;")
        .addEscape('\'', "\\\'")
        .toEscaper();
1823
  /**
   * Escapes the special characters from a string so it can be used as part of
   * a regex pattern. This method is for use on gnu.regexp style regular
   * expressions. The work is delegated to the {@code REGEX_ESCAPE} escaper.
   *
   * @param s the string to escape
   * @return the result of {@code REGEX_ESCAPE.escape(s)}
   * @deprecated Use {@link Pattern#quote(String)} instead. Note that it may not
   * be compatible with gnu.regexp style regular expressions.
   */
  @Deprecated
  public static String regexEscape(String s) {
    return REGEX_ESCAPE.escape(s);
  }
1836
  // Escaper behind regexEscape. Each of the characters registered below is
  // replaced by a backslash followed by the character itself:
  // ( ) | * + ? . { } [ ] $ ^ and the backslash.
  private static final CharEscaper REGEX_ESCAPE =
      new CharEscaperBuilder()
        .addEscape('(', "\\(")
        .addEscape(')', "\\)")
        .addEscape('|', "\\|")
        .addEscape('*', "\\*")
        .addEscape('+', "\\+")
        .addEscape('?', "\\?")
        .addEscape('.', "\\.")
        .addEscape('{', "\\{")
        .addEscape('}', "\\}")
        .addEscape('[', "\\[")
        .addEscape(']', "\\]")
        .addEscape('$', "\\$")
        .addEscape('^', "\\^")
        .addEscape('\\', "\\\\")
        .toEscaper();
1855
1856  /**
1857   *  If you want to preserve the exact
1858   * current (odd) behavior when {@code doStrip} is {@code true}, use
1859   * {@code .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on
1860   * the splitter.
1861   *
1862   * @param in what to process
1863   * @param delimiter the delimiting string
1864   * @return the tokens
1865   * @deprecated see the detailed instructions under
1866   *     {@link #split(String, String, boolean)}
1867   */
1868  @Deprecated
1869  public static LinkedList<String> string2List(
1870      String in, String delimiter, boolean doStrip) {
1871    if (in == null) {
1872      return null;
1873    }
1874
1875    LinkedList<String> out = new LinkedList<String>();
1876    string2Collection(in, delimiter, doStrip, out);
1877    return out;
1878  }
1879
1880  /**
1881   * See the detailed instructions under {@link
1882   * #split(String, String, boolean)}. Pass the resulting {@code Iterable} to
1883   * {@link com.google.common.collect.Sets#newHashSet(Iterable)}. If you want to
1884   * preserve the exact current (odd) behavior when {@code doStrip} is {@code
1885   * true}, use {@code
1886   * .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on the
1887   * splitter.
1888   *
1889   * @param in what to process
1890   * @param delimiter the delimiting string
1891   * @param doStrip to strip the substrings before adding to the list
1892   * @return the tokens
1893   * @deprecated see the detailed instructions under
1894   *     {@link #split(String, String, boolean)}
1895   */
1896  @Deprecated
1897  public static Set<String> string2Set(
1898       String in, String delimiter, boolean doStrip) {
1899    if (in == null) {
1900      return null;
1901    }
1902
1903    HashSet<String> out = new HashSet<String>();
1904    string2Collection(in, delimiter, doStrip, out);
1905    return out;
1906  }
1907
1908  /**
1909   * See the detailed instructions under {@link
1910   * #split(String, String, boolean)}. If you want to preserve the exact current
1911   * (odd) behavior when {@code doStrip} is {@code true}, use {@code
1912   * .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on the
1913   * splitter.
1914   *
1915   * @param in The delimited input string to process
1916   * @param delimiter The string delimiting entries in the input string.
1917   * @param doStrip whether to strip the substrings before adding to the
1918   *          collection
1919   * @param collection The collection to which the strings will be added. If
1920   *          <code>null</code>, a new <code>List</code> will be created.
1921   * @return The collection to which the substrings were added. This is
1922   *         syntactic sugar to allow call chaining.
1923   * @deprecated see the detailed instructions under
1924   *     {@link #split(String, String, boolean)}
1925   */
1926  @Deprecated
1927  public static Collection<String> string2Collection(
1928      String in,
1929      String delimiter,
1930      boolean doStrip,
1931      Collection<String> collection) {
1932    if (in == null) {
1933      return null;
1934    }
1935    if (collection == null) {
1936      collection = new ArrayList<String>();
1937    }
1938    if (delimiter == null || delimiter.length() == 0) {
1939      collection.add(in);
1940      return collection;
1941    }
1942
1943    int fromIndex = 0;
1944    int pos;
1945    while ((pos = in.indexOf(delimiter, fromIndex)) >= 0) {
1946      String interim = in.substring(fromIndex, pos);
1947      if (doStrip) {
1948        interim = strip(interim);
1949      }
1950      if (!doStrip || interim.length() > 0) {
1951        collection.add(interim);
1952      }
1953
1954      fromIndex = pos + delimiter.length();
1955    }
1956
1957    String interim = in.substring(fromIndex);
1958    if (doStrip) {
1959      interim = strip(interim);
1960    }
1961    if (!doStrip || interim.length() > 0) {
1962      collection.add(interim);
1963    }
1964
1965    return collection;
1966  }
1967
1968  /**
1969   * This converts a string to a Map. It will first split the string into
1970   * entries using delimEntry. Then each entry is split into a key and a value
1971   * using delimKey. By default we strip the keys. Use doStripEntry to strip
1972   * also the entries.
1973   *
1974   * Note that this method returns a {@link HashMap}, which means that entries
1975   * will be in no particular order. See {@link #stringToOrderedMap}.
1976   *
1977   * @param in the string to be processed
1978   * @param delimEntry delimiter for the entries
1979   * @param delimKey delimiter between keys and values
1980   * @param doStripEntry strip entries before inserting in the map
1981   *
1982   * @return HashMap
1983   */
1984  public static HashMap<String, String> string2Map(
1985      String in, String delimEntry, String delimKey,
1986      boolean doStripEntry) {
1987    if (in == null) {
1988      return null;
1989    }
1990
1991    return stringToMapImpl(new HashMap<String, String>(), in, delimEntry,
1992        delimKey, doStripEntry);
1993  }
1994
1995  /**
1996   * This converts a string to a Map, with entries in the same order as the
1997   * key/value pairs in the input string. It will first split the string into
1998   * entries using delimEntry. Then each entry is split into a key and a value
1999   * using delimKey. By default we strip the keys. Use doStripEntry to strip
2000   * also the entries.
2001   *
2002   * @param in the string to be processed
2003   * @param delimEntry delimiter for the entries
2004   * @param delimKey delimiter between keys and values
2005   * @param doStripEntry strip entries before inserting in the map
2006   *
2007   * @return key/value pairs as a Map, in order
2008   */
2009  public static Map<String, String> stringToOrderedMap(
2010      String in, String delimEntry, String delimKey,
2011      boolean doStripEntry) {
2012    if (in == null) {
2013      return null;
2014    }
2015
2016    return stringToMapImpl(new LinkedHashMap<String, String>(), in, delimEntry,
2017        delimKey, doStripEntry);
2018  }
2019
2020  /**
2021   * This adds key/value pairs from the given string to the given Map.
2022   * It will first split the string into entries using delimEntry. Then each
2023   * entry is split into a key and a value using delimKey. By default we
2024   * strip the keys. Use doStripEntry to strip also the entries.
2025   *
2026   * @param out - Map to output into
2027   * @param in - the string to be processed
2028   * @param delimEntry - delimiter for the entries
2029   * @param delimKey - delimiter between keys and values
2030   * @param doStripEntry - strip entries before inserting in the map
2031   * @return out, for caller's convenience
2032   */
2033  private static <T extends Map<String, String>> T stringToMapImpl(T out,
2034      String in, String delimEntry, String delimKey, boolean doStripEntry) {
2035
2036    if (isEmpty(delimEntry) || isEmpty(delimKey)) {
2037      out.put(strip(in), "");
2038      return out;
2039    }
2040
2041    Iterator<String> it = string2List(in, delimEntry, false).iterator();
2042    int len = delimKey.length();
2043    while (it.hasNext()) {
2044      String entry = it.next();
2045      int pos = entry.indexOf(delimKey);
2046      if (pos > 0) {
2047        String value = entry.substring(pos + len);
2048        if (doStripEntry) {
2049          value = strip(value);
2050        }
2051        out.put(strip(entry.substring(0, pos)), value);
2052      } else {
2053        out.put(strip(entry), "");
2054      }
2055    }
2056
2057    return out;
2058  }
2059
2060  /**
2061   * This function concatenates the elements of a Map in a string with form
2062   *  "<key1><sepKey><value1><sepEntry>...<keyN><sepKey><valueN>"
2063   *
2064   * @param in - the map to be converted
2065   * @param sepKey - the separator to put between key and value
2066   * @param sepEntry - the separator to put between map entries
2067   * @return String
2068   * @deprecated create a {@link MapJoiner}, for example {@code
2069   *     Joiner.on(sepEntry).withKeyValueSeparator(sepKey)}. Ensure that your
2070   *     map is non-null and use this map joiner's {@link MapJoiner#join(Map)}
2071   *     method. To preserve behavior exactly, just in-line this method call.
2072   */
2073  @Deprecated public static <K, V> String map2String(
2074      Map<K, V> in, String sepKey, String sepEntry) {
2075    return (in == null) ? null : Joiner
2076        .on(sepEntry)
2077        .useForNull("null")
2078        .withKeyValueSeparator(sepKey)
2079        .join(in);
2080  }
2081
2082  /**
2083   * Given a map, creates and returns a new map in which all keys are the
2084   * lower-cased version of each key.
2085   *
2086   * @param map A map containing String keys to be lowercased
2087   * @throws IllegalArgumentException if the map contains duplicate string keys
2088   *           after lower casing
2089   */
2090  public static <V> Map<String, V> lowercaseKeys(Map<String, V> map) {
2091    Map<String, V> result = new HashMap<String, V>(map.size());
2092    for (Map.Entry<String, V> entry : map.entrySet()) {
2093      String key = entry.getKey();
2094      if (result.containsKey(key.toLowerCase())) {
2095        throw new IllegalArgumentException(
2096            "Duplicate string key in map when lower casing");
2097      }
2098      result.put(key.toLowerCase(), entry.getValue());
2099    }
2100    return result;
2101  }
2102
  /**
   * Replaces any string of adjacent whitespace characters with the whitespace
   * character " ". Whitespace is whatever
   * {@code CharMatcher.LEGACY_WHITESPACE} matches.
   *
   * @param str the string you want to munge
   * @return String with no more excessive whitespace, or null if {@code str}
   *     is null
   * @deprecated ensure the string is not null and use {@code
   *     CharMatcher.LEGACY_WHITESPACE.collapseFrom(str, ' ')}; also consider
   *     whether you really want the legacy whitespace definition, or something
   *     more standard like {@link CharMatcher#WHITESPACE}.
   */
  @Deprecated public static String collapseWhitespace(String str) {
    // Null is passed through rather than handed to the matcher.
    return (str == null) ? null
        : CharMatcher.LEGACY_WHITESPACE.collapseFrom(str, ' ');
  }
2118
2119  /**
2120   * Replaces any string of matched characters with the supplied string.<p>
2121   *
2122   * This is a more general version of collapseWhitespace.
2123   *
2124   * <pre>
2125   *   E.g. collapse("hello     world", " ", "::")
2126   *   will return the following string: "hello::world"
2127   * </pre>
2128   *
2129   * @param str the string you want to munge
2130   * @param chars all of the characters to be considered for munge
2131   * @param replacement the replacement string
2132   * @return munged and replaced string.
2133   * @deprecated if {@code replacement} is the empty string, use {@link
2134   *     CharMatcher#removeFrom(CharSequence)}; if it is a single character,
2135   *     use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer
2136   *     replacement strings use {@link String#replaceAll(String, String)} with
2137   *     a regular expression that matches one or more occurrences of {@code
2138   *     chars}. In all cases you must first ensure that {@code str} is not
2139   *     null.
2140   */
2141  @Deprecated public static String collapse(
2142      String str, String chars, String replacement) {
2143    if (str == null) {
2144      return null;
2145    }
2146
2147    StringBuilder newStr = new StringBuilder();
2148
2149    boolean prevCharMatched = false;
2150    char c;
2151    for (int i = 0; i < str.length(); i++) {
2152      c = str.charAt(i);
2153      if (chars.indexOf(c) != -1) {
2154        // this character is matched
2155        if (prevCharMatched) {
2156          // apparently a string of matched chars, so don't append anything
2157          // to the string
2158          continue;
2159        }
2160        prevCharMatched = true;
2161        newStr.append(replacement);
2162      } else {
2163        prevCharMatched = false;
2164        newStr.append(c);
2165      }
2166    }
2167
2168    return newStr.toString();
2169  }
2170
2171  /**
2172   * Returns a string with all sequences of ISO control chars (0x00 to 0x1F and
2173   * 0x7F to 0x9F) replaced by the supplied string.  ISO control characters are
2174   * identified via {@link Character#isISOControl(char)}.
2175   *
2176   * @param str the string you want to strip of ISO control chars
2177   * @param replacement the replacement string
2178   * @return a String with all control characters replaced by the replacement
2179   * string, or null if input is null.
2180   * @deprecated use {@link CharMatcher#JAVA_ISO_CONTROL}. If {@code
2181   *     replacement} is the empty string, use {@link
2182   *     CharMatcher#removeFrom(CharSequence)}; if it is a single character,
2183   *     use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer
2184   *     replacement strings use
2185   *     {@code str.replaceAll("\p{Cntrl}+", replacement)}.
2186   *     In all cases you must first ensure that {@code str} is not null.
2187   */
2188  @Deprecated public static String collapseControlChars(
2189      String str, String replacement) {
2190    /*
2191     * We re-implement the StringUtil.collapse() loop here rather than call
2192     * collapse() with an input String of control chars, because matching via
2193     * isISOControl() is about 10x faster.
2194     */
2195    if (str == null) {
2196      return null;
2197    }
2198
2199    StringBuilder newStr = new StringBuilder();
2200
2201    boolean prevCharMatched = false;
2202    char c;
2203    for (int i = 0; i < str.length(); i++) {
2204      c = str.charAt(i);
2205      if (Character.isISOControl(c)) {
2206        // this character is matched
2207        if (prevCharMatched) {
2208          // apparently a string of matched chars, so don't append anything
2209          // to the string
2210          continue;
2211        }
2212        prevCharMatched = true;
2213        newStr.append(replacement);
2214      } else {
2215        prevCharMatched = false;
2216        newStr.append(c);
2217      }
2218    }
2219
2220    return newStr.toString();
2221  }
2222
2223  /**
2224   * Read a String of up to maxLength bytes from an InputStream.
2225   *
2226   * <p>Note that this method uses the default platform encoding, and expects
2227   * that encoding to be single-byte, which is not always the case. Its use
2228   * is discouraged. For reading the entire stream (maxLength == -1) you can use:
2229   * <pre>
2230   *   CharStreams.toString(new InputStreamReader(is, Charsets.ISO_8859_1))
2231   * </pre>
2232   * {@code CharStreams} is in the {@code com.google.common.io} package.
2233   *
2234   * <p>For maxLength >= 0 a literal translation would be
2235   * <pre>
2236   *   CharStreams.toString(new InputStreamReader(
2237   *       new LimitInputStream(is, maxLength), Charsets.ISO_8859_1))
2238   * </pre>
2239   * For multi-byte encodings that is broken because the limit could end in
2240   * the middle of the character--it would be better to limit the reader than
2241   * the underlying stream.
2242   *
2243   * @param is input stream
2244   * @param maxLength max number of bytes to read from "is". If this is -1, we
2245   *          read everything.
2246   *
2247   * @return String up to maxLength bytes, read from "is"
2248   * @deprecated see the advice above
2249   */
2250  @Deprecated public static String stream2String(InputStream is, int maxLength)
2251      throws IOException {
2252    byte[] buffer = new byte[4096];
2253    StringWriter sw = new StringWriter();
2254    int totalRead = 0;
2255    int read = 0;
2256
2257    do {
2258      sw.write(new String(buffer, 0, read));
2259      totalRead += read;
2260      read = is.read(buffer, 0, buffer.length);
2261    } while (((-1 == maxLength) || (totalRead < maxLength)) && (read != -1));
2262
2263    return sw.toString();
2264  }
2265
2266  /**
2267   * Parse a list of substrings separated by a given delimiter. The delimiter
2268   * can also appear in substrings (just double them):
2269   *
2270   * parseDelimitedString("this|is", '|') returns ["this","is"]
2271   * parseDelimitedString("this||is", '|') returns ["this|is"]
2272   *
2273   * @param list String containing delimited substrings
2274   * @param delimiter Delimiter (anything except ' ' is allowed)
2275   *
2276   * @return String[] A String array of parsed substrings
2277   */
2278  public static String[] parseDelimitedList(String list,
2279                                            char delimiter) {
2280    String delim = "" + delimiter;
2281    // Append a sentinel of delimiter + space
2282    // (see comments below for more info)
2283    StringTokenizer st = new StringTokenizer(list + delim + " ",
2284                                             delim,
2285                                             true);
2286    ArrayList<String> v = new ArrayList<String>();
2287    String lastToken = "";
2288    StringBuilder word = new StringBuilder();
2289
2290    // We keep a sliding window of 2 tokens
2291    //
2292    // delimiter : delimiter -> append delimiter to current word
2293    //                          and clear most recent token
2294    //                          (so delim : delim : delim will not
2295    //                          be treated as two escaped delims.)
2296    //
2297    // tok : delimiter -> append tok to current word
2298    //
2299    // delimiter : tok -> add current word to list, and clear it.
2300    //                    (We append a sentinel that conforms to this
2301    //                    pattern to make sure we've pushed every parsed token)
2302    while (st.hasMoreTokens()) {
2303      String tok = st.nextToken();
2304      if (lastToken != null) {
2305        if (tok.equals(delim)) {
2306          word.append(lastToken);
2307          if (lastToken.equals(delim)) { tok = null; }
2308        } else {
2309          if (word.length() != 0) {
2310            v.add(word.toString());
2311          }
2312          word.setLength(0);
2313        }
2314      }
2315      lastToken = tok;
2316    }
2317
2318    return v.toArray(new String[0]);
2319  }
2320
2321  /**
2322   * Compares two strings, guarding against nulls.
2323   *
2324   * @param nullsAreGreater true if nulls should be greater than any string,
2325   *  false is less than.
2326   * @deprecated use {@link String#CASE_INSENSITIVE_ORDER}, together with
2327   *     {@link com.google.common.collect.Ordering#nullsFirst()} or
2328   *     {@link com.google.common.collect.Ordering#nullsLast()} if
2329   *     needed
2330   */
2331  @Deprecated public static int compareToIgnoreCase(String s1, String s2,
2332      boolean nullsAreGreater) {
2333    if (s1 == s2) {
2334      return 0; // Either both the same String, or both null
2335    }
2336    if (s1 == null) {
2337      return nullsAreGreater ? 1 : -1;
2338    }
2339    if (s2 == null) {
2340      return nullsAreGreater ? -1 : 1;
2341    }
2342    return s1.compareToIgnoreCase(s2);
2343  }
2344
  /**
   * Splits s with delimiters in delimiter and returns the last token, i.e.
   * the part of {@code s} that follows the last character matched by
   * {@code CharMatcher.anyOf(delimiter)}.
   *
   * @param s the string to take the last token from
   * @param delimiter the delimiter characters; each character of this string
   *     acts as a delimiter on its own
   * @return the substring of {@code s} starting one past the index reported
   *     by {@code lastIndexIn}
   */
  public static String lastToken(String s, String delimiter) {
    // NOTE(review): presumably lastIndexIn returns -1 when nothing matches,
    // which would make this return all of s -- confirm against CharMatcher.
    return s.substring(CharMatcher.anyOf(delimiter).lastIndexIn(s) + 1);
  }
2351
2352  private static final Pattern characterReferencePattern =
2353      Pattern.compile("&#?[a-zA-Z0-9]{1,8};");
2354
2355  /**
2356   * Determines if a string contains what looks like an html character
2357   * reference. Useful for deciding whether unescaping is necessary.
2358   */
2359  public static boolean containsCharRef(String s) {
2360    return characterReferencePattern.matcher(s).find();
2361  }
2362
2363  /**
2364   * Determines if a string is a Hebrew word. A string is considered to be
2365   * a Hebrew word if {@link #isHebrew(int)} is true for any of its characters.
2366   */
2367  public static boolean isHebrew(String s) {
2368    int len = s.length();
2369    for (int i = 0; i < len; ++i) {
2370      if (isHebrew(s.codePointAt(i))) {
2371        return true;
2372      }
2373    }
2374    return false;
2375  }
2376
2377  /**
2378   * Determines if a character is a Hebrew character.
2379   */
2380  public static boolean isHebrew(int codePoint) {
2381    return Character.UnicodeBlock.HEBREW.equals(
2382               Character.UnicodeBlock.of(codePoint));
2383  }
2384
2385  /**
2386   * Determines if a string is a CJK word. A string is considered to be CJK
2387   * if {@link #isCjk(char)} is true for any of its characters.
2388   */
2389  public static boolean isCjk(String s) {
2390    int len = s.length();
2391    for (int i = 0; i < len; ++i) {
2392      if (isCjk(s.codePointAt(i))) {
2393        return true;
2394      }
2395    }
2396    return false;
2397  }
2398
2399  /**
2400   * Unicode code blocks containing CJK characters.
2401   */
2402  private static final Set<Character.UnicodeBlock> CJK_BLOCKS;
2403  static {
2404    Set<Character.UnicodeBlock> set = new HashSet<Character.UnicodeBlock>();
2405    set.add(Character.UnicodeBlock.HANGUL_JAMO);
2406    set.add(Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT);
2407    set.add(Character.UnicodeBlock.KANGXI_RADICALS);
2408    set.add(Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION);
2409    set.add(Character.UnicodeBlock.HIRAGANA);
2410    set.add(Character.UnicodeBlock.KATAKANA);
2411    set.add(Character.UnicodeBlock.BOPOMOFO);
2412    set.add(Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO);
2413    set.add(Character.UnicodeBlock.KANBUN);
2414    set.add(Character.UnicodeBlock.BOPOMOFO_EXTENDED);
2415    set.add(Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS);
2416    set.add(Character.UnicodeBlock.ENCLOSED_CJK_LETTERS_AND_MONTHS);
2417    set.add(Character.UnicodeBlock.CJK_COMPATIBILITY);
2418    set.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A);
2419    set.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS);
2420    set.add(Character.UnicodeBlock.HANGUL_SYLLABLES);
2421    set.add(Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS);
2422    set.add(Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS);
2423    set.add(Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS);
2424    set.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B);
2425    set.add(Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT);
2426    CJK_BLOCKS = Collections.unmodifiableSet(set);
2427  }
2428
2429  /**
2430   * Determines if a character is a CJK ideograph or a character typically
2431   * used only in CJK text.
2432   *
2433   * Note: This function cannot handle supplementary characters. To handle all
2434   * Unicode characters, including supplementary characters, use the function
2435   * {@link #isCjk(int)}.
2436   */
2437  public static boolean isCjk(char ch) {
2438    return isCjk((int) ch);
2439  }
2440
2441  /**
2442   * Determines if a character is a CJK ideograph or a character typically
2443   * used only in CJK text.
2444   */
2445  public static boolean isCjk(int codePoint) {
2446    // Time-saving early exit for all Latin-1 characters.
2447    if ((codePoint & 0xFFFFFF00) == 0) {
2448      return false;
2449    }
2450
2451    return CJK_BLOCKS.contains(Character.UnicodeBlock.of(codePoint));
2452  }
2453
2454  /**
2455   * Returns the approximate display width of the string, measured in units of
2456   * ascii characters.
2457   *
2458   * @see StringUtil#displayWidth(char)
2459   */
2460  public static int displayWidth(String s) {
2461    // TODO(kevinb): could reimplement this as
2462    // return s.length() * 2 - CharMatcher.SINGLE_WIDTH.countIn(s);
2463    int width = 0;
2464    int len = s.length();
2465    for (int i = 0; i < len; ++i) {
2466      width += displayWidth(s.charAt(i));
2467    }
2468    return width;
2469  }
2470
2471  /**
2472   * Returns the approximate display width of the character, measured
2473   * in units of ascii characters.
2474   *
2475   * This method should err on the side of caution. By default, characters
2476   * are assumed to have width 2; this covers CJK ideographs, various
2477   * symbols and miscellaneous weird scripts. Given below are some Unicode
2478   * ranges for which it seems safe to assume that no character is
2479   * substantially wider than an ascii character:
2480   *   - Latin, extended Latin, even more extended Latin.
2481   *   - Greek, extended Greek, Cyrillic.
2482   *   - Some symbols (including currency symbols) and punctuation.
2483   *   - Half-width Katakana and Hangul.
2484   *   - Hebrew
2485   *   - Arabic
2486   *   - Thai
2487   * Characters in these ranges are given a width of 1.
2488   *
2489   * IMPORTANT: this function has analogs in C++ (encodingutils.cc,
2490   * named UnicodeCharWidth) and JavaScript
2491   * (java/com/google/ads/common/frontend/adwordsbase/resources/CreateAdUtil.js),
2492   * which need to be updated if you change the implementation here.
2493   */
2494  public static int displayWidth(char ch) {
2495    if (ch <= '\u04f9' ||   // CYRILLIC SMALL LETTER YERU WITH DIAERESIS
2496        ch == '\u05be' ||   // HEBREW PUNCTUATION MAQAF
2497        (ch >= '\u05d0' && ch <= '\u05ea') ||  // HEBREW LETTER ALEF ... TAV
2498        ch == '\u05F3' ||   // HEBREW PUNCTUATION GERESH
2499        ch == '\u05f4' ||   // HEBREW PUNCTUATION GERSHAYIM
2500        (ch >= '\u0600' && ch <= '\u06ff') || // Block=Arabic
2501        (ch >= '\u0750' && ch <= '\u077f') || // Block=Arabic_Supplement
2502        (ch >= '\ufb50' && ch <= '\ufdff') || // Block=Arabic_Presentation_Forms-A
2503        (ch >= '\ufe70' && ch <= '\ufeff') || // Block=Arabic_Presentation_Forms-B
2504        (ch >= '\u1e00' && ch <= '\u20af') || /* LATIN CAPITAL LETTER A WITH RING BELOW
2505                                                 ... DRACHMA SIGN */
2506        (ch >= '\u2100' && ch <= '\u213a') || // ACCOUNT OF ... ROTATED CAPITAL Q
2507        (ch >= '\u0e00' && ch <= '\u0e7f') || // Thai
2508        (ch >= '\uff61' && ch <= '\uffdc')) { /* HALFWIDTH IDEOGRAPHIC FULL STOP
2509                                                 ... HALFWIDTH HANGUL LETTER I */
2510      return 1;
2511    }
2512    return 2;
2513  }
2514
2515  /**
2516   * @return a string representation of the given native array.
2517   */
2518  public static String toString(float[] iArray) {
2519    if (iArray == null) {
2520      return "NULL";
2521    }
2522
2523    StringBuilder buffer = new StringBuilder();
2524    buffer.append("[");
2525    for (int i = 0; i < iArray.length; i++) {
2526      buffer.append(iArray[i]);
2527      if (i != (iArray.length - 1)) {
2528        buffer.append(", ");
2529      }
2530    }
2531    buffer.append("]");
2532    return buffer.toString();
2533  }
2534
2535  /**
2536   * @return a string representation of the given native array.
2537   */
2538  public static String toString(long[] iArray) {
2539    if (iArray == null) {
2540      return "NULL";
2541    }
2542
2543    StringBuilder buffer = new StringBuilder();
2544    buffer.append("[");
2545    for (int i = 0; i < iArray.length; i++) {
2546      buffer.append(iArray[i]);
2547      if (i != (iArray.length - 1)) {
2548        buffer.append(", ");
2549      }
2550    }
2551    buffer.append("]");
2552    return buffer.toString();
2553  }
2554
2555  /**
2556   * @return a string representation of the given native array
2557   */
2558  public static String toString(int[] iArray) {
2559    if (iArray == null) {
2560      return "NULL";
2561    }
2562
2563    StringBuilder buffer = new StringBuilder();
2564    buffer.append("[");
2565    for (int i = 0; i < iArray.length; i++) {
2566      buffer.append(iArray[i]);
2567      if (i != (iArray.length - 1)) {
2568        buffer.append(", ");
2569      }
2570    }
2571    buffer.append("]");
2572    return buffer.toString();
2573  }
2574
2575  /**
2576   * @return a string representation of the given array.
2577   */
2578  public static String toString(String[] iArray) {
2579    if (iArray == null) { return "NULL"; }
2580
2581    StringBuilder buffer = new StringBuilder();
2582    buffer.append("[");
2583    for (int i = 0; i < iArray.length; i++) {
2584      buffer.append("'").append(iArray[i]).append("'");
2585      if (i != iArray.length - 1) {
2586        buffer.append(", ");
2587      }
2588    }
2589    buffer.append("]");
2590
2591    return buffer.toString();
2592  }
2593
2594  /**
2595   * Returns the string, in single quotes, or "NULL". Intended only for
2596   * logging.
2597   *
2598   * @param s the string
2599   * @return the string, in single quotes, or the string "null" if it's null.
2600   */
2601  public static String toString(String s) {
2602    if (s == null) {
2603      return "NULL";
2604    } else {
2605      return new StringBuilder(s.length() + 2).append("'").append(s)
2606                                              .append("'").toString();
2607    }
2608  }
2609
2610  /**
2611   * @return a string representation of the given native array
2612   */
2613  public static String toString(int[][] iArray) {
2614    if (iArray == null) {
2615      return "NULL";
2616    }
2617
2618    StringBuilder buffer = new StringBuilder();
2619    buffer.append("[");
2620    for (int i = 0; i < iArray.length; i++) {
2621      buffer.append("[");
2622      for (int j = 0; j < iArray[i].length; j++) {
2623        buffer.append(iArray[i][j]);
2624        if (j != (iArray[i].length - 1)) {
2625          buffer.append(", ");
2626        }
2627      }
2628      buffer.append("]");
2629      if (i != iArray.length - 1) {
2630        buffer.append(" ");
2631      }
2632    }
2633    buffer.append("]");
2634    return buffer.toString();
2635  }
2636
2637  /**
2638   * @return a string representation of the given native array.
2639   */
2640  public static String toString(long[][] iArray) {
2641    if (iArray == null) { return "NULL"; }
2642
2643    StringBuilder buffer = new StringBuilder();
2644    buffer.append("[");
2645    for (int i = 0; i < iArray.length; i++) {
2646      buffer.append("[");
2647      for (int j = 0; j < iArray[i].length; j++) {
2648        buffer.append(iArray[i][j]);
2649        if (j != (iArray[i].length - 1)) {
2650          buffer.append(", ");
2651        }
2652      }
2653      buffer.append("]");
2654      if (i != iArray.length - 1) {
2655        buffer.append(" ");
2656      }
2657    }
2658    buffer.append("]");
2659    return buffer.toString();
2660  }
2661
2662  /**
2663   * @return a String representation of the given object array.
2664   * The strings are obtained by calling toString() on the
2665   * underlying objects.
2666   */
2667  public static String toString(Object[] obj) {
2668    if (obj == null) { return "NULL"; }
2669    StringBuilder tmp = new StringBuilder();
2670    tmp.append("[");
2671    for (int i = 0; i < obj.length; i++) {
2672      tmp.append(obj[i].toString());
2673      if (i != obj.length - 1) {
2674        tmp.append(",");
2675      }
2676    }
2677    tmp.append("]");
2678    return tmp.toString();
2679  }
2680
2681  private static final char[] HEX_CHARS
2682      = { '0', '1', '2', '3', '4', '5', '6', '7',
2683          '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
2684  private static final char[] OCTAL_CHARS = HEX_CHARS;  // ignore the last 8 :)
2685
2686  /**
2687   * Convert a byte array to a hex-encoding string: "a33bff00..."
2688   *
2689   * @deprecated Use {@link ByteArrays#toHexString}.
2690   */
2691  @Deprecated public static String bytesToHexString(final byte[] bytes) {
2692    return ByteArrays.toHexString(bytes);
2693  }
2694
2695  /**
2696   * Convert a byte array to a hex-encoding string with the specified
2697   * delimiter: "a3&lt;delimiter&gt;3b&lt;delimiter&gt;ff..."
2698   */
2699  public static String bytesToHexString(final byte[] bytes,
2700      Character delimiter) {
2701    StringBuilder hex =
2702      new StringBuilder(bytes.length * (delimiter == null ? 2 : 3));
2703    int nibble1, nibble2;
2704    for (int i = 0; i < bytes.length; i++) {
2705      nibble1 = (bytes[i] >>> 4) & 0xf;
2706      nibble2 = bytes[i] & 0xf;
2707      if (i > 0 && delimiter != null) { hex.append(delimiter.charValue()); }
2708      hex.append(HEX_CHARS[nibble1]);
2709      hex.append(HEX_CHARS[nibble2]);
2710    }
2711    return hex.toString();
2712  }
2713
2714  /**
2715   * Safely convert the string to uppercase.
2716   * @return upper case representation of the String; or null if
2717   * the input string is null.
2718   */
2719  public static String toUpperCase(String src) {
2720    if (src == null) {
2721      return null;
2722    } else {
2723      return src.toUpperCase();
2724    }
2725  }
2726
2727  /**
2728   * Safely convert the string to lowercase.
2729   * @return lower case representation of the String; or null if
2730   * the input string is null.
2731   */
2732  public static String toLowerCase(String src) {
2733    if (src == null) {
2734      return null;
2735    } else {
2736      return src.toLowerCase();
2737    }
2738  }
2739
2740  private static final Pattern dbSpecPattern =
2741      Pattern.compile("(.*)\\{(\\d+),(\\d+)\\}(.*)");
2742
2743  /**
2744   * @param dbSpecComponent a single component of a DBDescriptor spec
2745   * (e.g. the host or database component). The expected format of the string is:
2746   * <br>
2747   *             <center>(prefix){(digits),(digits)}(suffix)</center>
2748   * </br>
2749   * @return a shard expansion of the given String.
2750   * Note that unless the pattern is matched exactly, no expansion is
2751   * performed and the original string is returned unaltered.
2752   * For example, 'db{0,1}.adz' is expanded into 'db0.adz, db1.adz'.
2753   * Note that this method is added to StringUtil instead of
2754   * DBDescriptor to better encapsulate the choice of regexp implementation.
2755   * @throws IllegalArgumentException if the string does not parse.
2756   */
2757  public static String expandShardNames(String dbSpecComponent)
2758      throws IllegalArgumentException, IllegalStateException {
2759
2760    Matcher matcher = dbSpecPattern.matcher(dbSpecComponent);
2761    if (matcher.find()) {
2762      try {
2763        String prefix = dbSpecComponent.substring(
2764          matcher.start(1), matcher.end(1));
2765        int minShard =
2766          Integer.parseInt(
2767            dbSpecComponent.substring(
2768              matcher.start(2), matcher.end(2)));
2769        int maxShard =
2770          Integer.parseInt(
2771            dbSpecComponent.substring(
2772              matcher.start(3), matcher.end(3)));
2773        String suffix = dbSpecComponent.substring(
2774          matcher.start(4), matcher.end(4));
2775        //Log2.logEvent(prefix + " " + minShard + " " + maxShard + " " + suffix);
2776        if (minShard > maxShard) {
2777          throw new IllegalArgumentException(
2778            "Maximum shard must be greater than or equal to " +
2779            "the minimum shard");
2780        }
2781        StringBuilder tmp = new StringBuilder();
2782        for (int shard = minShard; shard <= maxShard; shard++) {
2783          tmp.append(prefix).append(shard).append(suffix);
2784          if (shard != maxShard) {
2785            tmp.append(",");
2786          }
2787        }
2788        return tmp.toString();
2789      } catch (NumberFormatException nfex) {
2790        throw new IllegalArgumentException(
2791          "Malformed DB specification component: " + dbSpecComponent);
2792      }
2793    } else {
2794      return dbSpecComponent;
2795    }
2796  }
2797
2798
2799  /**
2800  * Returns a string that is equivalent to the specified string with its
2801  * first character converted to uppercase as by {@link String#toUpperCase()}.
2802  * The returned string will have the same value as the specified string if
2803  * its first character is non-alphabetic, if its first character is already
2804  * uppercase, or if the specified string is of length 0.
2805  *
2806  * <p>For example:
2807  * <pre>
2808  *    capitalize("foo bar").equals("Foo bar");
2809  *    capitalize("2b or not 2b").equals("2b or not 2b")
2810  *    capitalize("Foo bar").equals("Foo bar");
2811  *    capitalize("").equals("");
2812  * </pre>
2813  *
2814  * @param s the string whose first character is to be uppercased
2815  * @return a string equivalent to <tt>s</tt> with its first character
2816  *     converted to uppercase
2817  * @throws NullPointerException if <tt>s</tt> is null
2818  */
2819  public static String capitalize(String s) {
2820    if (s.length() == 0) {
2821      return s;
2822    }
2823    char first = s.charAt(0);
2824    char capitalized = Character.toUpperCase(first);
2825    return (first == capitalized)
2826        ? s
2827        : capitalized + s.substring(1);
2828  }
2829
2830  /**
2831   * Examine a string to see if it starts with a given prefix (case
2832   * insensitive). Just like String.startsWith() except doesn't
2833   * respect case. Strings are compared in the same way as in
2834   * {@link String#equalsIgnoreCase}.
2835   *
2836   * @param str the string to examine
2837   * @param prefix the prefix to look for
2838   * @return a boolean indicating if str starts with prefix (case insensitive)
2839   */
2840  public static boolean startsWithIgnoreCase(String str, String prefix) {
2841    return str.regionMatches(true, 0, prefix, 0, prefix.length());
2842  }
2843
2844  /**
2845   * Examine a string to see if it ends with a given suffix (case
2846   * insensitive). Just like String.endsWith() except doesn't respect
2847   * case. Strings are compared in the same way as in
2848   * {@link String#equalsIgnoreCase}.
2849   *
2850   * @param str the string to examine
2851   * @param suffix the suffix to look for
2852   * @return a boolean indicating if str ends with suffix (case insensitive)
2853   */
2854  public static boolean endsWithIgnoreCase(String str, String suffix) {
2855    int len = suffix.length();
2856    return str.regionMatches(true, str.length() - len, suffix, 0, len);
2857  }
2858
2859  /**
2860   * @param c one codePoint
2861   * @return the number of bytes needed to encode this codePoint in UTF-8
2862   */
2863  private static int bytesUtf8(int c) {
2864    if (c < 0x80) {
2865      return 1;
2866    } else if (c < 0x00800) {
2867      return 2;
2868    } else if (c < 0x10000) {
2869      return 3;
2870    } else if (c < 0x200000) {
2871      return 4;
2872
2873    // RFC 3629 forbids the use of UTF-8 for codePoint greater than 0x10FFFF,
2874    // so if the caller respects this RFC, this should not happen
2875    } else if (c < 0x4000000) {
2876      return 5;
2877    } else {
2878      return 6;
2879    }
2880  }
2881
2882  /**
2883   * @param str a string
2884   * @return the number of bytes required to represent this string in UTF-8
2885   */
2886  public static int bytesStorage(String str) {
2887    // offsetByCodePoint has a bug if its argument is the result of a
2888    // call to substring. To avoid this, we create a new String
2889    // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664
2890    String s = new String(str);
2891
2892    int len = 0;
2893    for (int i = 0; i < s.length(); i = s.offsetByCodePoints(i, 1)) {
2894      len += bytesUtf8(s.codePointAt(i));
2895    }
2896    return len;
2897  }
2898
2899  /**
2900   * @param str a string
2901   * @param maxbytes
2902   * @return the beginning of the string, so that it uses less than
2903   *     maxbytes bytes in UTF-8
2904   * @throws IndexOutOfBoundsException if maxbytes is negative
2905   */
2906  public static String truncateStringForUtf8Storage(String str, int maxbytes) {
2907    if (maxbytes < 0) {
2908      throw new IndexOutOfBoundsException();
2909    }
2910
2911    // offsetByCodePoint has a bug if its argument is the result of a
2912    // call to substring. To avoid this, we create a new String
2913    // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664
2914    // TODO(cquinn): should be fixed as of 1.5.0_01
2915    String s = new String(str);
2916
2917    int codepoints = 0;
2918    int bytesUsed = 0;
2919    for (codepoints = 0; codepoints < s.length();
2920        codepoints = s.offsetByCodePoints(codepoints, 1)) {
2921      int glyphBytes = StringUtil.bytesUtf8(s.codePointAt(codepoints));
2922      if (bytesUsed + glyphBytes > maxbytes) {
2923        break;
2924      }
2925      bytesUsed += glyphBytes;
2926    }
2927    return s.substring(0, codepoints);
2928  }
2929
2930  /**
2931   * If the given string is of length {@code maxLength} or less, then it is
2932   * returned as is.
2933   * If the string is longer than {@code maxLength}, the returned string is
2934   * truncated before the last space character on or before
2935   * {@code source.charAt(maxLength)}. If the string has no spaces, the
2936   * returned string is truncated to {@code maxLength}.
2937   *
2938   * @param source the string to truncate if necessary
2939   * @param maxLength
2940   * @return the original string if its length is less than or equal to
2941   *     maxLength, otherwise a truncated string as mentioned above
2942   */
2943  public static String truncateIfNecessary(String source, int maxLength) {
2944    if (source.length() <= maxLength) {
2945      return source;
2946    }
2947    String str = unicodePreservingSubstring(source, 0, maxLength);
2948
2949    @SuppressWarnings("deprecation") // we'll make this go away before that does
2950    CharMatcher whitespaceMatcher = CharMatcher.LEGACY_WHITESPACE;
2951    String truncated = whitespaceMatcher.trimTrailingFrom(str);
2952
2953    // We may have had multiple spaces at maxLength, which were stripped away
2954    if (truncated.length() < maxLength) {
2955      return truncated;
2956    }
2957    // We have a truncated string of length maxLength. If the next char was a
2958    // space, we truncated at a word boundary, so we can return immediately
2959    if (Character.isSpaceChar(source.charAt(maxLength))) {
2960      return truncated;
2961    }
2962    // We truncated in the middle of the word. Try to truncate before
2963    // the last space, if it exists. Otherwise, return the truncated string
2964    for (int i = truncated.length() - 1; i >= 0; --i) {
2965      if (Character.isSpaceChar(truncated.charAt(i))) {
2966        String substr = truncated.substring(0, i);
2967        return whitespaceMatcher.trimTrailingFrom(substr);
2968      }
2969    }
2970    return truncated;
2971  }
2972
2973  /**
2974   * If this given string is of length {@code maxLength} or less, it will
2975   * be returned as-is.
2976   * Otherwise it will be trucated to {@code maxLength}, regardless of whether
2977   * there are any space characters in the String. If an ellipsis is requested
2978   * to be appended to the truncated String, the String will be truncated so
2979   * that the ellipsis will also fit within maxLength.
2980   * If no truncation was necessary, no ellipsis will be added.
2981   *
2982   * @param source the String to truncate if necessary
2983   * @param maxLength the maximum number of characters to keep
2984   * @param addEllipsis if true, and if the String had to be truncated,
2985   *     add "..." to the end of the String before returning. Additionally,
2986   *     the ellipsis will only be added if maxLength is greater than 3.
2987   * @return the original string if its length is less than or equal to
2988   *     maxLength, otherwise a truncated string as mentioned above
2989   */
2990  public static String truncateAtMaxLength(String source, int maxLength,
2991      boolean addEllipsis) {
2992
2993    if (source.length() <= maxLength) {
2994      return source;
2995    }
2996    if (addEllipsis && maxLength > 3) {
2997      return unicodePreservingSubstring(source, 0, maxLength - 3) + "...";
2998    }
2999    return unicodePreservingSubstring(source, 0, maxLength);
3000  }
3001
3002  /**
3003   * Normalizes {@code index} such that it respects Unicode character
3004   * boundaries in {@code str}.
3005   *
3006   * <p>If {@code index} is the low surrogate of a unicode character,
3007   * the method returns {@code index - 1}. Otherwise, {@code index} is
3008   * returned.
3009   *
3010   * <p>In the case in which {@code index} falls in an invalid surrogate pair
3011   * (e.g. consecutive low surrogates, consecutive high surrogates), or if
3012   * if it is not a valid index into {@code str}, the original value of
3013   * {@code index} is returned.
3014   *
3015   * @param str the String
3016   * @param index the index to be normalized
3017   * @return a normalized index that does not split a Unicode character
3018   */
3019  public static int unicodePreservingIndex(String str, int index) {
3020    if (index > 0 && index < str.length()) {
3021      if (Character.isHighSurrogate(str.charAt(index - 1)) &&
3022          Character.isLowSurrogate(str.charAt(index))) {
3023        return index - 1;
3024      }
3025    }
3026    return index;
3027  }
3028
3029  /**
3030   * Returns a substring of {@code str} that respects Unicode character
3031   * boundaries.
3032   *
3033   * <p>The string will never be split between a [high, low] surrogate pair,
3034   * as defined by {@link Character#isHighSurrogate} and
3035   * {@link Character#isLowSurrogate}.
3036   *
3037   * <p>If {@code begin} or {@code end} are the low surrogate of a unicode
3038   * character, it will be offset by -1.
3039   *
3040   * <p>This behavior guarantees that
3041   * {@code str.equals(StringUtil.unicodePreservingSubstring(str, 0, n) +
3042   *     StringUtil.unicodePreservingSubstring(str, n, str.length())) } is
3043   * true for all {@code n}.
3044   * </pre>
3045   *
3046   * <p>This means that unlike {@link String#substring(int, int)}, the length of
3047   * the returned substring may not necessarily be equivalent to
3048   * {@code end - begin}.
3049   *
3050   * @param str the original String
3051   * @param begin the beginning index, inclusive
3052   * @param end the ending index, exclusive
3053   * @return the specified substring, possibly adjusted in order to not
3054   *   split unicode surrogate pairs
3055   * @throws IndexOutOfBoundsException if the {@code begin} is negative,
3056   *   or {@code end} is larger than the length of {@code str}, or
3057   *   {@code begin} is larger than {@code end}
3058   */
3059  public static String unicodePreservingSubstring(
3060      String str, int begin, int end) {
3061    return str.substring(unicodePreservingIndex(str, begin),
3062        unicodePreservingIndex(str, end));
3063  }
3064
3065  /**
3066   * Equivalent to:
3067   *
3068   * <pre>
3069   * {@link #unicodePreservingSubstring(String, int, int)}(
3070   *     str, begin, str.length())
3071   * </pre>
3072   */
3073  public static String unicodePreservingSubstring(String str, int begin) {
3074    return unicodePreservingSubstring(str, begin, str.length());
3075  }
3076
3077  /**
3078   * True iff the given character needs to be escaped in a javascript string
3079   * literal.
3080   * <p>
3081   * We need to escape the following characters in javascript string literals.
3082   * <dl>
3083   * <dt> \           <dd> the escape character
3084   * <dt> ', "        <dd> string delimiters.
3085   *                       TODO(msamuel): what about backticks (`) which are
3086   *                       non-standard but recognized as attribute delimiters.
3087   * <dt> &, <, >, =  <dd> so that a string literal can be embedded in XHTML
3088   *                       without further escaping.
3089   * </dl>
3090   * TODO(msamuel): If we're being paranoid, should we escape + to avoid UTF-7
3091   * attacks?
3092   * <p>
3093   * Unicode format control characters (category Cf) must be escaped since they
3094   * are removed by javascript parser in a pre-lex pass.
3095   * <br>According to EcmaScript 262 Section 7.1:
3096   * <blockquote>
3097   *     The format control characters can occur anywhere in the source text of
3098   *     an ECMAScript program. These characters are removed from the source
3099   *     text before applying the lexical grammar.
3100   * </blockquote>
3101   * <p>
3102   * Additionally, line terminators are not allowed to appear inside strings
3103   * and Section 7.3 says
3104   * <blockquote>
3105   *     The following characters are considered to be line terminators:<pre>
3106   *         Code Point Value   Name                  Formal Name
3107   *         \u000A             Line Feed             [LF]
3108   *         \u000D             Carriage Return       [CR]
3109   *         \u2028             Line separator        [LS]
3110   *         \u2029             Paragraph separator   [PS]
3111   * </pre></blockquote>
3112   *
3113   * @param codepoint a char instead of an int since the javascript language
3114   *    does not support extended unicode.
3115   */
3116  static boolean mustEscapeCharInJsString(int codepoint) {
3117    return JS_ESCAPE_CHARS.contains(codepoint);
3118  }
3119
3120  /**
3121   * True iff the given character needs to be escaped in a JSON string literal.
3122   * <p>
3123   * We need to escape the following characters in JSON string literals.
3124   * <dl>
3125   * <dt> \           <dd> the escape character
3126   * <dt> "           <dd> string delimiter
3127   * <dt> 0x00 - 0x1F <dd> control characters
3128   * </dl>
3129   * <p>
3130   * See EcmaScript 262 Section 15.12.1 for the full JSON grammar.
3131   */
3132  static boolean mustEscapeCharInJsonString(int codepoint) {
3133    return JSON_ESCAPE_CHARS.contains(codepoint);
3134  }
3135
3136  /**
3137   * Builds a small set of code points.
3138   * {@code com.google.common.base} cannot depend on ICU4J, thus avoiding ICU's
3139   * {@code UnicodeSet}.
3140   * For all other purposes, please use {@code com.ibm.icu.text.UnicodeSet}.
3141   */
3142  private static class UnicodeSetBuilder {
3143    Set<Integer> codePointSet = new HashSet<Integer>();
3144
3145    UnicodeSetBuilder addCodePoint(int c) {
3146      codePointSet.add(c);
3147      return this;
3148    }
3149
3150    UnicodeSetBuilder addRange(int from, int to) {
3151      for (int i = from; i <= to; i++) {
3152        codePointSet.add(i);
3153      }
3154      return this;
3155    }
3156
3157    Set<Integer> create() {
3158      return codePointSet;
3159    }
3160  }
3161
3162  private static final Set<Integer> JS_ESCAPE_CHARS = new UnicodeSetBuilder()
3163      // All characters in the class of format characters, [:Cf:].
3164      // Source: http://unicode.org/cldr/utility/list-unicodeset.jsp.
3165      .addCodePoint(0xAD)
3166      .addRange(0x600, 0x603)
3167      .addCodePoint(0x6DD)
3168      .addCodePoint(0x070F)
3169      .addRange(0x17B4, 0x17B5)
3170      .addRange(0x200B, 0x200F)
3171      .addRange(0x202A, 0x202E)
3172      .addRange(0x2060, 0x2064)
3173      .addRange(0x206A, 0x206F)
3174      .addCodePoint(0xFEFF)
3175      .addRange(0xFFF9, 0xFFFB)
3176      .addRange(0x0001D173, 0x0001D17A)
3177      .addCodePoint(0x000E0001)
3178      .addRange(0x000E0020, 0x000E007F)
3179      // Plus characters mentioned in the docs of mustEscapeCharInJsString().
3180      .addCodePoint(0x0000)
3181      .addCodePoint(0x000A)
3182      .addCodePoint(0x000D)
3183      .addRange(0x2028, 0x2029)
3184      .addCodePoint(0x0085)
3185      .addCodePoint(Character.codePointAt("'", 0))
3186      .addCodePoint(Character.codePointAt("\"", 0))
3187      .addCodePoint(Character.codePointAt("&", 0))
3188      .addCodePoint(Character.codePointAt("<", 0))
3189      .addCodePoint(Character.codePointAt(">", 0))
3190      .addCodePoint(Character.codePointAt("=", 0))
3191      .addCodePoint(Character.codePointAt("\\", 0))
3192      .create();
3193
3194  private static final Set<Integer> JSON_ESCAPE_CHARS = new UnicodeSetBuilder()
3195      .addCodePoint(Character.codePointAt("\"", 0))
3196      .addCodePoint(Character.codePointAt("\\", 0))
3197      .addRange(0x0000, 0x001F)
3198      .create();
3199
3200  /**
3201   * <b>To be deprecated:</b> use {@link CharEscapers#xmlEscaper()} instead.
3202   */
3203  public static String xmlEscape(String s) {
3204    return CharEscapers.xmlEscaper().escape(s);
3205  }
3206
3207  /**
3208   * <b>To be deprecated:</b> use {@link CharEscapers#asciiHtmlEscaper()} instead.
3209   */
3210  public static String htmlEscape(String s) {
3211    return CharEscapers.asciiHtmlEscaper().escape(s);
3212  }
3213}