1/**
2 * Copyright (c) 2006, Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16package com.google.android.mail.common.base;
17
18import static com.google.android.mail.common.base.Preconditions.checkNotNull;
19
20import java.io.IOException;
21
22/**
23 * Utility functions for dealing with {@code CharEscaper}s, and some commonly
24 * used {@code CharEscaper} instances.
25 *
26 * @author sven@google.com (Sven Mawson)
27 * @author laurence@google.com (Laurence Gonsalves)
28 */
29public final class CharEscapers {
  /** Private constructor: this utility class is not meant to be instantiated. */
  private CharEscapers() {}
31
32  // TODO(matevossian): To implementors of escapers --
33  //                    For each xxxEscaper method, please add links to external
34  //                    reference pages that we consider authoritative for what
35  //                    that escaper should exactly be doing.
36
37  /**
38   * Performs no escaping.
39   */
40  private static final CharEscaper NULL_ESCAPER = new CharEscaper() {
41      @Override
42    public String escape(String string) {
43        checkNotNull(string);
44        return string;
45      }
46
47      @Override
48      public Appendable escape(final Appendable out) {
49        checkNotNull(out);
50
51        // we can't simply return out because the CharEscaper contract says that
52        // the returned Appendable will throw a NullPointerException if asked to
53        // append null.
54        return new Appendable() {
55            @Override public Appendable append(CharSequence csq) throws IOException {
56              checkNotNull(csq);
57              out.append(csq);
58              return this;
59            }
60
61            @Override public Appendable append(CharSequence csq, int start, int end)
62                throws IOException {
63              checkNotNull(csq);
64              out.append(csq, start, end);
65              return this;
66            }
67
68            @Override public Appendable append(char c) throws IOException {
69              out.append(c);
70              return this;
71            }
72          };
73      }
74
75      @Override
76      protected char[] escape(char c) {
77        return null;
78      }
79    };
80
  /**
   * Returns a {@link CharEscaper} that does no escaping.
   *
   * @return the shared {@code NULL_ESCAPER} instance
   */
  public static CharEscaper nullEscaper() {
    return NULL_ESCAPER;
  }
87
  /**
   * Returns a {@link CharEscaper} instance that escapes special characters in a
   * string so it can safely be included in an XML document in either element
   * content or attribute values.
   *
   * <p><b>Note</b>: silently removes null-characters and control
   * characters, as there is no way to represent them in XML.
   *
   * @return the shared {@code XML_ESCAPER} instance
   */
  public static CharEscaper xmlEscaper() {
    return XML_ESCAPER;
  }
99
  /**
   * Escapes special characters from a string so it can safely be included in an
   * XML document in either element content or attribute values.  Also removes
   * null-characters and control characters, as there is no way to represent
   * them in XML.
   *
   * <p>Built from {@code newBasicXmlEscapeBuilder()} (not shown in this
   * section) with additional escapes for the double and single quote
   * characters.
   */
  private static final CharEscaper XML_ESCAPER = newBasicXmlEscapeBuilder()
      .addEscape('"', "&quot;")
      .addEscape('\'', "&apos;")
      .toEscaper();
110
  /**
   * Returns a {@link CharEscaper} instance that escapes special characters in a
   * string so it can safely be included in an XML document in element content.
   *
   * <p><b>Note</b>: double and single quotes are not escaped, so it is not
   * safe to use this escaper to escape attribute values. Use the
   * {@link #xmlEscaper()} escaper to escape attribute values or if you are
   * unsure. Also silently removes non-whitespace control characters, as there
   * is no way to represent them in XML.
   *
   * @return the shared {@code XML_CONTENT_ESCAPER} instance
   */
  public static CharEscaper xmlContentEscaper() {
    return XML_CONTENT_ESCAPER;
  }
124
  /**
   * Escapes special characters from a string so it can safely be included in an
   * XML document in element content.  Note that quotes are <em>not</em>
   * escaped, so <em>this is not safe for use in attribute values</em>. Use
   * {@link #XML_ESCAPER} for attribute values, or if you are unsure.  Also
   * removes non-whitespace control characters, as there is no way to represent
   * them in XML.
   *
   * <p>This is the escaper produced by {@code newBasicXmlEscapeBuilder()} (not
   * shown in this section) with no further escapes added.
   */
  private static final CharEscaper XML_CONTENT_ESCAPER =
      newBasicXmlEscapeBuilder().toEscaper();
135
  /**
   * Returns a {@link CharEscaper} instance that escapes special characters in a
   * string so it can safely be included in an HTML document in either element
   * content or attribute values.
   *
   * <p><b>Note</b>: alters non-ASCII and control characters.
   *
   * <p>The entity list was taken from:
   * <a href="http://www.w3.org/TR/html4/sgml/entities.html">here</a>
   *
   * @return the escaper held by {@code HtmlEscaperHolder.HTML_ESCAPER}
   */
  public static CharEscaper htmlEscaper() {
    return HtmlEscaperHolder.HTML_ESCAPER;
  }
149
  /**
   * A lazy initialization holder for HTML_ESCAPER.
   *
   * <p>The escaper is an {@code HtmlCharEscaper} constructed from the
   * replacement table assembled below with a {@code CharEscaperBuilder}. The
   * table maps each listed character to its named HTML entity; the one
   * exception is the apostrophe, which maps to the numeric reference
   * {@code &#39;}. In the visible code the only reference to this holder is
   * from {@code htmlEscaper()}.
   */
  private static class HtmlEscaperHolder {
    private static final CharEscaper HTML_ESCAPER
        = new HtmlCharEscaper(new CharEscaperBuilder()
            // Markup-significant ASCII characters.
            // NOTE(review): the apostrophe uses a numeric reference, presumably
            // because the HTML 4 entity list defines no &apos; -- confirm.
            .addEscape('"',      "&quot;")
            .addEscape('\'',     "&#39;")
            .addEscape('&',      "&amp;")
            .addEscape('<',      "&lt;")
            .addEscape('>',      "&gt;")
            // U+00A0 through U+00FF.
            .addEscape('\u00A0', "&nbsp;")
            .addEscape('\u00A1', "&iexcl;")
            .addEscape('\u00A2', "&cent;")
            .addEscape('\u00A3', "&pound;")
            .addEscape('\u00A4', "&curren;")
            .addEscape('\u00A5', "&yen;")
            .addEscape('\u00A6', "&brvbar;")
            .addEscape('\u00A7', "&sect;")
            .addEscape('\u00A8', "&uml;")
            .addEscape('\u00A9', "&copy;")
            .addEscape('\u00AA', "&ordf;")
            .addEscape('\u00AB', "&laquo;")
            .addEscape('\u00AC', "&not;")
            .addEscape('\u00AD', "&shy;")
            .addEscape('\u00AE', "&reg;")
            .addEscape('\u00AF', "&macr;")
            .addEscape('\u00B0', "&deg;")
            .addEscape('\u00B1', "&plusmn;")
            .addEscape('\u00B2', "&sup2;")
            .addEscape('\u00B3', "&sup3;")
            .addEscape('\u00B4', "&acute;")
            .addEscape('\u00B5', "&micro;")
            .addEscape('\u00B6', "&para;")
            .addEscape('\u00B7', "&middot;")
            .addEscape('\u00B8', "&cedil;")
            .addEscape('\u00B9', "&sup1;")
            .addEscape('\u00BA', "&ordm;")
            .addEscape('\u00BB', "&raquo;")
            .addEscape('\u00BC', "&frac14;")
            .addEscape('\u00BD', "&frac12;")
            .addEscape('\u00BE', "&frac34;")
            .addEscape('\u00BF', "&iquest;")
            .addEscape('\u00C0', "&Agrave;")
            .addEscape('\u00C1', "&Aacute;")
            .addEscape('\u00C2', "&Acirc;")
            .addEscape('\u00C3', "&Atilde;")
            .addEscape('\u00C4', "&Auml;")
            .addEscape('\u00C5', "&Aring;")
            .addEscape('\u00C6', "&AElig;")
            .addEscape('\u00C7', "&Ccedil;")
            .addEscape('\u00C8', "&Egrave;")
            .addEscape('\u00C9', "&Eacute;")
            .addEscape('\u00CA', "&Ecirc;")
            .addEscape('\u00CB', "&Euml;")
            .addEscape('\u00CC', "&Igrave;")
            .addEscape('\u00CD', "&Iacute;")
            .addEscape('\u00CE', "&Icirc;")
            .addEscape('\u00CF', "&Iuml;")
            .addEscape('\u00D0', "&ETH;")
            .addEscape('\u00D1', "&Ntilde;")
            .addEscape('\u00D2', "&Ograve;")
            .addEscape('\u00D3', "&Oacute;")
            .addEscape('\u00D4', "&Ocirc;")
            .addEscape('\u00D5', "&Otilde;")
            .addEscape('\u00D6', "&Ouml;")
            .addEscape('\u00D7', "&times;")
            .addEscape('\u00D8', "&Oslash;")
            .addEscape('\u00D9', "&Ugrave;")
            .addEscape('\u00DA', "&Uacute;")
            .addEscape('\u00DB', "&Ucirc;")
            .addEscape('\u00DC', "&Uuml;")
            .addEscape('\u00DD', "&Yacute;")
            .addEscape('\u00DE', "&THORN;")
            .addEscape('\u00DF', "&szlig;")
            .addEscape('\u00E0', "&agrave;")
            .addEscape('\u00E1', "&aacute;")
            .addEscape('\u00E2', "&acirc;")
            .addEscape('\u00E3', "&atilde;")
            .addEscape('\u00E4', "&auml;")
            .addEscape('\u00E5', "&aring;")
            .addEscape('\u00E6', "&aelig;")
            .addEscape('\u00E7', "&ccedil;")
            .addEscape('\u00E8', "&egrave;")
            .addEscape('\u00E9', "&eacute;")
            .addEscape('\u00EA', "&ecirc;")
            .addEscape('\u00EB', "&euml;")
            .addEscape('\u00EC', "&igrave;")
            .addEscape('\u00ED', "&iacute;")
            .addEscape('\u00EE', "&icirc;")
            .addEscape('\u00EF', "&iuml;")
            .addEscape('\u00F0', "&eth;")
            .addEscape('\u00F1', "&ntilde;")
            .addEscape('\u00F2', "&ograve;")
            .addEscape('\u00F3', "&oacute;")
            .addEscape('\u00F4', "&ocirc;")
            .addEscape('\u00F5', "&otilde;")
            .addEscape('\u00F6', "&ouml;")
            .addEscape('\u00F7', "&divide;")
            .addEscape('\u00F8', "&oslash;")
            .addEscape('\u00F9', "&ugrave;")
            .addEscape('\u00FA', "&uacute;")
            .addEscape('\u00FB', "&ucirc;")
            .addEscape('\u00FC', "&uuml;")
            .addEscape('\u00FD', "&yacute;")
            .addEscape('\u00FE', "&thorn;")
            .addEscape('\u00FF', "&yuml;")
            // Extended Latin letters and spacing modifiers (U+0152 - U+02DC).
            .addEscape('\u0152', "&OElig;")
            .addEscape('\u0153', "&oelig;")
            .addEscape('\u0160', "&Scaron;")
            .addEscape('\u0161', "&scaron;")
            .addEscape('\u0178', "&Yuml;")
            .addEscape('\u0192', "&fnof;")
            .addEscape('\u02C6', "&circ;")
            .addEscape('\u02DC', "&tilde;")
            // Greek letters and symbols (U+0391 - U+03D6).
            .addEscape('\u0391', "&Alpha;")
            .addEscape('\u0392', "&Beta;")
            .addEscape('\u0393', "&Gamma;")
            .addEscape('\u0394', "&Delta;")
            .addEscape('\u0395', "&Epsilon;")
            .addEscape('\u0396', "&Zeta;")
            .addEscape('\u0397', "&Eta;")
            .addEscape('\u0398', "&Theta;")
            .addEscape('\u0399', "&Iota;")
            .addEscape('\u039A', "&Kappa;")
            .addEscape('\u039B', "&Lambda;")
            .addEscape('\u039C', "&Mu;")
            .addEscape('\u039D', "&Nu;")
            .addEscape('\u039E', "&Xi;")
            .addEscape('\u039F', "&Omicron;")
            .addEscape('\u03A0', "&Pi;")
            .addEscape('\u03A1', "&Rho;")
            .addEscape('\u03A3', "&Sigma;")
            .addEscape('\u03A4', "&Tau;")
            .addEscape('\u03A5', "&Upsilon;")
            .addEscape('\u03A6', "&Phi;")
            .addEscape('\u03A7', "&Chi;")
            .addEscape('\u03A8', "&Psi;")
            .addEscape('\u03A9', "&Omega;")
            .addEscape('\u03B1', "&alpha;")
            .addEscape('\u03B2', "&beta;")
            .addEscape('\u03B3', "&gamma;")
            .addEscape('\u03B4', "&delta;")
            .addEscape('\u03B5', "&epsilon;")
            .addEscape('\u03B6', "&zeta;")
            .addEscape('\u03B7', "&eta;")
            .addEscape('\u03B8', "&theta;")
            .addEscape('\u03B9', "&iota;")
            .addEscape('\u03BA', "&kappa;")
            .addEscape('\u03BB', "&lambda;")
            .addEscape('\u03BC', "&mu;")
            .addEscape('\u03BD', "&nu;")
            .addEscape('\u03BE', "&xi;")
            .addEscape('\u03BF', "&omicron;")
            .addEscape('\u03C0', "&pi;")
            .addEscape('\u03C1', "&rho;")
            .addEscape('\u03C2', "&sigmaf;")
            .addEscape('\u03C3', "&sigma;")
            .addEscape('\u03C4', "&tau;")
            .addEscape('\u03C5', "&upsilon;")
            .addEscape('\u03C6', "&phi;")
            .addEscape('\u03C7', "&chi;")
            .addEscape('\u03C8', "&psi;")
            .addEscape('\u03C9', "&omega;")
            .addEscape('\u03D1', "&thetasym;")
            .addEscape('\u03D2', "&upsih;")
            .addEscape('\u03D6', "&piv;")
            // Spaces, directional marks and punctuation (U+2002 - U+2044),
            // followed by the euro sign.
            .addEscape('\u2002', "&ensp;")
            .addEscape('\u2003', "&emsp;")
            .addEscape('\u2009', "&thinsp;")
            .addEscape('\u200C', "&zwnj;")
            .addEscape('\u200D', "&zwj;")
            .addEscape('\u200E', "&lrm;")
            .addEscape('\u200F', "&rlm;")
            .addEscape('\u2013', "&ndash;")
            .addEscape('\u2014', "&mdash;")
            .addEscape('\u2018', "&lsquo;")
            .addEscape('\u2019', "&rsquo;")
            .addEscape('\u201A', "&sbquo;")
            .addEscape('\u201C', "&ldquo;")
            .addEscape('\u201D', "&rdquo;")
            .addEscape('\u201E', "&bdquo;")
            .addEscape('\u2020', "&dagger;")
            .addEscape('\u2021', "&Dagger;")
            .addEscape('\u2022', "&bull;")
            .addEscape('\u2026', "&hellip;")
            .addEscape('\u2030', "&permil;")
            .addEscape('\u2032', "&prime;")
            .addEscape('\u2033', "&Prime;")
            .addEscape('\u2039', "&lsaquo;")
            .addEscape('\u203A', "&rsaquo;")
            .addEscape('\u203E', "&oline;")
            .addEscape('\u2044', "&frasl;")
            .addEscape('\u20AC', "&euro;")
            // Letter-like symbols (U+2111 - U+2135).
            .addEscape('\u2111', "&image;")
            .addEscape('\u2118', "&weierp;")
            .addEscape('\u211C', "&real;")
            .addEscape('\u2122', "&trade;")
            .addEscape('\u2135', "&alefsym;")
            // Arrows (U+2190 - U+21D4).
            .addEscape('\u2190', "&larr;")
            .addEscape('\u2191', "&uarr;")
            .addEscape('\u2192', "&rarr;")
            .addEscape('\u2193', "&darr;")
            .addEscape('\u2194', "&harr;")
            .addEscape('\u21B5', "&crarr;")
            .addEscape('\u21D0', "&lArr;")
            .addEscape('\u21D1', "&uArr;")
            .addEscape('\u21D2', "&rArr;")
            .addEscape('\u21D3', "&dArr;")
            .addEscape('\u21D4', "&hArr;")
            // Mathematical operators (U+2200 - U+22C5).
            .addEscape('\u2200', "&forall;")
            .addEscape('\u2202', "&part;")
            .addEscape('\u2203', "&exist;")
            .addEscape('\u2205', "&empty;")
            .addEscape('\u2207', "&nabla;")
            .addEscape('\u2208', "&isin;")
            .addEscape('\u2209', "&notin;")
            .addEscape('\u220B', "&ni;")
            .addEscape('\u220F', "&prod;")
            .addEscape('\u2211', "&sum;")
            .addEscape('\u2212', "&minus;")
            .addEscape('\u2217', "&lowast;")
            .addEscape('\u221A', "&radic;")
            .addEscape('\u221D', "&prop;")
            .addEscape('\u221E', "&infin;")
            .addEscape('\u2220', "&ang;")
            .addEscape('\u2227', "&and;")
            .addEscape('\u2228', "&or;")
            .addEscape('\u2229', "&cap;")
            .addEscape('\u222A', "&cup;")
            .addEscape('\u222B', "&int;")
            .addEscape('\u2234', "&there4;")
            .addEscape('\u223C', "&sim;")
            .addEscape('\u2245', "&cong;")
            .addEscape('\u2248', "&asymp;")
            .addEscape('\u2260', "&ne;")
            .addEscape('\u2261', "&equiv;")
            .addEscape('\u2264', "&le;")
            .addEscape('\u2265', "&ge;")
            .addEscape('\u2282', "&sub;")
            .addEscape('\u2283', "&sup;")
            .addEscape('\u2284', "&nsub;")
            .addEscape('\u2286', "&sube;")
            .addEscape('\u2287', "&supe;")
            .addEscape('\u2295', "&oplus;")
            .addEscape('\u2297', "&otimes;")
            .addEscape('\u22A5', "&perp;")
            .addEscape('\u22C5', "&sdot;")
            // Ceiling/floor and angle brackets (U+2308 - U+232A).
            .addEscape('\u2308', "&lceil;")
            .addEscape('\u2309', "&rceil;")
            .addEscape('\u230A', "&lfloor;")
            .addEscape('\u230B', "&rfloor;")
            .addEscape('\u2329', "&lang;")
            .addEscape('\u232A', "&rang;")
            // Lozenge and card suits (U+25CA - U+2666).
            .addEscape('\u25CA', "&loz;")
            .addEscape('\u2660', "&spades;")
            .addEscape('\u2663', "&clubs;")
            .addEscape('\u2665', "&hearts;")
            .addEscape('\u2666', "&diams;")
            .toArray());
  }
411
  /**
   * Returns a {@link CharEscaper} instance that escapes special characters in a
   * string so it can safely be included in an HTML document in either element
   * content or attribute values.
   *
   * <p><b>Note</b>: does not alter non-ASCII and control characters.
   *
   * @return the shared {@code ASCII_HTML_ESCAPER} instance
   */
  public static CharEscaper asciiHtmlEscaper() {
    return ASCII_HTML_ESCAPER;
  }
422
  /**
   * Escapes special characters from a string so it can safely be included in an
   * HTML document in either element content or attribute values. Does
   * <em>not</em> alter non-ASCII characters or control characters.
   *
   * <p>Only five characters have replacements: the double quote, the
   * apostrophe (as the numeric reference {@code &#39;}), the ampersand, and
   * the less-than and greater-than signs.
   */
  private static final CharEscaper ASCII_HTML_ESCAPER = new CharEscaperBuilder()
      .addEscape('"', "&quot;")
      .addEscape('\'', "&#39;")
      .addEscape('&', "&amp;")
      .addEscape('<', "&lt;")
      .addEscape('>', "&gt;")
      .toEscaper();
435
  /**
   * Returns an {@link Escaper} instance that escapes Java chars so they can be
   * safely included in URIs. For details on escaping URIs, see section 2.4 of
   * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>.
   *
   * <p>When encoding a String, the following rules apply:
   * <ul>
   * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
   *     through "9" remain the same.
   * <li>The special characters ".", "-", "*", and "_" remain the same.
   * <li>The space character " " is converted into a plus sign "+".
   * <li>All other characters are converted into one or more bytes using UTF-8
   *     encoding and each byte is then represented by the 3-character string
   *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
   *     representation of the byte value.
   * </ul>
   *
   * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
   * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
   * RFC 3986</a>:<br>
   * <i>"URI producers and normalizers should use uppercase hexadecimal digits
   * for all percent-encodings."</i>
   *
   * <p>This escaper has identical behavior to (but is potentially much faster
   * than):
   * <ul>
   * <li>{@link com.google.httputil.FastURLEncoder#encode(String)}
   * <li>{@link com.google.httputil.FastURLEncoder#encode(String,String)}
   *     with the encoding name "UTF-8"
   * <li>{@link java.net.URLEncoder#encode(String, String)}
   *     with the encoding name "UTF-8"
   * </ul>
   *
   * <p>This method is equivalent to {@code uriEscaper(true)}.
   *
   * @return the escaper returned by {@code uriEscaper(true)}
   */
  public static Escaper uriEscaper() {
    return uriEscaper(true);
  }
474
  /**
   * Returns an {@link Escaper} instance that escapes Java chars so they can be
   * safely included in URI path segments. For details on escaping URIs, see
   * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
   *
   * <p>When encoding a String, the following rules apply:
   * <ul>
   * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
   *     through "9" remain the same.
   * <li>The unreserved characters ".", "-", "~", and "_" remain the same.
   * <li>The general delimiters "@" and ":" remain the same.
   * <li>The subdelimiters "!", "$", "&amp;", "'", "(", ")", "*", ",", ";",
   *     and "=" remain the same.
   * <li>The space character " " is converted into %20.
   * <li>All other characters are converted into one or more bytes using UTF-8
   *     encoding and each byte is then represented by the 3-character string
   *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
   *     representation of the byte value.
   * </ul>
   *
   * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
   * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
   * RFC 3986</a>:<br>
   * <i>"URI producers and normalizers should use uppercase hexadecimal digits
   * for all percent-encodings."</i>
   *
   * @return the shared {@code URI_PATH_ESCAPER} instance
   */
  public static Escaper uriPathEscaper() {
    return URI_PATH_ESCAPER;
  }
504
  /**
   * Returns an {@link Escaper} instance that escapes Java chars so they can be
   * safely included in URI query string segments. When the query string
   * consists of a sequence of name=value pairs separated by &amp;, the names
   * and values should be individually encoded. If you escape an entire query
   * string in one pass with this escaper, then the "=" and "&amp;" characters
   * used as separators will also be escaped.
   *
   * <p>This escaper is also suitable for escaping fragment identifiers.
   *
   * <p>For details on escaping URIs, see
   * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
   *
   * <p>When encoding a String, the following rules apply:
   * <ul>
   * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
   *     through "9" remain the same.
   * <li>The unreserved characters ".", "-", "~", and "_" remain the same.
   * <li>The general delimiters "@" and ":" remain the same.
   * <li>The path delimiters "/" and "?" remain the same.
   * <li>The subdelimiters "!", "$", "'", "(", ")", "*", ",", and ";"
   *     remain the same.
   * <li>The space character " " is converted into %20.
   * <li>The equals sign "=" is converted into %3D.
   * <li>The ampersand "&amp;" is converted into %26.
   * <li>All other characters are converted into one or more bytes using UTF-8
   *     encoding and each byte is then represented by the 3-character string
   *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
   *     representation of the byte value.
   * </ul>
   *
   * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
   * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
   * RFC 3986</a>:<br>
   * <i>"URI producers and normalizers should use uppercase hexadecimal digits
   * for all percent-encodings."</i>
   *
   * <p>This method is equivalent to {@code uriQueryStringEscaper(false)}.
   *
   * @return the escaper returned by {@code uriQueryStringEscaper(false)}
   */
  public static Escaper uriQueryStringEscaper() {
    return uriQueryStringEscaper(false);
  }
547
548  /**
549   * Returns a {@link Escaper} instance that escapes Java characters so they can
550   * be safely included in URIs. For details on escaping URIs, see section 2.4
551   * of <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>.
552   *
553   * <p>When encoding a String, the following rules apply:
554   * <ul>
555   * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
556   *     through "9" remain the same.
557   * <li>The special characters ".", "-", "*", and "_" remain the same.
558   * <li>If {@code plusForSpace} was specified, the space character " " is
559   *     converted into a plus sign "+". Otherwise it is converted into "%20".
560   * <li>All other characters are converted into one or more bytes using UTF-8
561   *     encoding and each byte is then represented by the 3-character string
562   *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
563   *     representation of the byte value.
564   * </ul>
565   *
566   * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
567   * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
568   * RFC 3986</a>:<br>
569   * <i>"URI producers and normalizers should use uppercase hexadecimal digits
570   * for all percent-encodings."</i>
571   *
572   * @param plusForSpace if {@code true} space is escaped to {@code +} otherwise
573   *        it is escaped to {@code %20}. Although common, the escaping of
574   *        spaces as plus signs has a very ambiguous status in the relevant
575   *        specifications. You should prefer {@code %20} unless you are doing
576   *        exact character-by-character comparisons of URLs and backwards
577   *        compatibility requires you to use plus signs.
578   *
579   * @see #uriEscaper()
580   */
581  public static Escaper uriEscaper(boolean plusForSpace) {
582    return plusForSpace ? URI_ESCAPER : URI_ESCAPER_NO_PLUS;
583  }
584
585  /**
586   * Returns an {@link Escaper} instance that escapes Java chars so they can be
587   * safely included in URI query string segments. When the query string
588   * consists of a sequence of name=value pairs separated by &amp;, the names
589   * and values should be individually encoded. If you escape an entire query
590   * string in one pass with this escaper, then the "=" and "&amp;" characters
591   * used as separators will also be escaped.
592   *
593   * <p>This escaper is also suitable for escaping fragment identifiers.
594   *
595   * <p>For details on escaping URIs, see
596   * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
597   *
598   * <p>When encoding a String, the following rules apply:
599   * <ul>
600   * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
601   *     through "9" remain the same.
602   * <li>The unreserved characters ".", "-", "~", and "_" remain the same.
603   * <li>The general delimiters "@" and ":" remain the same.
604   * <li>The path delimiters "/" and "?" remain the same.
605   * <li>The subdelimiters "!", "$", "'", "(", ")", "*", ",", and ";",
606   *     remain the same.
607   * <li>If {@code plusForSpace} was specified, the space character " " is
608   *     converted into a plus sign "+". Otherwise it is converted into "%20".
609   * <li>The equals sign "=" is converted into %3D.
610   * <li>The ampersand "&amp;" is converted into %26.
611   * <li>All other characters are converted into one or more bytes using UTF-8
612   *     encoding and each byte is then represented by the 3-character string
613   *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
614   *     representation of the byte value.
615   * </ul>
616   *
617   * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
618   * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
619   * RFC 3986</a>:<br>
620   * <i>"URI producers and normalizers should use uppercase hexadecimal digits
621   * for all percent-encodings."</i>
622   *
623   * @param plusForSpace if {@code true} space is escaped to {@code +} otherwise
624   *        it is escaped to {@code %20}. Although common, the escaping of
625   *        spaces as plus signs has a very ambiguous status in the relevant
626   *        specifications. You should prefer {@code %20} unless you are doing
627   *        exact character-by-character comparisons of URLs and backwards
628   *        compatibility requires you to use plus signs.
629   *
630   * @see #uriQueryStringEscaper()
631   */
632  public static Escaper uriQueryStringEscaper(boolean plusForSpace) {
633    return plusForSpace ?
634           URI_QUERY_STRING_ESCAPER_WITH_PLUS : URI_QUERY_STRING_ESCAPER;
635  }
636
  // The boolean passed to each PercentEscaper below lines up with the
  // plusForSpace argument of the public accessors: the instances built with
  // 'true' are the ones returned when plusForSpace is true.
  // NOTE(review): presumably that constructor argument is PercentEscaper's
  // own plusForSpace flag -- confirm against PercentEscaper.

  /** Returned by {@code uriEscaper(true)}. */
  private static final Escaper URI_ESCAPER =
      new PercentEscaper(PercentEscaper.SAFECHARS_URLENCODER, true);

  /** Returned by {@code uriEscaper(false)}. */
  private static final Escaper URI_ESCAPER_NO_PLUS =
      new PercentEscaper(PercentEscaper.SAFECHARS_URLENCODER, false);

  /** Returned by {@code uriPathEscaper()}. */
  private static final Escaper URI_PATH_ESCAPER =
      new PercentEscaper(PercentEscaper.SAFEPATHCHARS_URLENCODER, false);

  /** Returned by {@code uriQueryStringEscaper(false)}. */
  private static final Escaper URI_QUERY_STRING_ESCAPER =
      new PercentEscaper(PercentEscaper.SAFEQUERYSTRINGCHARS_URLENCODER, false);

  /** Returned by {@code uriQueryStringEscaper(true)}. */
  private static final Escaper URI_QUERY_STRING_ESCAPER_WITH_PLUS =
      new PercentEscaper(PercentEscaper.SAFEQUERYSTRINGCHARS_URLENCODER, true);
651
  /**
   * Returns a {@link Escaper} instance that escapes Java characters in a manner
   * compatible with the C++ webutil/url URL class (the {@code kGoogle1Escape}
   * set).
   *
   * <p>When encoding a String, the following rules apply:
   * <ul>
   * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
   *     through "9" remain the same.
   * <li>The special characters "!", "(", ")", "*", "-", ".", "_", "~", ",", "/"
   *     and ":" remain the same.
   * <li>The space character " " is converted into a plus sign "+".
   * <li>All other characters are converted into one or more bytes using UTF-8
   *     encoding and each byte is then represented by the 3-character string
   *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
   *     representation of the byte value.
   * </ul>
   *
   * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
   * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
   * RFC 3986</a>:<br>
   * <i>"URI producers and normalizers should use uppercase hexadecimal digits
   * for all percent-encodings."</i>
   *
   * <p><b>Note</b>: This escaper is a special case and is <em>not
   * compliant</em> with <a href="http://www.ietf.org/rfc/rfc2396.txt">
   * RFC 2396</a>. Specifically it will not escape "/", ":" and ",". This is
   * only provided for certain limited use cases and you should favor using
   * {@link #uriEscaper()} whenever possible.
   *
   * @return the shared {@code CPP_URI_ESCAPER} instance
   */
  public static Escaper cppUriEscaper() {
    return CPP_URI_ESCAPER;
  }
685
  // Escaper returned by cppUriEscaper().
  // Based on comments from FastURLEncoder:
  // These octets mimic the ones escaped by the C++ webutil/url URL class --
  // the kGoogle1Escape set.
  // To produce the same escaping as C++, use this set with the plusForSpace
  // option (the 'true' argument below).
  // WARNING: Contrary to RFC 2396 ",", "/" and ":" are listed as safe here.
  private static final Escaper CPP_URI_ESCAPER =
      new PercentEscaper("!()*-._~,/:", true);
694
  /**
   * Returns a {@link CharEscaper} instance that escapes special characters in a
   * string so it can safely be included in a Java string literal.
   *
   * <p><b>Note</b>: does not escape single quotes, so use the escaper
   * returned by {@link #javaCharEscaper()} if you are generating char
   * literals or if you are unsure.
   *
   * @return the shared {@code JAVA_STRING_ESCAPER} instance
   */
  public static CharEscaper javaStringEscaper() {
    return JAVA_STRING_ESCAPER;
  }
706
  /**
   * Escapes special characters from a string so it can safely be included in a
   * Java string literal. Does <em>not</em> escape single-quotes, so use
   * {@code JAVA_CHAR_ESCAPER} if you are generating char literals, or if you
   * are unsure.
   *
   * <p>Note that non-ASCII characters will be octal or Unicode escaped.
   *
   * <p>The table below gives explicit replacements only for backspace, form
   * feed, newline, carriage return, tab, double quote and backslash; it is
   * handed to {@code JavaCharEscaper}, which is not shown in this section.
   */
  private static final CharEscaper JAVA_STRING_ESCAPER
      = new JavaCharEscaper(new CharEscaperBuilder()
          .addEscape('\b', "\\b")
          .addEscape('\f', "\\f")
          .addEscape('\n', "\\n")
          .addEscape('\r', "\\r")
          .addEscape('\t', "\\t")
          .addEscape('\"', "\\\"")
          .addEscape('\\', "\\\\")
          .toArray());
724
725  /**
726   * Returns a {@link CharEscaper} instance that escapes special characters in a
727   * string so it can safely be included in a Java char or string literal. The
728   * behavior of this escaper is the same as that of the
729   * {@link #javaStringEscaper()}, except it also escapes single quotes.
730   */
731  public static CharEscaper javaCharEscaper() {
732    return JAVA_CHAR_ESCAPER;
733  }
734
735  /**
736   * Escapes special characters from a string so it can safely be included in a
737   * Java char literal or string literal.
738   *
739   * <p>Note that non-ASCII characters will be octal or Unicode escaped.
740   *
741   * <p>This is the same as {@link #JAVA_STRING_ESCAPER}, except that it escapes
742   * single quotes.
743   */
744  private static final CharEscaper JAVA_CHAR_ESCAPER
745      = new JavaCharEscaper(new CharEscaperBuilder()
746          .addEscape('\b', "\\b")
747          .addEscape('\f', "\\f")
748          .addEscape('\n', "\\n")
749          .addEscape('\r', "\\r")
750          .addEscape('\t', "\\t")
751          .addEscape('\'', "\\'")
752          .addEscape('\"', "\\\"")
753          .addEscape('\\', "\\\\")
754          .toArray());
755
756  /**
757   * Returns a {@link CharEscaper} instance that replaces non-ASCII characters
758   * in a string with their Unicode escape sequences ({@code \\uxxxx} where
759   * {@code xxxx} is a hex number). Existing escape sequences won't be affected.
760   */
761  public static CharEscaper javaStringUnicodeEscaper() {
762    return JAVA_STRING_UNICODE_ESCAPER;
763  }
764
765  /**
766   * Escapes each non-ASCII character in with its Unicode escape sequence
767   * {@code \\uxxxx} where {@code xxxx} is a hex number. Existing escape
768   * sequences won't be affected.
769   */
770  private static final CharEscaper JAVA_STRING_UNICODE_ESCAPER
771      = new CharEscaper() {
772          @Override protected char[] escape(char c) {
773            if (c <= 127) {
774              return null;
775            }
776
777            char[] r = new char[6];
778            r[5] = HEX_DIGITS[c & 15];
779            c >>>= 4;
780            r[4] = HEX_DIGITS[c & 15];
781            c >>>= 4;
782            r[3] = HEX_DIGITS[c & 15];
783            c >>>= 4;
784            r[2] = HEX_DIGITS[c & 15];
785            r[1] = 'u';
786            r[0] = '\\';
787            return r;
788          }
789        };
790
791  /**
792   * Returns a {@link CharEscaper} instance that escapes special characters from
793   * a string so it can safely be included in a Python string literal. Does not
794   * have any special handling for non-ASCII characters.
795   */
796  public static CharEscaper pythonEscaper() {
797    return PYTHON_ESCAPER;
798  }
799
800  /**
801   * Escapes special characters in a string so it can safely be included in a
802   * Python string literal. Does not have any special handling for non-ASCII
803   * characters.
804   */
805  private static final CharEscaper PYTHON_ESCAPER = new CharEscaperBuilder()
806      // TODO(laurence): perhaps this should escape non-ASCII characters?
807      .addEscape('\n', "\\n")
808      .addEscape('\r', "\\r")
809      .addEscape('\t', "\\t")
810      .addEscape('\\', "\\\\")
811      .addEscape('\"', "\\\"")
812      .addEscape('\'', "\\\'")
813      .toEscaper();
814
815  /**
816   * Returns a {@link CharEscaper} instance that escapes non-ASCII characters in
817   * a string so it can safely be included in a Javascript string literal.
818   * Non-ASCII characters are replaced with their ASCII javascript escape
819   * sequences (e.g., \\uhhhh or \xhh).
820   */
821  public static CharEscaper javascriptEscaper() {
822    return JAVASCRIPT_ESCAPER;
823  }
824
825  /**
826   * {@code CharEscaper} to escape javascript strings. Turns all non-ASCII
827   * characters into ASCII javascript escape sequences (e.g., \\uhhhh or \xhh).
828   */
829  private static final CharEscaper JAVASCRIPT_ESCAPER
830      = new JavascriptCharEscaper(new CharEscaperBuilder()
831          .addEscape('\'', "\\x27")
832          .addEscape('"',  "\\x22")
833          .addEscape('<',  "\\x3c")
834          .addEscape('=',  "\\x3d")
835          .addEscape('>',  "\\x3e")
836          .addEscape('&',  "\\x26")
837          .addEscape('\b', "\\b")
838          .addEscape('\t', "\\t")
839          .addEscape('\n', "\\n")
840          .addEscape('\f', "\\f")
841          .addEscape('\r', "\\r")
842          .addEscape('\\', "\\\\")
843          .toArray());
844
845  private static CharEscaperBuilder newBasicXmlEscapeBuilder() {
846    return new CharEscaperBuilder()
847        .addEscape('&', "&amp;")
848        .addEscape('<', "&lt;")
849        .addEscape('>', "&gt;")
850        .addEscapes(new char[] {
851            '\000', '\001', '\002', '\003', '\004',
852            '\005', '\006', '\007', '\010', '\013',
853            '\014', '\016', '\017', '\020', '\021',
854            '\022', '\023', '\024', '\025', '\026',
855            '\027', '\030', '\031', '\032', '\033',
856            '\034', '\035', '\036', '\037'}, "");
857  }
858
859  /**
860   * Returns a composite {@link CharEscaper} instance that tries to escape
861   * characters using a primary {@code CharEscaper} first and falls back to a
862   * secondary one if there is no escaping.
863   *
864   * <p>The returned escaper will attempt to escape each character using the
865   * primary escaper, and if the primary escaper has no escaping for that
866   * character, it will use the secondary escaper. If the secondary escaper has
867   * no escaping for a character either, the original character will be used.
868   * If the primary escaper has an escape for a character, the secondary escaper
869   * will not be used at all for that character; the escaped output of the
870   * primary is not run through the secondary. For a case where you would like
871   * to first escape with one escaper, and then with another, it is recommended
872   * that you call each escaper in order.
873   *
874   * @param primary The primary {@code CharEscaper} to use
875   * @param secondary The secondary {@code CharEscaper} to use if the first one
876   *     has no escaping rule for a character
877   * @throws NullPointerException if any of the arguments is null
878   */
879  public static CharEscaper fallThrough(CharEscaper primary,
880      CharEscaper secondary) {
881    checkNotNull(primary);
882    checkNotNull(secondary);
883    return new FallThroughCharEscaper(primary, secondary);
884  }
885
886  /**
887   * A fast {@link CharEscaper} that uses an array of replacement characters and
888   * a range of safe characters. It overrides {@link #escape(String)} to improve
889   * performance. Rough benchmarking shows that this almost doubles the speed
890   * when processing strings that do not require escaping (providing the escape
891   * test itself is efficient).
892   */
893  private static abstract class FastCharEscaper extends CharEscaper {
894
895    protected final char[][] replacements;
896    protected final int replacementLength;
897    protected final char safeMin;
898    protected final char safeMax;
899
900    public FastCharEscaper(char[][] replacements, char safeMin, char safeMax) {
901      this.replacements = replacements;
902      this.replacementLength = replacements.length;
903      this.safeMin = safeMin;
904      this.safeMax = safeMax;
905    }
906
907    /** Overridden for performance (see {@link FastCharEscaper}). */
908    @Override public String escape(String s) {
909      int slen = s.length();
910      for (int index = 0; index < slen; index++) {
911        char c = s.charAt(index);
912        if ((c < replacementLength && replacements[c] != null)
913            || c < safeMin || c > safeMax) {
914          return escapeSlow(s, index);
915        }
916      }
917      return s;
918    }
919  }
920
921  /**
922   * Escaper for Java character escaping, contains both an array and a
923   * backup function.  We're not overriding the array decorator because we
924   * want to keep this as fast as possible, so no calls to super.escape first.
925   */
926  private static class JavaCharEscaper extends FastCharEscaper {
927
928    public JavaCharEscaper(char[][] replacements) {
929      super(replacements, ' ', '~');
930    }
931
932    @Override protected char[] escape(char c) {
933      // First check if our array has a valid escaping.
934      if (c < replacementLength) {
935        char[] r = replacements[c];
936        if (r != null) {
937          return r;
938        }
939      }
940
941      // This range is un-escaped.
942      if (safeMin <= c && c <= safeMax) {
943        return null;
944      }
945
946      if (c <= 0xFF) {
947        // Convert c to an octal-escaped string.
948        // Equivalent to String.format("\\%03o", (int)c);
949        char[] r = new char[4];
950        r[0] = '\\';
951        r[3] = HEX_DIGITS[c & 7];
952        c >>>= 3;
953        r[2] = HEX_DIGITS[c & 7];
954        c >>>= 3;
955        r[1] = HEX_DIGITS[c & 7];
956        return r;
957      }
958
959      // Convert c to a hex-escaped string.
960      // Equivalent to String.format("\\u%04x", (int)c);
961      char[] r = new char[6];
962      r[0] = '\\';
963      r[1] = 'u';
964      r[5] = HEX_DIGITS[c & 15];
965      c >>>= 4;
966      r[4] = HEX_DIGITS[c & 15];
967      c >>>= 4;
968      r[3] = HEX_DIGITS[c & 15];
969      c >>>= 4;
970      r[2] = HEX_DIGITS[c & 15];
971      return r;
972    }
973  }
974
975  /**
976   * Escaper for javascript character escaping, contains both an array and a
977   * backup function. We're not overriding the array decorator because we
978   * want to keep this as fast as possible, so no calls to super.escape first.
979   */
980  private static class JavascriptCharEscaper extends FastCharEscaper {
981
982    public JavascriptCharEscaper(char[][] replacements) {
983      super(replacements, ' ', '~');
984    }
985
986    @Override protected char[] escape(char c) {
987      // First check if our array has a valid escaping.
988      if (c < replacementLength) {
989        char[] r = replacements[c];
990        if (r != null) {
991          return r;
992        }
993      }
994
995      // This range is unescaped.
996      if (safeMin <= c && c <= safeMax) {
997        return null;
998      }
999
1000      // we can do a 2 digit hex escape for chars less that 0x100
1001      if (c < 0x100) {
1002        char[] r = new char[4];
1003        r[3] = HEX_DIGITS[c & 0xf];
1004        c >>>= 4;
1005        r[2] = HEX_DIGITS[c & 0xf];
1006        r[1] = 'x';
1007        r[0] = '\\';
1008        return r;
1009      }
1010
1011      // 4 digit hex escape everything else
1012      char[] r = new char[6];
1013      r[5] = HEX_DIGITS[c & 0xf];
1014      c >>>= 4;
1015      r[4] = HEX_DIGITS[c & 0xf];
1016      c >>>= 4;
1017      r[3] = HEX_DIGITS[c & 0xf];
1018      c >>>= 4;
1019      r[2] = HEX_DIGITS[c & 0xf];
1020      r[1] = 'u';
1021      r[0] = '\\';
1022      return r;
1023    }
1024  }
1025
1026  /**
1027   * Escaper for HTML character escaping, contains both an array and a
1028   * backup function.  We're not overriding the array decorator because we
1029   * want to keep this as fast as possible, so no calls to super.escape first.
1030   */
1031  private static class HtmlCharEscaper extends FastCharEscaper {
1032
1033    public HtmlCharEscaper(char[][] replacements) {
1034      super(replacements, Character.MIN_VALUE, '~');
1035    }
1036
1037    @Override protected char[] escape(char c) {
1038      // First check if our array has a valid escaping.
1039      if (c < replacementLength) {
1040        char[] r = replacements[c];
1041        if (r != null) {
1042          return r;
1043        }
1044      }
1045
1046      // ~ is ASCII 126, the highest value char that does not need
1047      // to be escaped
1048      if (c <= safeMax) {
1049        return null;
1050      }
1051
1052      int index;
1053      if (c < 1000) {
1054        index = 4;
1055      } else if (c < 10000) {
1056        index = 5;
1057      } else {
1058        index = 6;
1059      }
1060      char[] result = new char[index + 2];
1061      result[0] = '&';
1062      result[1] = '#';
1063      result[index + 1] = ';';
1064
1065      // TODO(sven): Convert this to a sequence of shifts/additions
1066      // to avoid the division and modulo operators.
1067      int intValue = c;
1068      for (; index > 1; index--) {
1069        result[index] = HEX_DIGITS[intValue % 10];
1070        intValue /= 10;
1071      }
1072      return result;
1073    }
1074  }
1075
1076  /**
1077   * A composite {@code CharEscaper} object that tries to escape characters
1078   * using a primary {@code CharEscaper} first and falls back to a secondary
1079   * one if there is no escaping.
1080   */
1081  private static class FallThroughCharEscaper extends CharEscaper {
1082
1083    private final CharEscaper primary;
1084    private final CharEscaper secondary;
1085
1086    public FallThroughCharEscaper(CharEscaper primary, CharEscaper secondary) {
1087      this.primary = primary;
1088      this.secondary = secondary;
1089    }
1090
1091    @Override
1092    protected char[] escape(char c) {
1093      char result[] = primary.escape(c);
1094      if (result == null) {
1095        result = secondary.escape(c);
1096      }
1097      return result;
1098    }
1099  }
1100
1101  private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();
1102}