1/**
2 * Copyright (c) 2008, Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.android.mail.common.base;
18
19import static com.google.android.mail.common.base.Preconditions.checkNotNull;
20
21/**
22 * A {@code UnicodeEscaper} that escapes some set of Java characters using
23 * the URI percent encoding scheme. The set of safe characters (those which
24 * remain unescaped) can be specified on construction.
25 *
26 * <p>For details on escaping URIs for use in web pages, see section 2.4 of
27 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
28 *
29 * <p>In most cases this class should not need to be used directly. If you
30 * have no special requirements for escaping your URIs, you should use either
31 * {@link CharEscapers#uriEscaper()} or
32 * {@link CharEscapers#uriEscaper(boolean)}.
33 *
34 * <p>When encoding a String, the following rules apply:
35 * <ul>
36 * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
37 * through "9" remain the same.
38 * <li>Any additionally specified safe characters remain the same.
39 * <li>If {@code plusForSpace} was specified, the space character " " is
40 * converted into a plus sign "+".
41 * <li>All other characters are converted into one or more bytes using UTF-8
42 *     encoding and each byte is then represented by the 3-character string
43 *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal representation
44 *     of the byte value.
45 * </ul>
46 *
47 * <p>RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!",
48 * "~", "*", "'", "(" and ")". It goes on to state:
49 *
50 * <p><i>Unreserved characters can be escaped without changing the semantics
51 * of the URI, but this should not be done unless the URI is being used
52 * in a context that does not allow the unescaped character to appear.</i>
53 *
54 * <p>For performance reasons the only currently supported character encoding of
55 * this class is UTF-8.
56 *
57 * <p><b>Note</b>: This escaper produces uppercase hexadecimal sequences. From
58 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
59 * <i>"URI producers and normalizers should use uppercase hexadecimal digits
60 * for all percent-encodings."</i>
61 *
62 * @author dbeaumont@google.com (David Beaumont)
63 */
64public class PercentEscaper extends UnicodeEscaper {
65  /**
66   * A string of safe characters that mimics the behavior of
67   * {@link java.net.URLEncoder}.
68   *
69   * TODO(dbeaumont): Fix escapers to be compliant with RFC 3986
70   */
71  public static final String SAFECHARS_URLENCODER = "-_.*";
72
73  /**
74   * A string of characters that do not need to be encoded when used in URI
75   * path segments, as specified in RFC 3986. Note that some of these
76   * characters do need to be escaped when used in other parts of the URI.
77   */
78  public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;=";
79
80  /**
81   * A string of characters that do not need to be encoded when used in URI
82   * query strings, as specified in RFC 3986. Note that some of these
83   * characters do need to be escaped when used in other parts of the URI.
84   */
85  public static final String SAFEQUERYSTRINGCHARS_URLENCODER
86      = "-_.!~*'()@:$,;/?:";
87
88  // In some uri escapers spaces are escaped to '+'
89  private static final char[] URI_ESCAPED_SPACE = { '+' };
90
91  // TODO(dbeaumont): Remove this once UriEscaper uses lower case
92  private static final char[] UPPER_HEX_DIGITS =
93      "0123456789ABCDEF".toCharArray();
94
95  /**
96   * If true we should convert space to the {@code +} character.
97   */
98  private final boolean plusForSpace;
99
100  /**
101   * An array of flags where for any {@code char c} if {@code safeOctets[c]} is
102   * true then {@code c} should remain unmodified in the output. If
103   * {@code c > safeOctets.length} then it should be escaped.
104   */
105  private final boolean[] safeOctets;
106
107  /**
108   * Constructs a URI escaper with the specified safe characters and optional
109   * handling of the space character.
110   *
111   * @param safeChars a non null string specifying additional safe characters
112   *        for this escaper (the ranges 0..9, a..z and A..Z are always safe and
113   *        should not be specified here)
114   * @param plusForSpace true if ASCII space should be escaped to {@code +}
115   *        rather than {@code %20}
116   * @throws IllegalArgumentException if any of the parameters were invalid
117   */
118  public PercentEscaper(String safeChars, boolean plusForSpace) {
119    checkNotNull(safeChars);  // eager for GWT.
120
121    // Avoid any misunderstandings about the behavior of this escaper
122    if (safeChars.matches(".*[0-9A-Za-z].*")) {
123      throw new IllegalArgumentException(
124          "Alphanumeric characters are always 'safe' and should not be " +
125          "explicitly specified");
126    }
127    // Avoid ambiguous parameters. Safe characters are never modified so if
128    // space is a safe character then setting plusForSpace is meaningless.
129    if (plusForSpace && safeChars.contains(" ")) {
130      throw new IllegalArgumentException(
131          "plusForSpace cannot be specified when space is a 'safe' character");
132    }
133    if (safeChars.contains("%")) {
134      throw new IllegalArgumentException(
135          "The '%' character cannot be specified as 'safe'");
136    }
137    this.plusForSpace = plusForSpace;
138    this.safeOctets = createSafeOctets(safeChars);
139  }
140
141  /**
142   * Creates a boolean[] with entries corresponding to the character values
143   * for 0-9, A-Z, a-z and those specified in safeChars set to true. The array
144   * is as small as is required to hold the given character information.
145   */
146  private static boolean[] createSafeOctets(String safeChars) {
147    int maxChar = 'z';
148    char[] safeCharArray = safeChars.toCharArray();
149    for (char c : safeCharArray) {
150      maxChar = Math.max(c, maxChar);
151    }
152    boolean[] octets = new boolean[maxChar + 1];
153    for (int c = '0'; c <= '9'; c++) {
154      octets[c] = true;
155    }
156    for (int c = 'A'; c <= 'Z'; c++) {
157      octets[c] = true;
158    }
159    for (int c = 'a'; c <= 'z'; c++) {
160      octets[c] = true;
161    }
162    for (char c : safeCharArray) {
163      octets[c] = true;
164    }
165    return octets;
166  }
167
168  /*
169   * Overridden for performance. For unescaped strings this improved the
170   * performance of the uri escaper from ~760ns to ~400ns as measured by
171   * {@link CharEscapersBenchmark}.
172   */
173  @Override
174  protected int nextEscapeIndex(CharSequence csq, int index, int end) {
175    for (; index < end; index++) {
176      char c = csq.charAt(index);
177      if (c >= safeOctets.length || !safeOctets[c]) {
178        break;
179      }
180    }
181    return index;
182  }
183
184  /*
185   * Overridden for performance. For unescaped strings this improved the
186   * performance of the uri escaper from ~400ns to ~170ns as measured by
187   * {@link CharEscapersBenchmark}.
188   */
189  @Override
190  public String escape(String s) {
191    checkNotNull(s);
192    int slen = s.length();
193    for (int index = 0; index < slen; index++) {
194      char c = s.charAt(index);
195      if (c >= safeOctets.length || !safeOctets[c]) {
196        return escapeSlow(s, index);
197      }
198    }
199    return s;
200  }
201
202  /**
203   * Escapes the given Unicode code point in UTF-8.
204   */
205  @Override
206  protected char[] escape(int cp) {
207    // We should never get negative values here but if we do it will throw an
208    // IndexOutOfBoundsException, so at least it will get spotted.
209    if (cp < safeOctets.length && safeOctets[cp]) {
210      return null;
211    } else if (cp == ' ' && plusForSpace) {
212      return URI_ESCAPED_SPACE;
213    } else if (cp <= 0x7F) {
214      // Single byte UTF-8 characters
215      // Start with "%--" and fill in the blanks
216      char[] dest = new char[3];
217      dest[0] = '%';
218      dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
219      dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
220      return dest;
221    } else if (cp <= 0x7ff) {
222      // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
223      // Start with "%--%--" and fill in the blanks
224      char[] dest = new char[6];
225      dest[0] = '%';
226      dest[3] = '%';
227      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
228      cp >>>= 4;
229      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
230      cp >>>= 2;
231      dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
232      cp >>>= 4;
233      dest[1] = UPPER_HEX_DIGITS[0xC | cp];
234      return dest;
235    } else if (cp <= 0xffff) {
236      // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
237      // Start with "%E-%--%--" and fill in the blanks
238      char[] dest = new char[9];
239      dest[0] = '%';
240      dest[1] = 'E';
241      dest[3] = '%';
242      dest[6] = '%';
243      dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
244      cp >>>= 4;
245      dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
246      cp >>>= 2;
247      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
248      cp >>>= 4;
249      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
250      cp >>>= 2;
251      dest[2] = UPPER_HEX_DIGITS[cp];
252      return dest;
253    } else if (cp <= 0x10ffff) {
254      char[] dest = new char[12];
255      // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
256      // Start with "%F-%--%--%--" and fill in the blanks
257      dest[0] = '%';
258      dest[1] = 'F';
259      dest[3] = '%';
260      dest[6] = '%';
261      dest[9] = '%';
262      dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
263      cp >>>= 4;
264      dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
265      cp >>>= 2;
266      dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
267      cp >>>= 4;
268      dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
269      cp >>>= 2;
270      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
271      cp >>>= 4;
272      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
273      cp >>>= 2;
274      dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
275      return dest;
276    } else {
277      // If this ever happens it is due to bug in UnicodeEscaper, not bad input.
278      throw new IllegalArgumentException(
279          "Invalid unicode character value " + cp);
280    }
281  }
282}