1/*
2 * Copyright (C) 2008 Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16package com.android.mail.lib.base;
17
18import static com.android.mail.lib.base.Preconditions.checkNotNull;
19
20/**
21 * A {@code UnicodeEscaper} that escapes some set of Java characters using
22 * the URI percent encoding scheme. The set of safe characters (those which
23 * remain unescaped) can be specified on construction.
24 *
25 * <p>For details on escaping URIs for use in web pages, see section 2.4 of
26 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
27 *
28 * <p>In most cases this class should not need to be used directly. If you
29 * have no special requirements for escaping your URIs, you should use either
30 * {@link CharEscapers#uriEscaper()} or
31 * {@link CharEscapers#uriEscaper(boolean)}.
32 *
33 * <p>When encoding a String, the following rules apply:
34 * <ul>
35 * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
36 * through "9" remain the same.
37 * <li>Any additionally specified safe characters remain the same.
38 * <li>If {@code plusForSpace} was specified, the space character " " is
39 * converted into a plus sign "+".
40 * <li>All other characters are converted into one or more bytes using UTF-8
41 *     encoding and each byte is then represented by the 3-character string
42 *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal representation
43 *     of the byte value.
44 * </ul>
45 *
46 * <p>RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!",
47 * "~", "*", "'", "(" and ")". It goes on to state:
48 *
49 * <p><i>Unreserved characters can be escaped without changing the semantics
50 * of the URI, but this should not be done unless the URI is being used
51 * in a context that does not allow the unescaped character to appear.</i>
52 *
53 * <p>For performance reasons the only currently supported character encoding of
54 * this class is UTF-8.
55 *
56 * <p><b>Note</b>: This escaper produces uppercase hexadecimal sequences. From
57 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
58 * <i>"URI producers and normalizers should use uppercase hexadecimal digits
59 * for all percent-encodings."</i>
60 *
61 * @author dbeaumont@google.com (David Beaumont)
62 */
63public class PercentEscaper extends UnicodeEscaper {
64  /**
65   * A string of safe characters that mimics the behavior of
66   * {@link java.net.URLEncoder}.
67   *
68   * TODO(dbeaumont): Fix escapers to be compliant with RFC 3986
69   */
70  public static final String SAFECHARS_URLENCODER = "-_.*";
71
72  /**
73   * A string of characters that do not need to be encoded when used in URI
74   * path segments, as specified in RFC 3986. Note that some of these
75   * characters do need to be escaped when used in other parts of the URI.
76   */
77  public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;=";
78
79  /**
80   * A string of characters that do not need to be encoded when used in URI
81   * query strings, as specified in RFC 3986. Note that some of these
82   * characters do need to be escaped when used in other parts of the URI.
83   */
84  public static final String SAFEQUERYSTRINGCHARS_URLENCODER
85      = "-_.!~*'()@:$,;/?:";
86
87  // In some uri escapers spaces are escaped to '+'
88  private static final char[] URI_ESCAPED_SPACE = { '+' };
89
90  // TODO(dbeaumont): Remove this once UriEscaper uses lower case
91  private static final char[] UPPER_HEX_DIGITS =
92      "0123456789ABCDEF".toCharArray();
93
94  /**
95   * If true we should convert space to the {@code +} character.
96   */
97  private final boolean plusForSpace;
98
99  /**
100   * An array of flags where for any {@code char c} if {@code safeOctets[c]} is
101   * true then {@code c} should remain unmodified in the output. If
102   * {@code c > safeOctets.length} then it should be escaped.
103   */
104  private final boolean[] safeOctets;
105
106  /**
107   * Constructs a URI escaper with the specified safe characters and optional
108   * handling of the space character.
109   *
110   * @param safeChars a non null string specifying additional safe characters
111   *        for this escaper (the ranges 0..9, a..z and A..Z are always safe and
112   *        should not be specified here)
113   * @param plusForSpace true if ASCII space should be escaped to {@code +}
114   *        rather than {@code %20}
115   * @throws IllegalArgumentException if any of the parameters were invalid
116   */
117  public PercentEscaper(String safeChars, boolean plusForSpace) {
118    checkNotNull(safeChars);  // eager for GWT.
119
120    // Avoid any misunderstandings about the behavior of this escaper
121    if (safeChars.matches(".*[0-9A-Za-z].*")) {
122      throw new IllegalArgumentException(
123          "Alphanumeric characters are always 'safe' and should not be " +
124          "explicitly specified");
125    }
126    // Avoid ambiguous parameters. Safe characters are never modified so if
127    // space is a safe character then setting plusForSpace is meaningless.
128    if (plusForSpace && safeChars.contains(" ")) {
129      throw new IllegalArgumentException(
130          "plusForSpace cannot be specified when space is a 'safe' character");
131    }
132    if (safeChars.contains("%")) {
133      throw new IllegalArgumentException(
134          "The '%' character cannot be specified as 'safe'");
135    }
136    this.plusForSpace = plusForSpace;
137    this.safeOctets = createSafeOctets(safeChars);
138  }
139
140  /**
141   * Creates a boolean[] with entries corresponding to the character values
142   * for 0-9, A-Z, a-z and those specified in safeChars set to true. The array
143   * is as small as is required to hold the given character information.
144   */
145  private static boolean[] createSafeOctets(String safeChars) {
146    int maxChar = 'z';
147    char[] safeCharArray = safeChars.toCharArray();
148    for (char c : safeCharArray) {
149      maxChar = Math.max(c, maxChar);
150    }
151    boolean[] octets = new boolean[maxChar + 1];
152    for (int c = '0'; c <= '9'; c++) {
153      octets[c] = true;
154    }
155    for (int c = 'A'; c <= 'Z'; c++) {
156      octets[c] = true;
157    }
158    for (int c = 'a'; c <= 'z'; c++) {
159      octets[c] = true;
160    }
161    for (char c : safeCharArray) {
162      octets[c] = true;
163    }
164    return octets;
165  }
166
167  /*
168   * Overridden for performance. For unescaped strings this improved the
169   * performance of the uri escaper from ~760ns to ~400ns as measured by
170   * {@link CharEscapersBenchmark}.
171   */
172  @Override
173  protected int nextEscapeIndex(CharSequence csq, int index, int end) {
174    for (; index < end; index++) {
175      char c = csq.charAt(index);
176      if (c >= safeOctets.length || !safeOctets[c]) {
177        break;
178      }
179    }
180    return index;
181  }
182
183  /*
184   * Overridden for performance. For unescaped strings this improved the
185   * performance of the uri escaper from ~400ns to ~170ns as measured by
186   * {@link CharEscapersBenchmark}.
187   */
188  @Override
189  public String escape(String s) {
190    checkNotNull(s);
191    int slen = s.length();
192    for (int index = 0; index < slen; index++) {
193      char c = s.charAt(index);
194      if (c >= safeOctets.length || !safeOctets[c]) {
195        return escapeSlow(s, index);
196      }
197    }
198    return s;
199  }
200
201  /**
202   * Escapes the given Unicode code point in UTF-8.
203   */
204  @Override
205  protected char[] escape(int cp) {
206    // We should never get negative values here but if we do it will throw an
207    // IndexOutOfBoundsException, so at least it will get spotted.
208    if (cp < safeOctets.length && safeOctets[cp]) {
209      return null;
210    } else if (cp == ' ' && plusForSpace) {
211      return URI_ESCAPED_SPACE;
212    } else if (cp <= 0x7F) {
213      // Single byte UTF-8 characters
214      // Start with "%--" and fill in the blanks
215      char[] dest = new char[3];
216      dest[0] = '%';
217      dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
218      dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
219      return dest;
220    } else if (cp <= 0x7ff) {
221      // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
222      // Start with "%--%--" and fill in the blanks
223      char[] dest = new char[6];
224      dest[0] = '%';
225      dest[3] = '%';
226      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
227      cp >>>= 4;
228      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
229      cp >>>= 2;
230      dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
231      cp >>>= 4;
232      dest[1] = UPPER_HEX_DIGITS[0xC | cp];
233      return dest;
234    } else if (cp <= 0xffff) {
235      // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
236      // Start with "%E-%--%--" and fill in the blanks
237      char[] dest = new char[9];
238      dest[0] = '%';
239      dest[1] = 'E';
240      dest[3] = '%';
241      dest[6] = '%';
242      dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
243      cp >>>= 4;
244      dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
245      cp >>>= 2;
246      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
247      cp >>>= 4;
248      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
249      cp >>>= 2;
250      dest[2] = UPPER_HEX_DIGITS[cp];
251      return dest;
252    } else if (cp <= 0x10ffff) {
253      char[] dest = new char[12];
254      // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
255      // Start with "%F-%--%--%--" and fill in the blanks
256      dest[0] = '%';
257      dest[1] = 'F';
258      dest[3] = '%';
259      dest[6] = '%';
260      dest[9] = '%';
261      dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
262      cp >>>= 4;
263      dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
264      cp >>>= 2;
265      dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
266      cp >>>= 4;
267      dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
268      cp >>>= 2;
269      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
270      cp >>>= 4;
271      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
272      cp >>>= 2;
273      dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
274      return dest;
275    } else {
276      // If this ever happens it is due to bug in UnicodeEscaper, not bad input.
277      throw new IllegalArgumentException(
278          "Invalid unicode character value " + cp);
279    }
280  }
281}