1/* 2 * Copyright (C) 2008 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16package com.android.mail.lib.base; 17 18import static com.android.mail.lib.base.Preconditions.checkNotNull; 19 20/** 21 * A {@code UnicodeEscaper} that escapes some set of Java characters using 22 * the URI percent encoding scheme. The set of safe characters (those which 23 * remain unescaped) can be specified on construction. 24 * 25 * <p>For details on escaping URIs for use in web pages, see section 2.4 of 26 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>. 27 * 28 * <p>In most cases this class should not need to be used directly. If you 29 * have no special requirements for escaping your URIs, you should use either 30 * {@link CharEscapers#uriEscaper()} or 31 * {@link CharEscapers#uriEscaper(boolean)}. 32 * 33 * <p>When encoding a String, the following rules apply: 34 * <ul> 35 * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" 36 * through "9" remain the same. 37 * <li>Any additionally specified safe characters remain the same. 38 * <li>If {@code plusForSpace} was specified, the space character " " is 39 * converted into a plus sign "+". 40 * <li>All other characters are converted into one or more bytes using UTF-8 41 * encoding and each byte is then represented by the 3-character string 42 * "%XY", where "XY" is the two-digit, uppercase, hexadecimal representation 43 * of the byte value. 44 * </ul> 45 * 46 * <p>RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!", 47 * "~", "*", "'", "(" and ")". It goes on to state: 48 * 49 * <p><i>Unreserved characters can be escaped without changing the semantics 50 * of the URI, but this should not be done unless the URI is being used 51 * in a context that does not allow the unescaped character to appear.</i> 52 * 53 * <p>For performance reasons the only currently supported character encoding of 54 * this class is UTF-8. 55 * 56 * <p><b>Note</b>: This escaper produces uppercase hexadecimal sequences. From 57 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br> 58 * <i>"URI producers and normalizers should use uppercase hexadecimal digits 59 * for all percent-encodings."</i> 60 * 61 * @author dbeaumont@google.com (David Beaumont) 62 */ 63public class PercentEscaper extends UnicodeEscaper { 64 /** 65 * A string of safe characters that mimics the behavior of 66 * {@link java.net.URLEncoder}. 67 * 68 * TODO(dbeaumont): Fix escapers to be compliant with RFC 3986 69 */ 70 public static final String SAFECHARS_URLENCODER = "-_.*"; 71 72 /** 73 * A string of characters that do not need to be encoded when used in URI 74 * path segments, as specified in RFC 3986. Note that some of these 75 * characters do need to be escaped when used in other parts of the URI. 76 */ 77 public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;="; 78 79 /** 80 * A string of characters that do not need to be encoded when used in URI 81 * query strings, as specified in RFC 3986. Note that some of these 82 * characters do need to be escaped when used in other parts of the URI. 83 */ 84 public static final String SAFEQUERYSTRINGCHARS_URLENCODER 85 = "-_.!~*'()@:$,;/?:"; 86 87 // In some uri escapers spaces are escaped to '+' 88 private static final char[] URI_ESCAPED_SPACE = { '+' }; 89 90 // TODO(dbeaumont): Remove this once UriEscaper uses lower case 91 private static final char[] UPPER_HEX_DIGITS = 92 "0123456789ABCDEF".toCharArray(); 93 94 /** 95 * If true we should convert space to the {@code +} character. 96 */ 97 private final boolean plusForSpace; 98 99 /** 100 * An array of flags where for any {@code char c} if {@code safeOctets[c]} is 101 * true then {@code c} should remain unmodified in the output. If 102 * {@code c > safeOctets.length} then it should be escaped. 103 */ 104 private final boolean[] safeOctets; 105 106 /** 107 * Constructs a URI escaper with the specified safe characters and optional 108 * handling of the space character. 109 * 110 * @param safeChars a non null string specifying additional safe characters 111 * for this escaper (the ranges 0..9, a..z and A..Z are always safe and 112 * should not be specified here) 113 * @param plusForSpace true if ASCII space should be escaped to {@code +} 114 * rather than {@code %20} 115 * @throws IllegalArgumentException if any of the parameters were invalid 116 */ 117 public PercentEscaper(String safeChars, boolean plusForSpace) { 118 checkNotNull(safeChars); // eager for GWT. 119 120 // Avoid any misunderstandings about the behavior of this escaper 121 if (safeChars.matches(".*[0-9A-Za-z].*")) { 122 throw new IllegalArgumentException( 123 "Alphanumeric characters are always 'safe' and should not be " + 124 "explicitly specified"); 125 } 126 // Avoid ambiguous parameters. Safe characters are never modified so if 127 // space is a safe character then setting plusForSpace is meaningless. 128 if (plusForSpace && safeChars.contains(" ")) { 129 throw new IllegalArgumentException( 130 "plusForSpace cannot be specified when space is a 'safe' character"); 131 } 132 if (safeChars.contains("%")) { 133 throw new IllegalArgumentException( 134 "The '%' character cannot be specified as 'safe'"); 135 } 136 this.plusForSpace = plusForSpace; 137 this.safeOctets = createSafeOctets(safeChars); 138 } 139 140 /** 141 * Creates a boolean[] with entries corresponding to the character values 142 * for 0-9, A-Z, a-z and those specified in safeChars set to true. The array 143 * is as small as is required to hold the given character information. 144 */ 145 private static boolean[] createSafeOctets(String safeChars) { 146 int maxChar = 'z'; 147 char[] safeCharArray = safeChars.toCharArray(); 148 for (char c : safeCharArray) { 149 maxChar = Math.max(c, maxChar); 150 } 151 boolean[] octets = new boolean[maxChar + 1]; 152 for (int c = '0'; c <= '9'; c++) { 153 octets[c] = true; 154 } 155 for (int c = 'A'; c <= 'Z'; c++) { 156 octets[c] = true; 157 } 158 for (int c = 'a'; c <= 'z'; c++) { 159 octets[c] = true; 160 } 161 for (char c : safeCharArray) { 162 octets[c] = true; 163 } 164 return octets; 165 } 166 167 /* 168 * Overridden for performance. For unescaped strings this improved the 169 * performance of the uri escaper from ~760ns to ~400ns as measured by 170 * {@link CharEscapersBenchmark}. 171 */ 172 @Override 173 protected int nextEscapeIndex(CharSequence csq, int index, int end) { 174 for (; index < end; index++) { 175 char c = csq.charAt(index); 176 if (c >= safeOctets.length || !safeOctets[c]) { 177 break; 178 } 179 } 180 return index; 181 } 182 183 /* 184 * Overridden for performance. For unescaped strings this improved the 185 * performance of the uri escaper from ~400ns to ~170ns as measured by 186 * {@link CharEscapersBenchmark}. 187 */ 188 @Override 189 public String escape(String s) { 190 checkNotNull(s); 191 int slen = s.length(); 192 for (int index = 0; index < slen; index++) { 193 char c = s.charAt(index); 194 if (c >= safeOctets.length || !safeOctets[c]) { 195 return escapeSlow(s, index); 196 } 197 } 198 return s; 199 } 200 201 /** 202 * Escapes the given Unicode code point in UTF-8. 203 */ 204 @Override 205 protected char[] escape(int cp) { 206 // We should never get negative values here but if we do it will throw an 207 // IndexOutOfBoundsException, so at least it will get spotted. 208 if (cp < safeOctets.length && safeOctets[cp]) { 209 return null; 210 } else if (cp == ' ' && plusForSpace) { 211 return URI_ESCAPED_SPACE; 212 } else if (cp <= 0x7F) { 213 // Single byte UTF-8 characters 214 // Start with "%--" and fill in the blanks 215 char[] dest = new char[3]; 216 dest[0] = '%'; 217 dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 218 dest[1] = UPPER_HEX_DIGITS[cp >>> 4]; 219 return dest; 220 } else if (cp <= 0x7ff) { 221 // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff] 222 // Start with "%--%--" and fill in the blanks 223 char[] dest = new char[6]; 224 dest[0] = '%'; 225 dest[3] = '%'; 226 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 227 cp >>>= 4; 228 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 229 cp >>>= 2; 230 dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 231 cp >>>= 4; 232 dest[1] = UPPER_HEX_DIGITS[0xC | cp]; 233 return dest; 234 } else if (cp <= 0xffff) { 235 // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff] 236 // Start with "%E-%--%--" and fill in the blanks 237 char[] dest = new char[9]; 238 dest[0] = '%'; 239 dest[1] = 'E'; 240 dest[3] = '%'; 241 dest[6] = '%'; 242 dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 243 cp >>>= 4; 244 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 245 cp >>>= 2; 246 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 247 cp >>>= 4; 248 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 249 cp >>>= 2; 250 dest[2] = UPPER_HEX_DIGITS[cp]; 251 return dest; 252 } else if (cp <= 0x10ffff) { 253 char[] dest = new char[12]; 254 // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff] 255 // Start with "%F-%--%--%--" and fill in the blanks 256 dest[0] = '%'; 257 dest[1] = 'F'; 258 dest[3] = '%'; 259 dest[6] = '%'; 260 dest[9] = '%'; 261 dest[11] = UPPER_HEX_DIGITS[cp & 0xF]; 262 cp >>>= 4; 263 dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 264 cp >>>= 2; 265 dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 266 cp >>>= 4; 267 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 268 cp >>>= 2; 269 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 270 cp >>>= 4; 271 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 272 cp >>>= 2; 273 dest[2] = UPPER_HEX_DIGITS[cp & 0x7]; 274 return dest; 275 } else { 276 // If this ever happens it is due to bug in UnicodeEscaper, not bad input. 277 throw new IllegalArgumentException( 278 "Invalid unicode character value " + cp); 279 } 280 } 281}