// escape.cc revision 72a454cd3513ac24fbdd0e0cb9ad70b86a99b801
1// Copyright (c) 2011 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include <algorithm> 6 7#include "net/base/escape.h" 8 9#include "base/i18n/icu_string_conversions.h" 10#include "base/logging.h" 11#include "base/string_piece.h" 12#include "base/string_util.h" 13#include "base/utf_string_conversions.h" 14#include "base/utf_offset_string_conversions.h" 15 16namespace { 17 18static const char* const kHexString = "0123456789ABCDEF"; 19inline char IntToHex(int i) { 20 DCHECK(i >= 0 && i <= 15) << i << " not a hex value"; 21 return kHexString[i]; 22} 23 24// A fast bit-vector map for ascii characters. 25// 26// Internally stores 256 bits in an array of 8 ints. 27// Does quick bit-flicking to lookup needed characters. 28class Charmap { 29 public: 30 Charmap(uint32 b0, uint32 b1, uint32 b2, uint32 b3, 31 uint32 b4, uint32 b5, uint32 b6, uint32 b7) { 32 map_[0] = b0; map_[1] = b1; map_[2] = b2; map_[3] = b3; 33 map_[4] = b4; map_[5] = b5; map_[6] = b6; map_[7] = b7; 34 } 35 36 bool Contains(unsigned char c) const { 37 return (map_[c >> 5] & (1 << (c & 31))) ? true : false; 38 } 39 40 private: 41 uint32 map_[8]; 42}; 43 44// Given text to escape and a Charmap defining which values to escape, 45// return an escaped string. If use_plus is true, spaces are converted 46// to +, otherwise, if spaces are in the charmap, they are converted to 47// %20. 
48const std::string Escape(const std::string& text, const Charmap& charmap, 49 bool use_plus) { 50 std::string escaped; 51 escaped.reserve(text.length() * 3); 52 for (unsigned int i = 0; i < text.length(); ++i) { 53 unsigned char c = static_cast<unsigned char>(text[i]); 54 if (use_plus && ' ' == c) { 55 escaped.push_back('+'); 56 } else if (charmap.Contains(c)) { 57 escaped.push_back('%'); 58 escaped.push_back(IntToHex(c >> 4)); 59 escaped.push_back(IntToHex(c & 0xf)); 60 } else { 61 escaped.push_back(c); 62 } 63 } 64 return escaped; 65} 66 67// Contains nonzero when the corresponding character is unescapable for normal 68// URLs. These characters are the ones that may change the parsing of a URL, so 69// we don't want to unescape them sometimes. In many case we won't want to 70// unescape spaces, but that is controlled by parameters to Unescape*. 71// 72// The basic rule is that we can't unescape anything that would changing parsing 73// like # or ?. We also can't unescape &, =, or + since that could be part of a 74// query and that could change the server's parsing of the query. Nor can we 75// unescape \ since googleurl will convert it to a /. 76// 77// Lastly, we can't unescape anything that doesn't have a canonical 78// representation in a URL. This means that unescaping will change the URL, and 79// you could get different behavior if you copy and paste the URL, or press 80// enter in the URL bar. The list of characters that fall into this category 81// are the ones labeled PASS (allow either escaped or unescaped) in the big 82// lookup table at the top of googleurl/src/url_canon_path.cc 83const char kUrlUnescape[128] = { 84// NULL, control chars... 85 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 86 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 87// ' ' ! " # $ % & ' ( ) * + , - . / 88 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 89// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 
90 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 91// @ A B C D E F G H I J K L M N O 92 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 93// P Q R S T U V W X Y Z [ \ ] ^ _ 94 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 95// ` a b c d e f g h i j k l m n o 96 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 97// p q r s t u v w x y z { | } ~ <NBSP> 98 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 99}; 100 101template<typename STR> 102STR UnescapeURLImpl(const STR& escaped_text, 103 UnescapeRule::Type rules, 104 size_t* offset_for_adjustment) { 105 size_t offset_temp = string16::npos; 106 if (!offset_for_adjustment) 107 offset_for_adjustment = &offset_temp; 108 else if (*offset_for_adjustment >= escaped_text.length()) 109 *offset_for_adjustment = string16::npos; 110 111 // Do not unescape anything, return the |escaped_text| text. 112 if (rules == UnescapeRule::NONE) 113 return escaped_text; 114 115 // The output of the unescaping is always smaller than the input, so we can 116 // reserve the input size to make sure we have enough buffer and don't have 117 // to allocate in the loop below. 118 STR result; 119 result.reserve(escaped_text.length()); 120 121 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) { 122 if (static_cast<unsigned char>(escaped_text[i]) >= 128) { 123 // Non ASCII character, append as is. 124 result.push_back(escaped_text[i]); 125 continue; 126 } 127 128 char current_char = static_cast<char>(escaped_text[i]); 129 if (current_char == '%' && i + 2 < max) { 130 const typename STR::value_type most_sig_digit( 131 static_cast<typename STR::value_type>(escaped_text[i + 1])); 132 const typename STR::value_type least_sig_digit( 133 static_cast<typename STR::value_type>(escaped_text[i + 2])); 134 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) { 135 unsigned char value = HexDigitToInt(most_sig_digit) * 16 + 136 HexDigitToInt(least_sig_digit); 137 if (value >= 0x80 || // Unescape all high-bit characters. 
138 // For 7-bit characters, the lookup table tells us all valid chars. 139 (kUrlUnescape[value] || 140 // ...and we allow some additional unescaping when flags are set. 141 (value == ' ' && (rules & UnescapeRule::SPACES)) || 142 // Allow any of the prohibited but non-control characters when 143 // we're doing "special" chars. 144 (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) || 145 // Additionally allow control characters if requested. 146 (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { 147 // Use the unescaped version of the character. 148 size_t length_before_append = result.length(); 149 result.push_back(value); 150 i += 2; 151 152 // Adjust offset to match length change. 153 if (*offset_for_adjustment != std::string::npos) { 154 if (*offset_for_adjustment > (length_before_append + 2)) 155 *offset_for_adjustment -= 2; 156 else if (*offset_for_adjustment > length_before_append) 157 *offset_for_adjustment = std::string::npos; 158 } 159 } else { 160 // Keep escaped. Append a percent and we'll get the following two 161 // digits on the next loops through. 162 result.push_back('%'); 163 } 164 } else { 165 // Invalid escape sequence, just pass the percent through and continue 166 // right after it. 167 result.push_back('%'); 168 } 169 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) && 170 escaped_text[i] == '+') { 171 result.push_back(' '); 172 } else { 173 // Normal case for unescaped characters. 174 result.push_back(escaped_text[i]); 175 } 176 } 177 178 return result; 179} 180 181} // namespace 182 183// Everything except alphanumerics and !'()*-._~ 184// See RFC 2396 for the list of reserved characters. 
185static const Charmap kQueryCharmap( 186 0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L, 187 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL); 188 189std::string EscapeQueryParamValue(const std::string& text, bool use_plus) { 190 return Escape(text, kQueryCharmap, use_plus); 191} 192 193// Convert the string to a sequence of bytes and then % escape anything 194// except alphanumerics and !'()*-._~ 195string16 EscapeQueryParamValueUTF8(const string16& text, 196 bool use_plus) { 197 return UTF8ToUTF16(Escape(UTF16ToUTF8(text), kQueryCharmap, use_plus)); 198} 199 200// non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|} 201static const Charmap kPathCharmap( 202 0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L, 203 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL); 204 205std::string EscapePath(const std::string& path) { 206 return Escape(path, kPathCharmap, false); 207} 208 209// non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|} 210static const Charmap kUrlEscape( 211 0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L, 212 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL 213); 214 215std::string EscapeUrlEncodedData(const std::string& path) { 216 return Escape(path, kUrlEscape, true); 217} 218 219// non-7bit 220static const Charmap kNonASCIICharmap( 221 0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L, 222 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL); 223 224std::string EscapeNonASCII(const std::string& input) { 225 return Escape(input, kNonASCIICharmap, false); 226} 227 228// Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and 229// !'()*-._~% 230static const Charmap kExternalHandlerCharmap( 231 0xffffffffL, 0x5000080dL, 0x68000000L, 0xb8000001L, 232 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL); 233 234std::string EscapeExternalHandlerValue(const std::string& text) { 235 return Escape(text, kExternalHandlerCharmap, false); 236} 237 238bool EscapeQueryParamValue(const string16& text, 
const char* codepage, 239 bool use_plus, string16* escaped) { 240 // TODO(brettw) bug 1201094: this function should be removed, this "SKIP" 241 // behavior is wrong when the character can't be encoded properly. 242 std::string encoded; 243 if (!base::UTF16ToCodepage(text, codepage, 244 base::OnStringConversionError::SKIP, &encoded)) 245 return false; 246 247 escaped->assign(UTF8ToUTF16(Escape(encoded, kQueryCharmap, use_plus))); 248 return true; 249} 250 251string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text, 252 UnescapeRule::Type rules, 253 size_t* offset_for_adjustment) { 254 std::wstring result; 255 size_t original_offset = offset_for_adjustment ? *offset_for_adjustment : 0; 256 std::string unescaped_url( 257 UnescapeURLImpl(text, rules, offset_for_adjustment)); 258 if (UTF8ToWideAndAdjustOffset(unescaped_url.data(), unescaped_url.length(), 259 &result, offset_for_adjustment)) 260 return WideToUTF16Hack(result); // Character set looks like it's valid. 261 262 // Not valid. Return the escaped version. Undo our changes to 263 // |offset_for_adjustment| since we haven't changed the string after all. 
264 if (offset_for_adjustment) 265 *offset_for_adjustment = original_offset; 266 return WideToUTF16Hack(UTF8ToWideAndAdjustOffset(text, 267 offset_for_adjustment)); 268} 269 270std::string UnescapeURLComponent(const std::string& escaped_text, 271 UnescapeRule::Type rules) { 272 return UnescapeURLImpl(escaped_text, rules, NULL); 273} 274 275string16 UnescapeURLComponent(const string16& escaped_text, 276 UnescapeRule::Type rules) { 277 return UnescapeURLImpl(escaped_text, rules, NULL); 278} 279 280 281template <class str> 282void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) { 283 static const struct { 284 char key; 285 const char* replacement; 286 } kCharsToEscape[] = { 287 { '<', "<" }, 288 { '>', ">" }, 289 { '&', "&" }, 290 { '"', """ }, 291 { '\'', "'" }, 292 }; 293 size_t k; 294 for (k = 0; k < ARRAYSIZE_UNSAFE(kCharsToEscape); ++k) { 295 if (c == kCharsToEscape[k].key) { 296 const char* p = kCharsToEscape[k].replacement; 297 while (*p) 298 output->push_back(*p++); 299 break; 300 } 301 } 302 if (k == ARRAYSIZE_UNSAFE(kCharsToEscape)) 303 output->push_back(c); 304} 305 306void AppendEscapedCharForHTML(char c, std::string* output) { 307 AppendEscapedCharForHTMLImpl(c, output); 308} 309 310void AppendEscapedCharForHTML(wchar_t c, string16* output) { 311 AppendEscapedCharForHTMLImpl(c, output); 312} 313 314template <class str> 315str EscapeForHTMLImpl(const str& input) { 316 str result; 317 result.reserve(input.size()); // optimize for no escaping 318 319 for (typename str::const_iterator it = input.begin(); it != input.end(); ++it) 320 AppendEscapedCharForHTMLImpl(*it, &result); 321 322 return result; 323} 324 325std::string EscapeForHTML(const std::string& input) { 326 return EscapeForHTMLImpl(input); 327} 328 329string16 EscapeForHTML(const string16& input) { 330 return EscapeForHTMLImpl(input); 331} 332 333string16 UnescapeForHTML(const string16& input) { 334 static const struct { 335 const wchar_t* ampersand_code; 336 const char 
replacement; 337 } kEscapeToChars[] = { 338 { L"<", '<' }, 339 { L">", '>' }, 340 { L"&", '&' }, 341 { L""", '"' }, 342 { L"'", '\''}, 343 }; 344 345 if (input.find(WideToUTF16(L"&")) == std::string::npos) 346 return input; 347 348 string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)]; 349 string16 text(input); 350 for (string16::iterator iter = text.begin(); iter != text.end(); ++iter) { 351 if (*iter == '&') { 352 // Potential ampersand encode char. 353 size_t index = iter - text.begin(); 354 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) { 355 if (ampersand_chars[i].empty()) 356 ampersand_chars[i] = WideToUTF16(kEscapeToChars[i].ampersand_code); 357 if (text.find(ampersand_chars[i], index) == index) { 358 text.replace(iter, iter + ampersand_chars[i].length(), 359 1, kEscapeToChars[i].replacement); 360 break; 361 } 362 } 363 } 364 } 365 return text; 366} 367