escape.cc revision 72a454cd3513ac24fbdd0e0cb9ad70b86a99b801
1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include <algorithm>
6
7#include "net/base/escape.h"
8
9#include "base/i18n/icu_string_conversions.h"
10#include "base/logging.h"
11#include "base/string_piece.h"
12#include "base/string_util.h"
13#include "base/utf_string_conversions.h"
14#include "base/utf_offset_string_conversions.h"
15
16namespace {
17
18static const char* const kHexString = "0123456789ABCDEF";
19inline char IntToHex(int i) {
20  DCHECK(i >= 0 && i <= 15) << i << " not a hex value";
21  return kHexString[i];
22}
23
24// A fast bit-vector map for ascii characters.
25//
26// Internally stores 256 bits in an array of 8 ints.
27// Does quick bit-flicking to lookup needed characters.
28class Charmap {
29 public:
30  Charmap(uint32 b0, uint32 b1, uint32 b2, uint32 b3,
31          uint32 b4, uint32 b5, uint32 b6, uint32 b7) {
32    map_[0] = b0; map_[1] = b1; map_[2] = b2; map_[3] = b3;
33    map_[4] = b4; map_[5] = b5; map_[6] = b6; map_[7] = b7;
34  }
35
36  bool Contains(unsigned char c) const {
37    return (map_[c >> 5] & (1 << (c & 31))) ? true : false;
38  }
39
40 private:
41  uint32 map_[8];
42};
43
44// Given text to escape and a Charmap defining which values to escape,
45// return an escaped string.  If use_plus is true, spaces are converted
46// to +, otherwise, if spaces are in the charmap, they are converted to
47// %20.
48const std::string Escape(const std::string& text, const Charmap& charmap,
49                         bool use_plus) {
50  std::string escaped;
51  escaped.reserve(text.length() * 3);
52  for (unsigned int i = 0; i < text.length(); ++i) {
53    unsigned char c = static_cast<unsigned char>(text[i]);
54    if (use_plus && ' ' == c) {
55      escaped.push_back('+');
56    } else if (charmap.Contains(c)) {
57      escaped.push_back('%');
58      escaped.push_back(IntToHex(c >> 4));
59      escaped.push_back(IntToHex(c & 0xf));
60    } else {
61      escaped.push_back(c);
62    }
63  }
64  return escaped;
65}
66
// Lookup table indexed by 7-bit character value. An entry is nonzero when the
// escaped form (%XX) of that character may always be replaced by the literal
// character in a normal URL. Characters with a zero entry are only unescaped
// when the caller explicitly asks for it through the rules passed to
// Unescape* (this is how spaces, for example, are handled).
//
// A character gets a zero entry when:
//  - Unescaping it could change how the URL is parsed, like # or ?.
//  - It is one of &, = or +, which may be part of a query; unescaping them
//    could change the server's parsing of the query.
//  - It is \, which googleurl will convert to a /.
//  - It has no canonical representation in a URL, so unescaping would change
//    the URL and could give different behavior when the URL is copied and
//    pasted or re-entered in the URL bar. These are the characters labeled
//    PASS (allow either escaped or unescaped) in the big lookup table at the
//    top of googleurl/src/url_canon_path.cc
const char kUrlUnescape[128] = {
  // 0x00 - 0x0F: NUL and control characters.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x10 - 0x1F: more control characters.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x20 - 0x2F:
  // sp !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
  // 0x30 - 0x3F:
  // 0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
  // 0x40 - 0x4F:
  // @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x50 - 0x5F ([, backslash and ] are the three zero entries):
  // P  Q  R  S  T  U  V  W  X  Y  Z  [  bs ]  ^  _
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
  // 0x60 - 0x6F:
  // `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x70 - 0x7F (the last entry is DEL, 0x7F):
  // p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  DEL
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
};
100
101template<typename STR>
102STR UnescapeURLImpl(const STR& escaped_text,
103                    UnescapeRule::Type rules,
104                    size_t* offset_for_adjustment) {
105  size_t offset_temp = string16::npos;
106  if (!offset_for_adjustment)
107    offset_for_adjustment = &offset_temp;
108  else if (*offset_for_adjustment >= escaped_text.length())
109    *offset_for_adjustment = string16::npos;
110
111  // Do not unescape anything, return the |escaped_text| text.
112  if (rules == UnescapeRule::NONE)
113    return escaped_text;
114
115  // The output of the unescaping is always smaller than the input, so we can
116  // reserve the input size to make sure we have enough buffer and don't have
117  // to allocate in the loop below.
118  STR result;
119  result.reserve(escaped_text.length());
120
121  for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
122    if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
123      // Non ASCII character, append as is.
124      result.push_back(escaped_text[i]);
125      continue;
126    }
127
128    char current_char = static_cast<char>(escaped_text[i]);
129    if (current_char == '%' && i + 2 < max) {
130      const typename STR::value_type most_sig_digit(
131          static_cast<typename STR::value_type>(escaped_text[i + 1]));
132      const typename STR::value_type least_sig_digit(
133          static_cast<typename STR::value_type>(escaped_text[i + 2]));
134      if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
135        unsigned char value = HexDigitToInt(most_sig_digit) * 16 +
136            HexDigitToInt(least_sig_digit);
137        if (value >= 0x80 ||  // Unescape all high-bit characters.
138            // For 7-bit characters, the lookup table tells us all valid chars.
139            (kUrlUnescape[value] ||
140             // ...and we allow some additional unescaping when flags are set.
141             (value == ' ' && (rules & UnescapeRule::SPACES)) ||
142             // Allow any of the prohibited but non-control characters when
143             // we're doing "special" chars.
144             (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
145             // Additionally allow control characters if requested.
146             (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
147          // Use the unescaped version of the character.
148          size_t length_before_append = result.length();
149          result.push_back(value);
150          i += 2;
151
152          // Adjust offset to match length change.
153          if (*offset_for_adjustment != std::string::npos) {
154            if (*offset_for_adjustment > (length_before_append + 2))
155              *offset_for_adjustment -= 2;
156            else if (*offset_for_adjustment > length_before_append)
157              *offset_for_adjustment = std::string::npos;
158          }
159        } else {
160          // Keep escaped. Append a percent and we'll get the following two
161          // digits on the next loops through.
162          result.push_back('%');
163        }
164      } else {
165        // Invalid escape sequence, just pass the percent through and continue
166        // right after it.
167        result.push_back('%');
168      }
169    } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
170               escaped_text[i] == '+') {
171      result.push_back(' ');
172    } else {
173      // Normal case for unescaped characters.
174      result.push_back(escaped_text[i]);
175    }
176  }
177
178  return result;
179}
180
181}  // namespace
182
183// Everything except alphanumerics and !'()*-._~
184// See RFC 2396 for the list of reserved characters.
185static const Charmap kQueryCharmap(
186  0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L,
187  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
188
189std::string EscapeQueryParamValue(const std::string& text, bool use_plus) {
190  return Escape(text, kQueryCharmap, use_plus);
191}
192
193// Convert the string to a sequence of bytes and then % escape anything
194// except alphanumerics and !'()*-._~
195string16 EscapeQueryParamValueUTF8(const string16& text,
196                                   bool use_plus) {
197  return UTF8ToUTF16(Escape(UTF16ToUTF8(text), kQueryCharmap, use_plus));
198}
199
200// non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
201static const Charmap kPathCharmap(
202  0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L,
203  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
204
205std::string EscapePath(const std::string& path) {
206  return Escape(path, kPathCharmap, false);
207}
208
209// non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
210static const Charmap kUrlEscape(
211  0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L,
212  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
213);
214
215std::string EscapeUrlEncodedData(const std::string& path) {
216  return Escape(path, kUrlEscape, true);
217}
218
219// non-7bit
220static const Charmap kNonASCIICharmap(
221  0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L,
222  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
223
224std::string EscapeNonASCII(const std::string& input) {
225  return Escape(input, kNonASCIICharmap, false);
226}
227
228// Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
229// !'()*-._~%
230static const Charmap kExternalHandlerCharmap(
231  0xffffffffL, 0x5000080dL, 0x68000000L, 0xb8000001L,
232  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
233
234std::string EscapeExternalHandlerValue(const std::string& text) {
235  return Escape(text, kExternalHandlerCharmap, false);
236}
237
238bool EscapeQueryParamValue(const string16& text, const char* codepage,
239                           bool use_plus, string16* escaped) {
240  // TODO(brettw) bug 1201094: this function should be removed, this "SKIP"
241  // behavior is wrong when the character can't be encoded properly.
242  std::string encoded;
243  if (!base::UTF16ToCodepage(text, codepage,
244                             base::OnStringConversionError::SKIP, &encoded))
245    return false;
246
247  escaped->assign(UTF8ToUTF16(Escape(encoded, kQueryCharmap, use_plus)));
248  return true;
249}
250
251string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
252                                           UnescapeRule::Type rules,
253                                           size_t* offset_for_adjustment) {
254  std::wstring result;
255  size_t original_offset = offset_for_adjustment ? *offset_for_adjustment : 0;
256  std::string unescaped_url(
257      UnescapeURLImpl(text, rules, offset_for_adjustment));
258  if (UTF8ToWideAndAdjustOffset(unescaped_url.data(), unescaped_url.length(),
259                                &result, offset_for_adjustment))
260    return WideToUTF16Hack(result);      // Character set looks like it's valid.
261
262  // Not valid.  Return the escaped version.  Undo our changes to
263  // |offset_for_adjustment| since we haven't changed the string after all.
264  if (offset_for_adjustment)
265    *offset_for_adjustment = original_offset;
266  return WideToUTF16Hack(UTF8ToWideAndAdjustOffset(text,
267                                                   offset_for_adjustment));
268}
269
270std::string UnescapeURLComponent(const std::string& escaped_text,
271                                 UnescapeRule::Type rules) {
272  return UnescapeURLImpl(escaped_text, rules, NULL);
273}
274
275string16 UnescapeURLComponent(const string16& escaped_text,
276                              UnescapeRule::Type rules) {
277  return UnescapeURLImpl(escaped_text, rules, NULL);
278}
279
280
// Appends |c| to |output|, replacing the characters < > & " and ' by their
// HTML entities (&lt; &gt; &amp; &quot; &#39;). Any other character is
// appended unchanged.
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
  const char* entity = NULL;
  switch (c) {
    case '<':
      entity = "&lt;";
      break;
    case '>':
      entity = "&gt;";
      break;
    case '&':
      entity = "&amp;";
      break;
    case '"':
      entity = "&quot;";
      break;
    case '\'':
      entity = "&#39;";
      break;
    default:
      // Not a special character: copy it through as is.
      output->push_back(c);
      return;
  }

  // Append one character at a time so the same code works for both narrow
  // and wide output strings.
  for (; *entity; ++entity)
    output->push_back(*entity);
}
305
306void AppendEscapedCharForHTML(char c, std::string* output) {
307  AppendEscapedCharForHTMLImpl(c, output);
308}
309
310void AppendEscapedCharForHTML(wchar_t c, string16* output) {
311  AppendEscapedCharForHTMLImpl(c, output);
312}
313
// Returns a copy of |input| in which every character has been passed through
// AppendEscapedCharForHTMLImpl, i.e. with < > & " ' replaced by entities.
template <class str>
str EscapeForHTMLImpl(const str& input) {
  const typename str::size_type length = input.size();

  str escaped;
  // Reserve for the common case where nothing needs escaping; the string
  // grows on its own if entities are inserted.
  escaped.reserve(length);

  for (typename str::size_type pos = 0; pos < length; ++pos)
    AppendEscapedCharForHTMLImpl(input[pos], &escaped);

  return escaped;
}
324
325std::string EscapeForHTML(const std::string& input) {
326  return EscapeForHTMLImpl(input);
327}
328
329string16 EscapeForHTML(const string16& input) {
330  return EscapeForHTMLImpl(input);
331}
332
333string16 UnescapeForHTML(const string16& input) {
334  static const struct {
335    const wchar_t* ampersand_code;
336    const char replacement;
337  } kEscapeToChars[] = {
338    { L"&lt;", '<' },
339    { L"&gt;", '>' },
340    { L"&amp;", '&' },
341    { L"&quot;", '"' },
342    { L"&#39;", '\''},
343  };
344
345  if (input.find(WideToUTF16(L"&")) == std::string::npos)
346    return input;
347
348  string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)];
349  string16 text(input);
350  for (string16::iterator iter = text.begin(); iter != text.end(); ++iter) {
351    if (*iter == '&') {
352      // Potential ampersand encode char.
353      size_t index = iter - text.begin();
354      for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) {
355        if (ampersand_chars[i].empty())
356          ampersand_chars[i] = WideToUTF16(kEscapeToChars[i].ampersand_code);
357        if (text.find(ampersand_chars[i], index) == index) {
358          text.replace(iter, iter + ampersand_chars[i].length(),
359                       1, kEscapeToChars[i].replacement);
360          break;
361        }
362      }
363    }
364  }
365  return text;
366}
367