1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "net/base/escape.h"
6
7#include <algorithm>
8
9#include "base/logging.h"
10#include "base/scoped_ptr.h"
11#include "base/string_piece.h"
12#include "base/string_util.h"
13#include "base/utf_string_conversions.h"
14#include "base/utf_offset_string_conversions.h"
15
16namespace {
17
18static const char* const kHexString = "0123456789ABCDEF";
19inline char IntToHex(int i) {
20  DCHECK(i >= 0 && i <= 15) << i << " not a hex value";
21  return kHexString[i];
22}
23
24// A fast bit-vector map for ascii characters.
25//
26// Internally stores 256 bits in an array of 8 ints.
27// Does quick bit-flicking to lookup needed characters.
28class Charmap {
29 public:
30  Charmap(uint32 b0, uint32 b1, uint32 b2, uint32 b3,
31          uint32 b4, uint32 b5, uint32 b6, uint32 b7) {
32    map_[0] = b0; map_[1] = b1; map_[2] = b2; map_[3] = b3;
33    map_[4] = b4; map_[5] = b5; map_[6] = b6; map_[7] = b7;
34  }
35
36  bool Contains(unsigned char c) const {
37    return (map_[c >> 5] & (1 << (c & 31))) ? true : false;
38  }
39
40 private:
41  uint32 map_[8];
42};
43
44// Given text to escape and a Charmap defining which values to escape,
45// return an escaped string.  If use_plus is true, spaces are converted
46// to +, otherwise, if spaces are in the charmap, they are converted to
47// %20.
48std::string Escape(const std::string& text, const Charmap& charmap,
49                   bool use_plus) {
50  std::string escaped;
51  escaped.reserve(text.length() * 3);
52  for (unsigned int i = 0; i < text.length(); ++i) {
53    unsigned char c = static_cast<unsigned char>(text[i]);
54    if (use_plus && ' ' == c) {
55      escaped.push_back('+');
56    } else if (charmap.Contains(c)) {
57      escaped.push_back('%');
58      escaped.push_back(IntToHex(c >> 4));
59      escaped.push_back(IntToHex(c & 0xf));
60    } else {
61      escaped.push_back(c);
62    }
63  }
64  return escaped;
65}
66
// Lookup table indexed by 7-bit character value. An entry is nonzero when the
// corresponding character may always be unescaped in a normal URL, and zero
// when unescaping it needs an explicit rule. The zero entries are the
// characters that may change the parsing of a URL, so we don't want to
// unescape them sometimes. In many cases we won't want to unescape spaces
// either, but that is controlled by parameters to Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing,
// like # or ?. We also can't unescape &, =, or + since those could be part of
// a query and unescaping them could change the server's parsing of the query.
// Nor can we unescape the backslash, since googleurl will convert it to a /.
//
// Lastly, we can't unescape anything that doesn't have a canonical
// representation in a URL. Unescaping such a character changes the URL, and
// you could get different behavior if you copy and paste the URL, or press
// enter in the URL bar. The characters that fall into this category are the
// ones labeled PASS (allow either escaped or unescaped) in the big lookup
// table at the top of googleurl/src/url_canon_path.cc
const char kUrlUnescape[128] = {
  // 0x00 - 0x0F: NUL and control characters.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x10 - 0x1F: more control characters.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x20 - 0x2F:
  // SP !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
  // 0x30 - 0x3F:
  // 0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
  // 0x40 - 0x4F:
  // @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x50 - 0x5F (the entry between [ and ] is the backslash):
  // P  Q  R  S  T  U  V  W  X  Y  Z  [  bs ]  ^  _
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
  // 0x60 - 0x6F:
  // `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x70 - 0x7F:
  // p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  DEL
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
};
100
101template<typename STR>
102STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,
103                               UnescapeRule::Type rules,
104                               std::vector<size_t>* offsets_for_adjustment) {
105  if (offsets_for_adjustment) {
106    std::for_each(offsets_for_adjustment->begin(),
107                  offsets_for_adjustment->end(),
108                  LimitOffset<std::wstring>(escaped_text.length()));
109  }
110  // Do not unescape anything, return the |escaped_text| text.
111  if (rules == UnescapeRule::NONE)
112    return escaped_text;
113
114  // The output of the unescaping is always smaller than the input, so we can
115  // reserve the input size to make sure we have enough buffer and don't have
116  // to allocate in the loop below.
117  STR result;
118  result.reserve(escaped_text.length());
119
120  AdjustEncodingOffset::Adjustments adjustments;  // Locations of adjusted text.
121  for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
122    if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
123      // Non ASCII character, append as is.
124      result.push_back(escaped_text[i]);
125      continue;
126    }
127
128    char current_char = static_cast<char>(escaped_text[i]);
129    if (current_char == '%' && i + 2 < max) {
130      const typename STR::value_type most_sig_digit(
131          static_cast<typename STR::value_type>(escaped_text[i + 1]));
132      const typename STR::value_type least_sig_digit(
133          static_cast<typename STR::value_type>(escaped_text[i + 2]));
134      if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
135        unsigned char value = HexDigitToInt(most_sig_digit) * 16 +
136            HexDigitToInt(least_sig_digit);
137        if (value >= 0x80 ||  // Unescape all high-bit characters.
138            // For 7-bit characters, the lookup table tells us all valid chars.
139            (kUrlUnescape[value] ||
140             // ...and we allow some additional unescaping when flags are set.
141             (value == ' ' && (rules & UnescapeRule::SPACES)) ||
142             // Allow any of the prohibited but non-control characters when
143             // we're doing "special" chars.
144             (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
145             // Additionally allow control characters if requested.
146             (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
147          // Use the unescaped version of the character.
148          adjustments.push_back(i);
149          result.push_back(value);
150          i += 2;
151        } else {
152          // Keep escaped. Append a percent and we'll get the following two
153          // digits on the next loops through.
154          result.push_back('%');
155        }
156      } else {
157        // Invalid escape sequence, just pass the percent through and continue
158        // right after it.
159        result.push_back('%');
160      }
161    } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
162               escaped_text[i] == '+') {
163      result.push_back(' ');
164    } else {
165      // Normal case for unescaped characters.
166      result.push_back(escaped_text[i]);
167    }
168  }
169
170  // Make offset adjustment.
171  if (offsets_for_adjustment && !adjustments.empty()) {
172    std::for_each(offsets_for_adjustment->begin(),
173                   offsets_for_adjustment->end(),
174                   AdjustEncodingOffset(adjustments));
175  }
176
177  return result;
178}
179
180template<typename STR>
181STR UnescapeURLImpl(const STR& escaped_text,
182                    UnescapeRule::Type rules,
183                    size_t* offset_for_adjustment) {
184  std::vector<size_t> offsets;
185  if (offset_for_adjustment)
186    offsets.push_back(*offset_for_adjustment);
187  STR result = UnescapeURLWithOffsetsImpl(escaped_text, rules, &offsets);
188  if (offset_for_adjustment)
189    *offset_for_adjustment = offsets[0];
190  return result;
191}
192
193}  // namespace
194
195// Everything except alphanumerics and !'()*-._~
196// See RFC 2396 for the list of reserved characters.
197static const Charmap kQueryCharmap(
198  0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L,
199  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
200
201std::string EscapeQueryParamValue(const std::string& text, bool use_plus) {
202  return Escape(text, kQueryCharmap, use_plus);
203}
204
205// Convert the string to a sequence of bytes and then % escape anything
206// except alphanumerics and !'()*-._~
207string16 EscapeQueryParamValueUTF8(const string16& text,
208                                   bool use_plus) {
209  return UTF8ToUTF16(Escape(UTF16ToUTF8(text), kQueryCharmap, use_plus));
210}
211
212// non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
213static const Charmap kPathCharmap(
214  0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L,
215  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
216
217std::string EscapePath(const std::string& path) {
218  return Escape(path, kPathCharmap, false);
219}
220
221// non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
222static const Charmap kUrlEscape(
223  0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L,
224  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
225);
226
227std::string EscapeUrlEncodedData(const std::string& path) {
228  return Escape(path, kUrlEscape, true);
229}
230
231// non-7bit
232static const Charmap kNonASCIICharmap(
233  0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L,
234  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
235
236std::string EscapeNonASCII(const std::string& input) {
237  return Escape(input, kNonASCIICharmap, false);
238}
239
240// Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
241// !'()*-._~%
242static const Charmap kExternalHandlerCharmap(
243  0xffffffffL, 0x5000080dL, 0x68000000L, 0xb8000001L,
244  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
245
246std::string EscapeExternalHandlerValue(const std::string& text) {
247  return Escape(text, kExternalHandlerCharmap, false);
248}
249
250string16 UnescapeAndDecodeUTF8URLComponentWithOffsets(
251    const std::string& text,
252    UnescapeRule::Type rules,
253    std::vector<size_t>* offsets_for_adjustment) {
254  std::wstring result;
255  std::vector<size_t> original_offsets;
256  if (offsets_for_adjustment)
257    original_offsets = *offsets_for_adjustment;
258  std::string unescaped_url(
259      UnescapeURLWithOffsetsImpl(text, rules, offsets_for_adjustment));
260  if (UTF8ToWideAndAdjustOffsets(unescaped_url.data(), unescaped_url.length(),
261                                &result, offsets_for_adjustment))
262    return WideToUTF16Hack(result);      // Character set looks like it's valid.
263
264  // Not valid.  Return the escaped version.  Undo our changes to
265  // |offset_for_adjustment| since we haven't changed the string after all.
266  if (offsets_for_adjustment)
267    *offsets_for_adjustment = original_offsets;
268  return WideToUTF16Hack(UTF8ToWideAndAdjustOffsets(
269      text, offsets_for_adjustment));
270}
271
272string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
273                                           UnescapeRule::Type rules,
274                                           size_t* offset_for_adjustment) {
275  std::vector<size_t> offsets;
276  if (offset_for_adjustment)
277    offsets.push_back(*offset_for_adjustment);
278  string16 result =
279      UnescapeAndDecodeUTF8URLComponentWithOffsets(text, rules, &offsets);
280  if (offset_for_adjustment)
281    *offset_for_adjustment = offsets[0];
282  return result;
283}
284
285std::string UnescapeURLComponent(const std::string& escaped_text,
286                                 UnescapeRule::Type rules) {
287  return UnescapeURLWithOffsetsImpl<std::string>(escaped_text, rules, NULL);
288}
289
290string16 UnescapeURLComponent(const string16& escaped_text,
291                              UnescapeRule::Type rules) {
292  return UnescapeURLWithOffsetsImpl<string16>(escaped_text, rules, NULL);
293}
294
295
// Appends |c| to |output|, replacing it with the matching HTML entity when it
// is one of < > & " or '. Any other character is appended unchanged.
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
  static const struct {
    char key;
    const char* replacement;
  } kCharsToEscape[] = {
    { '<', "&lt;" },
    { '>', "&gt;" },
    { '&', "&amp;" },
    { '"', "&quot;" },
    { '\'', "&#39;" },
  };

  for (size_t entry = 0; entry < ARRAYSIZE_UNSAFE(kCharsToEscape); ++entry) {
    if (c != kCharsToEscape[entry].key)
      continue;
    // Found the character: emit its entity one character at a time.
    for (const char* p = kCharsToEscape[entry].replacement; *p; ++p)
      output->push_back(*p);
    return;
  }

  // Not a character that needs escaping.
  output->push_back(c);
}
320
321void AppendEscapedCharForHTML(char c, std::string* output) {
322  AppendEscapedCharForHTMLImpl(c, output);
323}
324
325void AppendEscapedCharForHTML(wchar_t c, string16* output) {
326  AppendEscapedCharForHTMLImpl(c, output);
327}
328
// Returns a copy of |input| in which each character has been passed through
// AppendEscapedCharForHTMLImpl().
template <class str>
str EscapeForHTMLImpl(const str& input) {
  const typename str::size_type length = input.size();

  str escaped;
  escaped.reserve(length);  // optimize for no escaping

  for (typename str::size_type i = 0; i < length; ++i)
    AppendEscapedCharForHTMLImpl(input[i], &escaped);

  return escaped;
}
339
340std::string EscapeForHTML(const std::string& input) {
341  return EscapeForHTMLImpl(input);
342}
343
344string16 EscapeForHTML(const string16& input) {
345  return EscapeForHTMLImpl(input);
346}
347
348string16 UnescapeForHTML(const string16& input) {
349  static const struct {
350    const wchar_t* ampersand_code;
351    const char replacement;
352  } kEscapeToChars[] = {
353    { L"&lt;", '<' },
354    { L"&gt;", '>' },
355    { L"&amp;", '&' },
356    { L"&quot;", '"' },
357    { L"&#39;", '\''},
358  };
359
360  if (input.find(WideToUTF16(L"&")) == std::string::npos)
361    return input;
362
363  string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)];
364  string16 text(input);
365  for (string16::iterator iter = text.begin(); iter != text.end(); ++iter) {
366    if (*iter == '&') {
367      // Potential ampersand encode char.
368      size_t index = iter - text.begin();
369      for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) {
370        if (ampersand_chars[i].empty())
371          ampersand_chars[i] = WideToUTF16(kEscapeToChars[i].ampersand_code);
372        if (text.find(ampersand_chars[i], index) == index) {
373          text.replace(iter, iter + ampersand_chars[i].length(),
374                       1, kEscapeToChars[i].replacement);
375          break;
376        }
377      }
378    }
379  }
380  return text;
381}
382
383AdjustEncodingOffset::AdjustEncodingOffset(const Adjustments& adjustments)
384  : adjustments(adjustments) {}
385
386void AdjustEncodingOffset::operator()(size_t& offset) {
387  // For each encoded character occurring before an offset subtract 2.
388  if (offset == string16::npos)
389    return;
390  size_t adjusted_offset = offset;
391  for (Adjustments::const_iterator i = adjustments.begin();
392       i != adjustments.end(); ++i) {
393    size_t location = *i;
394    if (offset <= location) {
395      offset = adjusted_offset;
396      return;
397    }
398    if (offset <= (location + 2)) {
399      offset = string16::npos;
400      return;
401    }
402    adjusted_offset -= 2;
403  }
404  offset = adjusted_offset;
405}
406