// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
5c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#ifndef NET_BASE_ESCAPE_H_
6c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#define NET_BASE_ESCAPE_H_
73345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#pragma once
8c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
9c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include <string>
10ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include <vector>
11c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
12c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/basictypes.h"
13c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/string16.h"
14a7dee89fba4aaa5f0be152cfc7c7b9b5cff98d51John Reck#include "net/base/net_export.h"
15c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Escaping --------------------------------------------------------------------
17c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Escapes the file path |path| and returns the escaped copy.
// The characters that get escaped are: non-printable, non-7bit, and
// (including space)  "#%:<>?[\]^`{|}
std::string EscapePath(const std::string& path);
21c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Escapes application/x-www-form-urlencoded content and returns the result.
// The characters that get escaped are: non-printable, non-7bit, and
// (including space)  ?>=<;+'&%$#"![\]^`{|}
// A space is escaped as "+"; the other special characters are escaped as %XX
// (hex).
std::string EscapeUrlEncodedData(const std::string& path);
26c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Returns a copy of |input| in which all non-ASCII input has been escaped.
std::string EscapeNonASCII(const std::string& input);
29c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Escapes the characters in |text| so that it is suitable for use as an
// external protocol handler command.
// Everything is converted to %XX except alphanumerics, %-_.!~*'() and the
// restricted characters (;/?:@&=+$,).
std::string EscapeExternalHandlerValue(const std::string& text);
35c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Appends the character |c| to |output|. If |c| would be interpreted as an
// HTML delimiter, it is escaped before being appended.
void AppendEscapedCharForHTML(char c, std::string* output);
39c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Returns |text| with the chars that might cause it to be interpreted as HTML
// tags escaped.
std::string EscapeForHTML(const std::string& text);
42c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstring16 EscapeForHTML(const string16& text);
43c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Unescaping ------------------------------------------------------------------
45c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
46c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottclass UnescapeRule {
47c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott public:
48c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // A combination of the following flags that is passed to the unescaping
49c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // functions.
50c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  typedef uint32 Type;
51c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
52c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  enum {
53c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // Don't unescape anything at all.
54c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    NONE = 0,
55c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
56c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // Don't unescape anything special, but all normal unescaping will happen.
57c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // This is a placeholder and can't be combined with other flags (since it's
58c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // just the absence of them). All other unescape rules imply "normal" in
59c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // addition to their special meaning. Things like escaped letters, digits,
60c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // and most symbols will get unescaped with this mode.
61c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    NORMAL = 1,
62c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
63c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // Convert %20 to spaces. In some places where we're showing URLs, we may
64c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // want this. In places where the URL may be copied and pasted out, then
65c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // you wouldn't want this since it might not be interpreted in one piece
66c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // by other applications.
67c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    SPACES = 2,
68c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
69c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // Unescapes various characters that will change the meaning of URLs,
70c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // including '%', '+', '&', '/', '#'. If we unescaped these characters, the
71c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // resulting URL won't be the same as the source one. This flag is used when
72c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // generating final output like filenames for URLs where we won't be
73c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // interpreting as a URL and want to do as much unescaping as possible.
74c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    URL_SPECIAL_CHARS = 4,
75c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
76c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // Unescapes control characters such as %01. This INCLUDES NULLs. This is
77c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // used for rare cases such as data: URL decoding where the result is binary
78c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // data. You should not use this for normal URLs!
79c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    CONTROL_CHARS = 8,
80c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
81c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // URL queries use "+" for space. This flag controls that replacement.
82c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    REPLACE_PLUS_WITH_SPACE = 16,
83c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  };
84c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott};
85c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
86c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Unescapes |escaped_text| and returns the result.
87c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Unescaping consists of looking for the exact pattern "%XX", where each X is
88c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// a hex digit, and converting to the character with the numerical value of
89c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// those digits. Thus "i%20=%203%3b" unescapes to "i = 3;".
90c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//
91c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Watch out: this doesn't necessarily result in the correct final result,
92c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// because the encoding may be unknown. For example, the input might be ASCII,
93c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// which, after unescaping, is supposed to be interpreted as UTF-8, and then
94c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// converted into full wide chars. This function won't tell you if any
95c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// conversions need to take place, it only unescapes.
96c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::string UnescapeURLComponent(const std::string& escaped_text,
97c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                                 UnescapeRule::Type rules);
98c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstring16 UnescapeURLComponent(const string16& escaped_text,
99c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                              UnescapeRule::Type rules);
100c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
101c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Unescapes the given substring as a URL, and then tries to interpret the
102c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// result as being encoded as UTF-8. If the result is convertable into UTF-8, it
103c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// will be returned as converted. If it is not, the original escaped string will
104ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// be converted into a string16 and returned. (|offset[s]_for_adjustment|)
105ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// specifies one or more offsets into the source strings; each offset will be
106ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// adjusted to point at the same logical place in the result strings during
107ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// decoding.  If this isn't possible because an offset points past the end of
108ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// the source strings or into the middle of a multibyte sequence, the offending
109ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// offset will be set to std::wstring::npos. |offset[s]_for_adjustment| may be
110ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// NULL.
111c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstring16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
112c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                                           UnescapeRule::Type rules,
113c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                                           size_t* offset_for_adjustment);
114ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsenstring16 UnescapeAndDecodeUTF8URLComponentWithOffsets(
115ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    const std::string& text,
116ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    UnescapeRule::Type rules,
117ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen    std::vector<size_t>* offsets_for_adjustment);
118c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
119c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Unescape the following ampersand character codes from |text|:
120c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// &lt; &gt; &amp; &quot; &#39;
121c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstring16 UnescapeForHTML(const string16& text);
122c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Deprecated ------------------------------------------------------------------
124c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
125c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Escapes characters in text suitable for use as a query parameter value.
126c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// We %XX everything except alphanumerics and -_.!~*'()
127c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Spaces change to "+" unless you pass usePlus=false.
128c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// This is basically the same as encodeURIComponent in javascript.
129c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// For the string16 version, we do a conversion to charset before encoding the
130c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// string.  If the charset doesn't exist, we return false.
131a7dee89fba4aaa5f0be152cfc7c7b9b5cff98d51John ReckNET_EXPORT std::string EscapeQueryParamValue(const std::string& text, bool use_plus);
132c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool EscapeQueryParamValue(const string16& text, const char* codepage,
133c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                           bool use_plus, string16* escaped);
134c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
13572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen// A specialized version of EscapeQueryParamValue for string16s that
136c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// assumes the codepage is UTF8.  This is provided as a convenience.
13772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenstring16 EscapeQueryParamValueUTF8(const string16& text, bool use_plus);
138c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Private Functions (Exposed for Unit Testing) --------------------------------
140ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
// A function object meant to be called by std::for_each. It adjusts any offset
// which occurs after one or more encoded characters.
struct AdjustEncodingOffset {
  typedef std::vector<size_t> Adjustments;

  // NOTE(review): presumably binds the |adjustments| member below to the
  // argument, in which case the argument must outlive this object -- confirm
  // against the definition.
  explicit AdjustEncodingOffset(const Adjustments& adjustments);

  // Adjusts |offset|; it is taken by non-const reference so it can be updated
  // in place.
  void operator()(size_t& offset);

  // Held by reference, not copied.
  const Adjustments& adjustments;
};
151ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen
152c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif  // NET_BASE_ESCAPE_H_
153