172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen// Copyright (c) 2011 The Chromium Authors. All rights reserved. 2c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Use of this source code is governed by a BSD-style license that can be 3c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// found in the LICENSE file. 4c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 5c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#ifndef NET_BASE_ESCAPE_H_ 6c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#define NET_BASE_ESCAPE_H_ 73345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#pragma once 8c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 9c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include <string> 10ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include <vector> 11c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 12c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/basictypes.h" 13c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/string16.h" 14a7dee89fba4aaa5f0be152cfc7c7b9b5cff98d51John Reck#include "net/base/net_export.h" 15c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 16c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Escaping -------------------------------------------------------------------- 17c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 18c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Escape a file. This includes: 19c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|} 20c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::string EscapePath(const std::string& path); 21c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 22c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Escape application/x-www-form-urlencoded content. This includes: 23c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|} 24c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Space is escaped as + and other special characters as %XX (hex). 25c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::string EscapeUrlEncodedData(const std::string& path); 26c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 27c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Escape all non-ASCII input. 28c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::string EscapeNonASCII(const std::string& input); 29c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 30c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Escapes characters in text suitable for use as an external protocol handler 31c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// command. 32c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// We %XX everything except alphanumerics and %-_.!~*'() and the restricted 33c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// chracters (;/?:@&=+$,). 34c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::string EscapeExternalHandlerValue(const std::string& text); 35c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 36c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Append the given character to the output string, escaping the character if 37c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// the character would be interpretted as an HTML delimiter. 38c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid AppendEscapedCharForHTML(char c, std::string* output); 39c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 40c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Escape chars that might cause this text to be interpretted as HTML tags. 41c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::string EscapeForHTML(const std::string& text); 42c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstring16 EscapeForHTML(const string16& text); 43c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 44c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Unescaping ------------------------------------------------------------------ 45c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 46c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottclass UnescapeRule { 47c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott public: 48c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // A combination of the following flags that is passed to the unescaping 49c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // functions. 50c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott typedef uint32 Type; 51c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 52c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott enum { 53c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Don't unescape anything at all. 54c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott NONE = 0, 55c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 56c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Don't unescape anything special, but all normal unescaping will happen. 57c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // This is a placeholder and can't be combined with other flags (since it's 58c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // just the absence of them). All other unescape rules imply "normal" in 59c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // addition to their special meaning. Things like escaped letters, digits, 60c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // and most symbols will get unescaped with this mode. 61c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott NORMAL = 1, 62c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 63c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Convert %20 to spaces. In some places where we're showing URLs, we may 64c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // want this. In places where the URL may be copied and pasted out, then 65c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // you wouldn't want this since it might not be interpreted in one piece 66c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // by other applications. 67c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott SPACES = 2, 68c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 69c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Unescapes various characters that will change the meaning of URLs, 70c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // including '%', '+', '&', '/', '#'. If we unescaped these characters, the 71c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // resulting URL won't be the same as the source one. This flag is used when 72c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // generating final output like filenames for URLs where we won't be 73c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // interpreting as a URL and want to do as much unescaping as possible. 74c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott URL_SPECIAL_CHARS = 4, 75c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 76c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Unescapes control characters such as %01. This INCLUDES NULLs. This is 77c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // used for rare cases such as data: URL decoding where the result is binary 78c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // data. You should not use this for normal URLs! 79c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott CONTROL_CHARS = 8, 80c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 81c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // URL queries use "+" for space. This flag controls that replacement. 82c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott REPLACE_PLUS_WITH_SPACE = 16, 83c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott }; 84c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}; 85c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 86c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Unescapes |escaped_text| and returns the result. 87c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Unescaping consists of looking for the exact pattern "%XX", where each X is 88c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// a hex digit, and converting to the character with the numerical value of 89c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// those digits. Thus "i%20=%203%3b" unescapes to "i = 3;". 90c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// 91c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Watch out: this doesn't necessarily result in the correct final result, 92c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// because the encoding may be unknown. For example, the input might be ASCII, 93c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// which, after unescaping, is supposed to be interpreted as UTF-8, and then 94c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// converted into full wide chars. This function won't tell you if any 95c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// conversions need to take place, it only unescapes. 96c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstd::string UnescapeURLComponent(const std::string& escaped_text, 97c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UnescapeRule::Type rules); 98c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstring16 UnescapeURLComponent(const string16& escaped_text, 99c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UnescapeRule::Type rules); 100c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 101c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Unescapes the given substring as a URL, and then tries to interpret the 102c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// result as being encoded as UTF-8. If the result is convertable into UTF-8, it 103c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// will be returned as converted. If it is not, the original escaped string will 104ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// be converted into a string16 and returned. (|offset[s]_for_adjustment|) 105ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// specifies one or more offsets into the source strings; each offset will be 106ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// adjusted to point at the same logical place in the result strings during 107ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// decoding. If this isn't possible because an offset points past the end of 108ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// the source strings or into the middle of a multibyte sequence, the offending 109ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// offset will be set to std::wstring::npos. |offset[s]_for_adjustment| may be 110ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// NULL. 111c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstring16 UnescapeAndDecodeUTF8URLComponent(const std::string& text, 112c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UnescapeRule::Type rules, 113c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott size_t* offset_for_adjustment); 114ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsenstring16 UnescapeAndDecodeUTF8URLComponentWithOffsets( 115ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen const std::string& text, 116ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen UnescapeRule::Type rules, 117ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen std::vector<size_t>* offsets_for_adjustment); 118c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 119c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Unescape the following ampersand character codes from |text|: 120c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// < > & " ' 121c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstring16 UnescapeForHTML(const string16& text); 122c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 123c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Deprecated ------------------------------------------------------------------ 124c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 125c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Escapes characters in text suitable for use as a query parameter value. 126c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// We %XX everything except alphanumerics and -_.!~*'() 127c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Spaces change to "+" unless you pass usePlus=false. 128c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// This is basically the same as encodeURIComponent in javascript. 129c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// For the string16 version, we do a conversion to charset before encoding the 130c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// string. If the charset doesn't exist, we return false. 131a7dee89fba4aaa5f0be152cfc7c7b9b5cff98d51John ReckNET_EXPORT std::string EscapeQueryParamValue(const std::string& text, bool use_plus); 132c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool EscapeQueryParamValue(const string16& text, const char* codepage, 133c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott bool use_plus, string16* escaped); 134c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 13572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen// A specialized version of EscapeQueryParamValue for string16s that 136c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// assumes the codepage is UTF8. This is provided as a convenience. 13772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenstring16 EscapeQueryParamValueUTF8(const string16& text, bool use_plus); 138c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 139ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// Private Functions (Exposed for Unit Testing) -------------------------------- 140ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen 141ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// A function called by std::for_each that will adjust any offset which occurs 142ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// after one or more encoded characters. 143ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsenstruct AdjustEncodingOffset { 144ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen typedef std::vector<size_t> Adjustments; 145ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen 146ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen explicit AdjustEncodingOffset(const Adjustments& adjustments); 147ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen void operator()(size_t& offset); 148ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen 149ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen const Adjustments& adjustments; 150ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen}; 151ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen 152c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif // NET_BASE_ESCAPE_H_ 153