url_parse.h revision 868fa2fe829687343ffae624259930155e16dbd8
1// Copyright 2013 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#ifndef URL_URL_PARSE_H_ 6#define URL_URL_PARSE_H_ 7 8#include <string> 9 10#include "base/basictypes.h" 11#include "base/string16.h" 12#include "url/url_export.h" 13 14namespace url_parse { 15 16// Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and 17// KURLGoogle.cpp still rely on this type. 18typedef char16 UTF16Char; 19 20// Component ------------------------------------------------------------------ 21 22// Represents a substring for URL parsing. 23struct Component { 24 Component() : begin(0), len(-1) {} 25 26 // Normal constructor: takes an offset and a length. 27 Component(int b, int l) : begin(b), len(l) {} 28 29 int end() const { 30 return begin + len; 31 } 32 33 // Returns true if this component is valid, meaning the length is given. Even 34 // valid components may be empty to record the fact that they exist. 35 bool is_valid() const { 36 return (len != -1); 37 } 38 39 // Returns true if the given component is specified on false, the component 40 // is either empty or invalid. 41 bool is_nonempty() const { 42 return (len > 0); 43 } 44 45 void reset() { 46 begin = 0; 47 len = -1; 48 } 49 50 bool operator==(const Component& other) const { 51 return begin == other.begin && len == other.len; 52 } 53 54 int begin; // Byte offset in the string of this component. 55 int len; // Will be -1 if the component is unspecified. 56}; 57 58// Helper that returns a component created with the given begin and ending 59// points. The ending point is non-inclusive. 60inline Component MakeRange(int begin, int end) { 61 return Component(begin, end - begin); 62} 63 64// Parsed --------------------------------------------------------------------- 65 66// A structure that holds the identified parts of an input URL. This structure 67// does NOT store the URL itself. The caller will have to store the URL text 68// and its corresponding Parsed structure separately. 69// 70// Typical usage would be: 71// 72// url_parse::Parsed parsed; 73// url_parse::Component scheme; 74// if (!url_parse::ExtractScheme(url, url_len, &scheme)) 75// return I_CAN_NOT_FIND_THE_SCHEME_DUDE; 76// 77// if (IsStandardScheme(url, scheme)) // Not provided by this component 78// url_parseParseStandardURL(url, url_len, &parsed); 79// else if (IsFileURL(url, scheme)) // Not provided by this component 80// url_parse::ParseFileURL(url, url_len, &parsed); 81// else 82// url_parse::ParsePathURL(url, url_len, &parsed); 83// 84struct URL_EXPORT Parsed { 85 // Identifies different components. 86 enum ComponentType { 87 SCHEME, 88 USERNAME, 89 PASSWORD, 90 HOST, 91 PORT, 92 PATH, 93 QUERY, 94 REF, 95 }; 96 97 // The default constructor is sufficient for the components, but inner_parsed_ 98 // requires special handling. 99 Parsed(); 100 Parsed(const Parsed&); 101 Parsed& operator=(const Parsed&); 102 ~Parsed(); 103 104 // Returns the length of the URL (the end of the last component). 105 // 106 // Note that for some invalid, non-canonical URLs, this may not be the length 107 // of the string. For example "http://": the parsed structure will only 108 // contain an entry for the four-character scheme, and it doesn't know about 109 // the "://". For all other last-components, it will return the real length. 110 int Length() const; 111 112 // Returns the number of characters before the given component if it exists, 113 // or where the component would be if it did exist. This will return the 114 // string length if the component would be appended to the end. 115 // 116 // Note that this can get a little funny for the port, query, and ref 117 // components which have a delimiter that is not counted as part of the 118 // component. The |include_delimiter| flag controls if you want this counted 119 // as part of the component or not when the component exists. 120 // 121 // This example shows the difference between the two flags for two of these 122 // delimited components that is present (the port and query) and one that 123 // isn't (the reference). The components that this flag affects are marked 124 // with a *. 125 // 0 1 2 126 // 012345678901234567890 127 // Example input: http://foo:80/?query 128 // include_delim=true, ...=false ("<-" indicates different) 129 // SCHEME: 0 0 130 // USERNAME: 5 5 131 // PASSWORD: 5 5 132 // HOST: 7 7 133 // *PORT: 10 11 <- 134 // PATH: 13 13 135 // *QUERY: 14 15 <- 136 // *REF: 20 20 137 // 138 int CountCharactersBefore(ComponentType type, bool include_delimiter) const; 139 140 // Scheme without the colon: "http://foo"/ would have a scheme of "http". 141 // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there 142 // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed 143 // to start at the beginning of the string if there are preceeding whitespace 144 // or control characters. 145 Component scheme; 146 147 // Username. Specified in URLs with an @ sign before the host. See |password| 148 Component username; 149 150 // Password. The length will be -1 if unspecified, 0 if specified but empty. 151 // Not all URLs with a username have a password, as in "http://me@host/". 152 // The password is separated form the username with a colon, as in 153 // "http://me:secret@host/" 154 Component password; 155 156 // Host name. 157 Component host; 158 159 // Port number. 160 Component port; 161 162 // Path, this is everything following the host name. Length will be -1 if 163 // unspecified. This includes the preceeding slash, so the path on 164 // http://www.google.com/asdf" is "/asdf". As a result, it is impossible to 165 // have a 0 length path, it will be -1 in cases like "http://host?foo". 166 // Note that we treat backslashes the same as slashes. 167 Component path; 168 169 // Stuff between the ? and the # after the path. This does not include the 170 // preceeding ? character. Length will be -1 if unspecified, 0 if there is 171 // a question mark but no query string. 172 Component query; 173 174 // Indicated by a #, this is everything following the hash sign (not 175 // including it). If there are multiple hash signs, we'll use the last one. 176 // Length will be -1 if there is no hash sign, or 0 if there is one but 177 // nothing follows it. 178 Component ref; 179 180 // This is used for nested URL types, currently only filesystem. If you 181 // parse a filesystem URL, the resulting Parsed will have a nested 182 // inner_parsed_ to hold the parsed inner URL's component information. 183 // For all other url types [including the inner URL], it will be NULL. 184 Parsed* inner_parsed() const { 185 return inner_parsed_; 186 } 187 188 void set_inner_parsed(const Parsed& inner_parsed) { 189 if (!inner_parsed_) 190 inner_parsed_ = new Parsed(inner_parsed); 191 else 192 *inner_parsed_ = inner_parsed; 193 } 194 195 void clear_inner_parsed() { 196 if (inner_parsed_) { 197 delete inner_parsed_; 198 inner_parsed_ = NULL; 199 } 200 } 201 202 private: 203 Parsed* inner_parsed_; // This object is owned and managed by this struct. 204}; 205 206// Initialization functions --------------------------------------------------- 207// 208// These functions parse the given URL, filling in all of the structure's 209// components. These functions can not fail, they will always do their best 210// at interpreting the input given. 211// 212// The string length of the URL MUST be specified, we do not check for NULLs 213// at any point in the process, and will actually handle embedded NULLs. 214// 215// IMPORTANT: These functions do NOT hang on to the given pointer or copy it 216// in any way. See the comment above the struct. 217// 218// The 8-bit versions require UTF-8 encoding. 219 220// StandardURL is for when the scheme is known to be one that has an 221// authority (host) like "http". This function will not handle weird ones 222// like "about:" and "javascript:", or do the right thing for "file:" URLs. 223URL_EXPORT void ParseStandardURL(const char* url, 224 int url_len, 225 Parsed* parsed); 226URL_EXPORT void ParseStandardURL(const char16* url, 227 int url_len, 228 Parsed* parsed); 229 230// PathURL is for when the scheme is known not to have an authority (host) 231// section but that aren't file URLs either. The scheme is parsed, and 232// everything after the scheme is considered as the path. This is used for 233// things like "about:" and "javascript:" 234URL_EXPORT void ParsePathURL(const char* url, int url_len, Parsed* parsed); 235URL_EXPORT void ParsePathURL(const char16* url, int url_len, Parsed* parsed); 236 237// FileURL is for file URLs. There are some special rules for interpreting 238// these. 239URL_EXPORT void ParseFileURL(const char* url, int url_len, Parsed* parsed); 240URL_EXPORT void ParseFileURL(const char16* url, int url_len, Parsed* parsed); 241 242// Filesystem URLs are structured differently than other URLs. 243URL_EXPORT void ParseFileSystemURL(const char* url, 244 int url_len, 245 Parsed* parsed); 246URL_EXPORT void ParseFileSystemURL(const char16* url, 247 int url_len, 248 Parsed* parsed); 249 250// MailtoURL is for mailto: urls. They are made up scheme,path,query 251URL_EXPORT void ParseMailtoURL(const char* url, int url_len, Parsed* parsed); 252URL_EXPORT void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed); 253 254// Helper functions ----------------------------------------------------------- 255 256// Locates the scheme according to the URL parser's rules. This function is 257// designed so the caller can find the scheme and call the correct Init* 258// function according to their known scheme types. 259// 260// It also does not perform any validation on the scheme. 261// 262// This function will return true if the scheme is found and will put the 263// scheme's range into *scheme. False means no scheme could be found. Note 264// that a URL beginning with a colon has a scheme, but it is empty, so this 265// function will return true but *scheme will = (0,0). 266// 267// The scheme is found by skipping spaces and control characters at the 268// beginning, and taking everything from there to the first colon to be the 269// scheme. The character at scheme.end() will be the colon (we may enhance 270// this to handle full width colons or something, so don't count on the 271// actual character value). The character at scheme.end()+1 will be the 272// beginning of the rest of the URL, be it the authority or the path (or the 273// end of the string). 274// 275// The 8-bit version requires UTF-8 encoding. 276URL_EXPORT bool ExtractScheme(const char* url, 277 int url_len, 278 Component* scheme); 279URL_EXPORT bool ExtractScheme(const char16* url, 280 int url_len, 281 Component* scheme); 282 283// Returns true if ch is a character that terminates the authority segment 284// of a URL. 285URL_EXPORT bool IsAuthorityTerminator(char16 ch); 286 287// Does a best effort parse of input |spec|, in range |auth|. If a particular 288// component is not found, it will be set to invalid. 289URL_EXPORT void ParseAuthority(const char* spec, 290 const Component& auth, 291 Component* username, 292 Component* password, 293 Component* hostname, 294 Component* port_num); 295URL_EXPORT void ParseAuthority(const char16* spec, 296 const Component& auth, 297 Component* username, 298 Component* password, 299 Component* hostname, 300 Component* port_num); 301 302// Computes the integer port value from the given port component. The port 303// component should have been identified by one of the init functions on 304// |Parsed| for the given input url. 305// 306// The return value will be a positive integer between 0 and 64K, or one of 307// the two special values below. 308enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 }; 309URL_EXPORT int ParsePort(const char* url, const Component& port); 310URL_EXPORT int ParsePort(const char16* url, const Component& port); 311 312// Extracts the range of the file name in the given url. The path must 313// already have been computed by the parse function, and the matching URL 314// and extracted path are provided to this function. The filename is 315// defined as being everything from the last slash/backslash of the path 316// to the end of the path. 317// 318// The file name will be empty if the path is empty or there is nothing 319// following the last slash. 320// 321// The 8-bit version requires UTF-8 encoding. 322URL_EXPORT void ExtractFileName(const char* url, 323 const Component& path, 324 Component* file_name); 325URL_EXPORT void ExtractFileName(const char16* url, 326 const Component& path, 327 Component* file_name); 328 329// Extract the first key/value from the range defined by |*query|. Updates 330// |*query| to start at the end of the extracted key/value pair. This is 331// designed for use in a loop: you can keep calling it with the same query 332// object and it will iterate over all items in the query. 333// 334// Some key/value pairs may have the key, the value, or both be empty (for 335// example, the query string "?&"). These will be returned. Note that an empty 336// last parameter "foo.com?" or foo.com?a&" will not be returned, this case 337// is the same as "done." 338// 339// The initial query component should not include the '?' (this is the default 340// for parsed URLs). 341// 342// If no key/value are found |*key| and |*value| will be unchanged and it will 343// return false. 344URL_EXPORT bool ExtractQueryKeyValue(const char* url, 345 Component* query, 346 Component* key, 347 Component* value); 348URL_EXPORT bool ExtractQueryKeyValue(const char16* url, 349 Component* query, 350 Component* key, 351 Component* value); 352 353} // namespace url_parse 354 355#endif // URL_URL_PARSE_H_ 356