1// Copyright 2013 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#ifndef URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ 6#define URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ 7 8#include <string> 9 10#include "base/basictypes.h" 11#include "base/strings/string16.h" 12#include "url/url_export.h" 13 14namespace url { 15 16// Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and 17// KURLGoogle.cpp still rely on this type. 18typedef base::char16 UTF16Char; 19 20// Component ------------------------------------------------------------------ 21 22// Represents a substring for URL parsing. 23struct Component { 24 Component() : begin(0), len(-1) {} 25 26 // Normal constructor: takes an offset and a length. 27 Component(int b, int l) : begin(b), len(l) {} 28 29 int end() const { 30 return begin + len; 31 } 32 33 // Returns true if this component is valid, meaning the length is given. Even 34 // valid components may be empty to record the fact that they exist. 35 bool is_valid() const { 36 return (len != -1); 37 } 38 39 // Returns true if the given component is specified on false, the component 40 // is either empty or invalid. 41 bool is_nonempty() const { 42 return (len > 0); 43 } 44 45 void reset() { 46 begin = 0; 47 len = -1; 48 } 49 50 bool operator==(const Component& other) const { 51 return begin == other.begin && len == other.len; 52 } 53 54 int begin; // Byte offset in the string of this component. 55 int len; // Will be -1 if the component is unspecified. 56}; 57 58// Helper that returns a component created with the given begin and ending 59// points. The ending point is non-inclusive. 60inline Component MakeRange(int begin, int end) { 61 return Component(begin, end - begin); 62} 63 64// Parsed --------------------------------------------------------------------- 65 66// A structure that holds the identified parts of an input URL. This structure 67// does NOT store the URL itself. The caller will have to store the URL text 68// and its corresponding Parsed structure separately. 69// 70// Typical usage would be: 71// 72// Parsed parsed; 73// Component scheme; 74// if (!ExtractScheme(url, url_len, &scheme)) 75// return I_CAN_NOT_FIND_THE_SCHEME_DUDE; 76// 77// if (IsStandardScheme(url, scheme)) // Not provided by this component 78// ParseStandardURL(url, url_len, &parsed); 79// else if (IsFileURL(url, scheme)) // Not provided by this component 80// ParseFileURL(url, url_len, &parsed); 81// else 82// ParsePathURL(url, url_len, &parsed); 83// 84struct URL_EXPORT Parsed { 85 // Identifies different components. 86 enum ComponentType { 87 SCHEME, 88 USERNAME, 89 PASSWORD, 90 HOST, 91 PORT, 92 PATH, 93 QUERY, 94 REF, 95 }; 96 97 // The default constructor is sufficient for the components, but inner_parsed_ 98 // requires special handling. 99 Parsed(); 100 Parsed(const Parsed&); 101 Parsed& operator=(const Parsed&); 102 ~Parsed(); 103 104 // Returns the length of the URL (the end of the last component). 105 // 106 // Note that for some invalid, non-canonical URLs, this may not be the length 107 // of the string. For example "http://": the parsed structure will only 108 // contain an entry for the four-character scheme, and it doesn't know about 109 // the "://". For all other last-components, it will return the real length. 110 int Length() const; 111 112 // Returns the number of characters before the given component if it exists, 113 // or where the component would be if it did exist. This will return the 114 // string length if the component would be appended to the end. 115 // 116 // Note that this can get a little funny for the port, query, and ref 117 // components which have a delimiter that is not counted as part of the 118 // component. The |include_delimiter| flag controls if you want this counted 119 // as part of the component or not when the component exists. 120 // 121 // This example shows the difference between the two flags for two of these 122 // delimited components that is present (the port and query) and one that 123 // isn't (the reference). The components that this flag affects are marked 124 // with a *. 125 // 0 1 2 126 // 012345678901234567890 127 // Example input: http://foo:80/?query 128 // include_delim=true, ...=false ("<-" indicates different) 129 // SCHEME: 0 0 130 // USERNAME: 5 5 131 // PASSWORD: 5 5 132 // HOST: 7 7 133 // *PORT: 10 11 <- 134 // PATH: 13 13 135 // *QUERY: 14 15 <- 136 // *REF: 20 20 137 // 138 int CountCharactersBefore(ComponentType type, bool include_delimiter) const; 139 140 // Scheme without the colon: "http://foo"/ would have a scheme of "http". 141 // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there 142 // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed 143 // to start at the beginning of the string if there are preceeding whitespace 144 // or control characters. 145 Component scheme; 146 147 // Username. Specified in URLs with an @ sign before the host. See |password| 148 Component username; 149 150 // Password. The length will be -1 if unspecified, 0 if specified but empty. 151 // Not all URLs with a username have a password, as in "http://me@host/". 152 // The password is separated form the username with a colon, as in 153 // "http://me:secret@host/" 154 Component password; 155 156 // Host name. 157 Component host; 158 159 // Port number. 160 Component port; 161 162 // Path, this is everything following the host name, stopping at the query of 163 // ref delimiter (if any). Length will be -1 if unspecified. This includes 164 // the preceeding slash, so the path on http://www.google.com/asdf" is 165 // "/asdf". As a result, it is impossible to have a 0 length path, it will 166 // be -1 in cases like "http://host?foo". 167 // Note that we treat backslashes the same as slashes. 168 Component path; 169 170 // Stuff between the ? and the # after the path. This does not include the 171 // preceeding ? character. Length will be -1 if unspecified, 0 if there is 172 // a question mark but no query string. 173 Component query; 174 175 // Indicated by a #, this is everything following the hash sign (not 176 // including it). If there are multiple hash signs, we'll use the last one. 177 // Length will be -1 if there is no hash sign, or 0 if there is one but 178 // nothing follows it. 179 Component ref; 180 181 // The URL spec from the character after the scheme: until the end of the 182 // URL, regardless of the scheme. This is mostly useful for 'opaque' non- 183 // hierarchical schemes like data: and javascript: as a convient way to get 184 // the string with the scheme stripped off. 185 Component GetContent() const; 186 187 // This is used for nested URL types, currently only filesystem. If you 188 // parse a filesystem URL, the resulting Parsed will have a nested 189 // inner_parsed_ to hold the parsed inner URL's component information. 190 // For all other url types [including the inner URL], it will be NULL. 191 Parsed* inner_parsed() const { 192 return inner_parsed_; 193 } 194 195 void set_inner_parsed(const Parsed& inner_parsed) { 196 if (!inner_parsed_) 197 inner_parsed_ = new Parsed(inner_parsed); 198 else 199 *inner_parsed_ = inner_parsed; 200 } 201 202 void clear_inner_parsed() { 203 if (inner_parsed_) { 204 delete inner_parsed_; 205 inner_parsed_ = NULL; 206 } 207 } 208 209 private: 210 Parsed* inner_parsed_; // This object is owned and managed by this struct. 211}; 212 213// Initialization functions --------------------------------------------------- 214// 215// These functions parse the given URL, filling in all of the structure's 216// components. These functions can not fail, they will always do their best 217// at interpreting the input given. 218// 219// The string length of the URL MUST be specified, we do not check for NULLs 220// at any point in the process, and will actually handle embedded NULLs. 221// 222// IMPORTANT: These functions do NOT hang on to the given pointer or copy it 223// in any way. See the comment above the struct. 224// 225// The 8-bit versions require UTF-8 encoding. 226 227// StandardURL is for when the scheme is known to be one that has an 228// authority (host) like "http". This function will not handle weird ones 229// like "about:" and "javascript:", or do the right thing for "file:" URLs. 230URL_EXPORT void ParseStandardURL(const char* url, 231 int url_len, 232 Parsed* parsed); 233URL_EXPORT void ParseStandardURL(const base::char16* url, 234 int url_len, 235 Parsed* parsed); 236 237// PathURL is for when the scheme is known not to have an authority (host) 238// section but that aren't file URLs either. The scheme is parsed, and 239// everything after the scheme is considered as the path. This is used for 240// things like "about:" and "javascript:" 241URL_EXPORT void ParsePathURL(const char* url, 242 int url_len, 243 bool trim_path_end, 244 Parsed* parsed); 245URL_EXPORT void ParsePathURL(const base::char16* url, 246 int url_len, 247 bool trim_path_end, 248 Parsed* parsed); 249 250// FileURL is for file URLs. There are some special rules for interpreting 251// these. 252URL_EXPORT void ParseFileURL(const char* url, int url_len, Parsed* parsed); 253URL_EXPORT void ParseFileURL(const base::char16* url, 254 int url_len, 255 Parsed* parsed); 256 257// Filesystem URLs are structured differently than other URLs. 258URL_EXPORT void ParseFileSystemURL(const char* url, 259 int url_len, 260 Parsed* parsed); 261URL_EXPORT void ParseFileSystemURL(const base::char16* url, 262 int url_len, 263 Parsed* parsed); 264 265// MailtoURL is for mailto: urls. They are made up scheme,path,query 266URL_EXPORT void ParseMailtoURL(const char* url, int url_len, Parsed* parsed); 267URL_EXPORT void ParseMailtoURL(const base::char16* url, 268 int url_len, 269 Parsed* parsed); 270 271// Helper functions ----------------------------------------------------------- 272 273// Locates the scheme according to the URL parser's rules. This function is 274// designed so the caller can find the scheme and call the correct Init* 275// function according to their known scheme types. 276// 277// It also does not perform any validation on the scheme. 278// 279// This function will return true if the scheme is found and will put the 280// scheme's range into *scheme. False means no scheme could be found. Note 281// that a URL beginning with a colon has a scheme, but it is empty, so this 282// function will return true but *scheme will = (0,0). 283// 284// The scheme is found by skipping spaces and control characters at the 285// beginning, and taking everything from there to the first colon to be the 286// scheme. The character at scheme.end() will be the colon (we may enhance 287// this to handle full width colons or something, so don't count on the 288// actual character value). The character at scheme.end()+1 will be the 289// beginning of the rest of the URL, be it the authority or the path (or the 290// end of the string). 291// 292// The 8-bit version requires UTF-8 encoding. 293URL_EXPORT bool ExtractScheme(const char* url, 294 int url_len, 295 Component* scheme); 296URL_EXPORT bool ExtractScheme(const base::char16* url, 297 int url_len, 298 Component* scheme); 299 300// Returns true if ch is a character that terminates the authority segment 301// of a URL. 302URL_EXPORT bool IsAuthorityTerminator(base::char16 ch); 303 304// Does a best effort parse of input |spec|, in range |auth|. If a particular 305// component is not found, it will be set to invalid. 306URL_EXPORT void ParseAuthority(const char* spec, 307 const Component& auth, 308 Component* username, 309 Component* password, 310 Component* hostname, 311 Component* port_num); 312URL_EXPORT void ParseAuthority(const base::char16* spec, 313 const Component& auth, 314 Component* username, 315 Component* password, 316 Component* hostname, 317 Component* port_num); 318 319// Computes the integer port value from the given port component. The port 320// component should have been identified by one of the init functions on 321// |Parsed| for the given input url. 322// 323// The return value will be a positive integer between 0 and 64K, or one of 324// the two special values below. 325enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 }; 326URL_EXPORT int ParsePort(const char* url, const Component& port); 327URL_EXPORT int ParsePort(const base::char16* url, const Component& port); 328 329// Extracts the range of the file name in the given url. The path must 330// already have been computed by the parse function, and the matching URL 331// and extracted path are provided to this function. The filename is 332// defined as being everything from the last slash/backslash of the path 333// to the end of the path. 334// 335// The file name will be empty if the path is empty or there is nothing 336// following the last slash. 337// 338// The 8-bit version requires UTF-8 encoding. 339URL_EXPORT void ExtractFileName(const char* url, 340 const Component& path, 341 Component* file_name); 342URL_EXPORT void ExtractFileName(const base::char16* url, 343 const Component& path, 344 Component* file_name); 345 346// Extract the first key/value from the range defined by |*query|. Updates 347// |*query| to start at the end of the extracted key/value pair. This is 348// designed for use in a loop: you can keep calling it with the same query 349// object and it will iterate over all items in the query. 350// 351// Some key/value pairs may have the key, the value, or both be empty (for 352// example, the query string "?&"). These will be returned. Note that an empty 353// last parameter "foo.com?" or foo.com?a&" will not be returned, this case 354// is the same as "done." 355// 356// The initial query component should not include the '?' (this is the default 357// for parsed URLs). 358// 359// If no key/value are found |*key| and |*value| will be unchanged and it will 360// return false. 361URL_EXPORT bool ExtractQueryKeyValue(const char* url, 362 Component* query, 363 Component* key, 364 Component* value); 365URL_EXPORT bool ExtractQueryKeyValue(const base::char16* url, 366 Component* query, 367 Component* key, 368 Component* value); 369 370} // namespace url 371 372#endif // URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ 373