// Copyright 2007, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "base/logging.h"
#include "googleurl/src/url_file.h"
#include "googleurl/src/url_parse.h"
#include "googleurl/src/url_parse_internal.h"

// Interesting IE file:isms...
//
//  INPUT                      OUTPUT
//  =========================  ==============================
//  file:/foo/bar              file:///foo/bar
//      The result here seems totally invalid!?!? This isn't UNC.
41// 42// file:/ 43// file:// or any other number of slashes 44// IE6 doesn't do anything at all if you click on this link. No error: 45// nothing. IE6's history system seems to always color this link, so I'm 46// guessing that it maps internally to the empty URL. 47// 48// C:\ file:///C:/ 49// When on a file: URL source page, this link will work. When over HTTP, 50// the file: URL will appear in the status bar but the link will not work 51// (security restriction for all file URLs). 52// 53// file:foo/ file:foo/ (invalid?!?!?) 54// file:/foo/ file:///foo/ (invalid?!?!?) 55// file://foo/ file://foo/ (UNC to server "foo") 56// file:///foo/ file:///foo/ (invalid, seems to be a file) 57// file:////foo/ file://foo/ (UNC to server "foo") 58// Any more than four slashes is also treated as UNC. 59// 60// file:C:/ file://C:/ 61// file:/C:/ file://C:/ 62// The number of slashes after "file:" don't matter if the thing following 63// it looks like an absolute drive path. Also, slashes and backslashes are 64// equally valid here. 65 66namespace url_parse { 67 68namespace { 69 70// A subcomponent of DoInitFileURL, the input of this function should be a UNC 71// path name, with the index of the first character after the slashes following 72// the scheme given in |after_slashes|. This will initialize the host, path, 73// query, and ref, and leave the other output components untouched 74// (DoInitFileURL handles these for us). 75template<typename CHAR> 76void DoParseUNC(const CHAR* spec, 77 int after_slashes, 78 int spec_len, 79 Parsed* parsed) { 80 int next_slash = FindNextSlash(spec, after_slashes, spec_len); 81 if (next_slash == spec_len) { 82 // No additional slash found, as in "file://foo", treat the text as the 83 // host with no path (this will end up being UNC to server "foo"). 
84 int host_len = spec_len - after_slashes; 85 if (host_len) 86 parsed->host = Component(after_slashes, host_len); 87 else 88 parsed->host.reset(); 89 parsed->path.reset(); 90 return; 91 } 92 93#ifdef WIN32 94 // See if we have something that looks like a path following the first 95 // component. As in "file://localhost/c:/", we get "c:/" out. We want to 96 // treat this as a having no host but the path given. Works on Windows only. 97 if (DoesBeginWindowsDriveSpec(spec, next_slash + 1, spec_len)) { 98 parsed->host.reset(); 99 ParsePathInternal(spec, MakeRange(next_slash, spec_len), 100 &parsed->path, &parsed->query, &parsed->ref); 101 return; 102 } 103#endif 104 105 // Otherwise, everything up until that first slash we found is the host name, 106 // which will end up being the UNC host. For example "file://foo/bar.txt" 107 // will get a server name of "foo" and a path of "/bar". Later, on Windows, 108 // this should be treated as the filename "\\foo\bar.txt" in proper UNC 109 // notation. 110 int host_len = next_slash - after_slashes; 111 if (host_len) 112 parsed->host = MakeRange(after_slashes, next_slash); 113 else 114 parsed->host.reset(); 115 if (next_slash < spec_len) { 116 ParsePathInternal(spec, MakeRange(next_slash, spec_len), 117 &parsed->path, &parsed->query, &parsed->ref); 118 } else { 119 parsed->path.reset(); 120 } 121} 122 123// A subcomponent of DoParseFileURL, the input should be a local file, with the 124// beginning of the path indicated by the index in |path_begin|. This will 125// initialize the host, path, query, and ref, and leave the other output 126// components untouched (DoInitFileURL handles these for us). 
127template<typename CHAR> 128void DoParseLocalFile(const CHAR* spec, 129 int path_begin, 130 int spec_len, 131 Parsed* parsed) { 132 parsed->host.reset(); 133 ParsePathInternal(spec, MakeRange(path_begin, spec_len), 134 &parsed->path, &parsed->query, &parsed->ref); 135} 136 137// Backend for the external functions that operates on either char type. 138// We are handed the character after the "file:" at the beginning of the spec. 139// Usually this is a slash, but needn't be; we allow paths like "file:c:\foo". 140template<typename CHAR> 141void DoParseFileURL(const CHAR* spec, int spec_len, Parsed* parsed) { 142 DCHECK(spec_len >= 0); 143 144 // Get the parts we never use for file URLs out of the way. 145 parsed->username.reset(); 146 parsed->password.reset(); 147 parsed->port.reset(); 148 149 // Many of the code paths don't set these, so it's convenient to just clear 150 // them. We'll write them in those cases we need them. 151 parsed->query.reset(); 152 parsed->ref.reset(); 153 154 // Strip leading & trailing spaces and control characters. 155 int begin = 0; 156 TrimURL(spec, &begin, &spec_len); 157 158 // Find the scheme. 159 int num_slashes; 160 int after_scheme; 161 int after_slashes; 162#ifdef WIN32 163 // See how many slashes there are. We want to handle cases like UNC but also 164 // "/c:/foo". This is when there is no scheme, so we can allow pages to do 165 // links like "c:/foo/bar" or "//foo/bar". This is also called by the 166 // relative URL resolver when it determines there is an absolute URL, which 167 // may give us input like "/c:/foo". 168 num_slashes = CountConsecutiveSlashes(spec, begin, spec_len); 169 after_slashes = begin + num_slashes; 170 if (DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len)) { 171 // Windows path, don't try to extract the scheme (for example, "c:\foo"). 
172 parsed->scheme.reset(); 173 after_scheme = after_slashes; 174 } else if (DoesBeginUNCPath(spec, begin, spec_len, false)) { 175 // Windows UNC path: don't try to extract the scheme, but keep the slashes. 176 parsed->scheme.reset(); 177 after_scheme = begin; 178 } else 179#endif 180 { 181 if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { 182 // Offset the results since we gave ExtractScheme a substring. 183 parsed->scheme.begin += begin; 184 after_scheme = parsed->scheme.end() + 1; 185 } else { 186 // No scheme found, remember that. 187 parsed->scheme.reset(); 188 after_scheme = begin; 189 } 190 } 191 192 // Handle empty specs ones that contain only whitespace or control chars, 193 // or that are just the scheme (for example "file:"). 194 if (after_scheme == spec_len) { 195 parsed->host.reset(); 196 parsed->path.reset(); 197 return; 198 } 199 200 num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); 201 202 after_slashes = after_scheme + num_slashes; 203#ifdef WIN32 204 // Check whether the input is a drive again. We checked above for windows 205 // drive specs, but that's only at the very beginning to see if we have a 206 // scheme at all. This test will be duplicated in that case, but will 207 // additionally handle all cases with a real scheme such as "file:///C:/". 208 if (!DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len) && 209 num_slashes != 3) { 210 // Anything not beginning with a drive spec ("c:\") on Windows is treated 211 // as UNC, with the exception of three slashes which always means a file. 212 // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails. 213 DoParseUNC(spec, after_slashes, spec_len, parsed); 214 return; 215 } 216#else 217 // file: URL with exactly 2 slashes is considered to have a host component. 
218 if (num_slashes == 2) { 219 DoParseUNC(spec, after_slashes, spec_len, parsed); 220 return; 221 } 222#endif // WIN32 223 224 // Easy and common case, the full path immediately follows the scheme 225 // (modulo slashes), as in "file://c:/foo". Just treat everything from 226 // there to the end as the path. Empty hosts have 0 length instead of -1. 227 // We include the last slash as part of the path if there is one. 228 DoParseLocalFile(spec, 229 num_slashes > 0 ? after_scheme + num_slashes - 1 : after_scheme, 230 spec_len, parsed); 231} 232 233} // namespace 234 235void ParseFileURL(const char* url, int url_len, Parsed* parsed) { 236 DoParseFileURL(url, url_len, parsed); 237} 238 239void ParseFileURL(const char16* url, int url_len, Parsed* parsed) { 240 DoParseFileURL(url, url_len, parsed); 241} 242 243} // namespace url_parse 244