1// Copyright 2007, Google Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10//     * Redistributions in binary form must reproduce the above
11// copyright notice, this list of conditions and the following disclaimer
12// in the documentation and/or other materials provided with the
13// distribution.
14//     * Neither the name of Google Inc. nor the names of its
15// contributors may be used to endorse or promote products derived from
16// this software without specific prior written permission.
17//
18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30#include "base/logging.h"
31#include "googleurl/src/url_file.h"
32#include "googleurl/src/url_parse.h"
33#include "googleurl/src/url_parse_internal.h"
34
35// Interesting IE file:isms...
36//
37//  INPUT                      OUTPUT
38//  =========================  ==============================
39//  file:/foo/bar              file:///foo/bar
40//      The result here seems totally invalid!?!? This isn't UNC.
41//
42//  file:/
43//  file:// or any other number of slashes
44//      IE6 doesn't do anything at all if you click on this link. No error:
45//      nothing. IE6's history system seems to always color this link, so I'm
46//      guessing that it maps internally to the empty URL.
47//
48//  C:\                        file:///C:/
49//      When on a file: URL source page, this link will work. When over HTTP,
50//      the file: URL will appear in the status bar but the link will not work
51//      (security restriction for all file URLs).
52//
53//  file:foo/                  file:foo/     (invalid?!?!?)
54//  file:/foo/                 file:///foo/  (invalid?!?!?)
55//  file://foo/                file://foo/   (UNC to server "foo")
56//  file:///foo/               file:///foo/  (invalid, seems to be a file)
57//  file:////foo/              file://foo/   (UNC to server "foo")
58//      Any more than four slashes is also treated as UNC.
59//
60//  file:C:/                   file://C:/
61//  file:/C:/                  file://C:/
//      The number of slashes after "file:" doesn't matter if the thing
//      following it looks like an absolute drive path. Also, slashes and
//      backslashes are equally valid here.
65
66namespace url_parse {
67
68namespace {
69
70// A subcomponent of DoInitFileURL, the input of this function should be a UNC
71// path name, with the index of the first character after the slashes following
72// the scheme given in |after_slashes|. This will initialize the host, path,
73// query, and ref, and leave the other output components untouched
74// (DoInitFileURL handles these for us).
75template<typename CHAR>
76void DoParseUNC(const CHAR* spec,
77                int after_slashes,
78                int spec_len,
79               Parsed* parsed) {
80  int next_slash = FindNextSlash(spec, after_slashes, spec_len);
81  if (next_slash == spec_len) {
82    // No additional slash found, as in "file://foo", treat the text as the
83    // host with no path (this will end up being UNC to server "foo").
84    int host_len = spec_len - after_slashes;
85    if (host_len)
86      parsed->host = Component(after_slashes, host_len);
87    else
88      parsed->host.reset();
89    parsed->path.reset();
90    return;
91  }
92
93#ifdef WIN32
94  // See if we have something that looks like a path following the first
95  // component. As in "file://localhost/c:/", we get "c:/" out. We want to
96  // treat this as a having no host but the path given. Works on Windows only.
97  if (DoesBeginWindowsDriveSpec(spec, next_slash + 1, spec_len)) {
98    parsed->host.reset();
99    ParsePathInternal(spec, MakeRange(next_slash, spec_len),
100                      &parsed->path, &parsed->query, &parsed->ref);
101    return;
102  }
103#endif
104
105  // Otherwise, everything up until that first slash we found is the host name,
106  // which will end up being the UNC host. For example "file://foo/bar.txt"
107  // will get a server name of "foo" and a path of "/bar". Later, on Windows,
108  // this should be treated as the filename "\\foo\bar.txt" in proper UNC
109  // notation.
110  int host_len = next_slash - after_slashes;
111  if (host_len)
112    parsed->host = MakeRange(after_slashes, next_slash);
113  else
114    parsed->host.reset();
115  if (next_slash < spec_len) {
116    ParsePathInternal(spec, MakeRange(next_slash, spec_len),
117                      &parsed->path, &parsed->query, &parsed->ref);
118  } else {
119    parsed->path.reset();
120  }
121}
122
123// A subcomponent of DoParseFileURL, the input should be a local file, with the
124// beginning of the path indicated by the index in |path_begin|. This will
125// initialize the host, path, query, and ref, and leave the other output
126// components untouched (DoInitFileURL handles these for us).
127template<typename CHAR>
128void DoParseLocalFile(const CHAR* spec,
129                      int path_begin,
130                      int spec_len,
131                      Parsed* parsed) {
132  parsed->host.reset();
133  ParsePathInternal(spec, MakeRange(path_begin, spec_len),
134                    &parsed->path, &parsed->query, &parsed->ref);
135}
136
137// Backend for the external functions that operates on either char type.
138// We are handed the character after the "file:" at the beginning of the spec.
139// Usually this is a slash, but needn't be; we allow paths like "file:c:\foo".
140template<typename CHAR>
141void DoParseFileURL(const CHAR* spec, int spec_len, Parsed* parsed) {
142  DCHECK(spec_len >= 0);
143
144  // Get the parts we never use for file URLs out of the way.
145  parsed->username.reset();
146  parsed->password.reset();
147  parsed->port.reset();
148
149  // Many of the code paths don't set these, so it's convenient to just clear
150  // them. We'll write them in those cases we need them.
151  parsed->query.reset();
152  parsed->ref.reset();
153
154  // Strip leading & trailing spaces and control characters.
155  int begin = 0;
156  TrimURL(spec, &begin, &spec_len);
157
158  // Find the scheme.
159  int num_slashes;
160  int after_scheme;
161  int after_slashes;
162#ifdef WIN32
163  // See how many slashes there are. We want to handle cases like UNC but also
164  // "/c:/foo". This is when there is no scheme, so we can allow pages to do
165  // links like "c:/foo/bar" or "//foo/bar". This is also called by the
166  // relative URL resolver when it determines there is an absolute URL, which
167  // may give us input like "/c:/foo".
168  num_slashes = CountConsecutiveSlashes(spec, begin, spec_len);
169  after_slashes = begin + num_slashes;
170  if (DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len)) {
171    // Windows path, don't try to extract the scheme (for example, "c:\foo").
172    parsed->scheme.reset();
173    after_scheme = after_slashes;
174  } else if (DoesBeginUNCPath(spec, begin, spec_len, false)) {
175    // Windows UNC path: don't try to extract the scheme, but keep the slashes.
176    parsed->scheme.reset();
177    after_scheme = begin;
178  } else
179#endif
180  {
181    if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
182      // Offset the results since we gave ExtractScheme a substring.
183      parsed->scheme.begin += begin;
184      after_scheme = parsed->scheme.end() + 1;
185    } else {
186      // No scheme found, remember that.
187      parsed->scheme.reset();
188      after_scheme = begin;
189    }
190  }
191
192  // Handle empty specs ones that contain only whitespace or control chars,
193  // or that are just the scheme (for example "file:").
194  if (after_scheme == spec_len) {
195    parsed->host.reset();
196    parsed->path.reset();
197    return;
198  }
199
200  num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
201
202  after_slashes = after_scheme + num_slashes;
203#ifdef WIN32
204  // Check whether the input is a drive again. We checked above for windows
205  // drive specs, but that's only at the very beginning to see if we have a
206  // scheme at all. This test will be duplicated in that case, but will
207  // additionally handle all cases with a real scheme such as "file:///C:/".
208  if (!DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len) &&
209      num_slashes != 3) {
210    // Anything not beginning with a drive spec ("c:\") on Windows is treated
211    // as UNC, with the exception of three slashes which always means a file.
212    // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails.
213    DoParseUNC(spec, after_slashes, spec_len, parsed);
214    return;
215  }
216#else
217  // file: URL with exactly 2 slashes is considered to have a host component.
218  if (num_slashes == 2) {
219    DoParseUNC(spec, after_slashes, spec_len, parsed);
220    return;
221  }
222#endif  // WIN32
223
224  // Easy and common case, the full path immediately follows the scheme
225  // (modulo slashes), as in "file://c:/foo". Just treat everything from
226  // there to the end as the path. Empty hosts have 0 length instead of -1.
227  // We include the last slash as part of the path if there is one.
228  DoParseLocalFile(spec,
229      num_slashes > 0 ? after_scheme + num_slashes - 1 : after_scheme,
230      spec_len, parsed);
231}
232
233}  // namespace
234
235void ParseFileURL(const char* url, int url_len, Parsed* parsed) {
236  DoParseFileURL(url, url_len, parsed);
237}
238
239void ParseFileURL(const char16* url, int url_len, Parsed* parsed) {
240  DoParseFileURL(url, url_len, parsed);
241}
242
243}  // namespace url_parse
244