1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "url/url_util.h"
6
7#include <string.h>
8#include <vector>
9
10#include "base/debug/leak_annotations.h"
11#include "base/logging.h"
12#include "url/url_canon_internal.h"
13#include "url/url_file.h"
14#include "url/url_util_internal.h"
15
16namespace url {
17
18namespace {
19
// Locale-independent lower-casing for ASCII. The standard library's tolower
// consults the current locale, which is not wanted here. Only 'A'..'Z' are
// mapped; every other value is returned unchanged.
template<class Char>
inline Char ToLowerASCII(Char c) {
  if (c < 'A' || c > 'Z')
    return c;
  return static_cast<Char>(c + ('a' - 'A'));
}
26
// Backend for LowerCaseEqualsASCII. Returns true when the range
// [a_begin, a_end), after ASCII lower-casing each element, is exactly equal to
// the NUL-terminated string |b|. |b| itself is compared as-is, so it is
// expected to already be lower-case.
template<typename Iter>
inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) {
  Iter cursor = a_begin;
  while (cursor != a_end) {
    // Running out of |b| first, or hitting a differing character, is a
    // mismatch.
    if (*b == 0 || ToLowerASCII(*cursor) != *b)
      return false;
    ++cursor;
    ++b;
  }
  // The input range is exhausted; equal only if |b| is exhausted as well.
  return !*b;
}
36
37const int kNumStandardURLSchemes = 8;
38const char* kStandardURLSchemes[kNumStandardURLSchemes] = {
39  kHttpScheme,
40  kHttpsScheme,
41  kFileScheme,  // Yes, file urls can have a hostname!
42  kFtpScheme,
43  kGopherScheme,
44  kWsScheme,    // WebSocket.
45  kWssScheme,   // WebSocket secure.
46  kFileSystemScheme,
47};
48
// List of the currently installed standard schemes. This list is lazily
// initialized by InitStandardSchemes. It is held through a raw pointer, so
// nothing destroys it automatically at process exit (avoiding destructors
// that would slow us down or cause problems); it is only freed by an explicit
// call to Shutdown().
std::vector<const char*>* standard_schemes = NULL;

// Set to true by LockStandardSchemes(); AddStandardScheme() DCHECKs that it is
// still false. See the LockStandardSchemes declaration in the header.
bool standard_schemes_locked = false;
56
57// Ensures that the standard_schemes list is initialized, does nothing if it
58// already has values.
59void InitStandardSchemes() {
60  if (standard_schemes)
61    return;
62  standard_schemes = new std::vector<const char*>;
63  for (int i = 0; i < kNumStandardURLSchemes; i++)
64    standard_schemes->push_back(kStandardURLSchemes[i]);
65}
66
67// Given a string and a range inside the string, compares it to the given
68// lower-case |compare_to| buffer.
69template<typename CHAR>
70inline bool DoCompareSchemeComponent(const CHAR* spec,
71                                     const Component& component,
72                                     const char* compare_to) {
73  if (!component.is_nonempty())
74    return compare_to[0] == 0;  // When component is empty, match empty scheme.
75  return LowerCaseEqualsASCII(&spec[component.begin],
76                              &spec[component.end()],
77                              compare_to);
78}
79
80// Returns true if the given scheme identified by |scheme| within |spec| is one
81// of the registered "standard" schemes.
82template<typename CHAR>
83bool DoIsStandard(const CHAR* spec, const Component& scheme) {
84  if (!scheme.is_nonempty())
85    return false;  // Empty or invalid schemes are non-standard.
86
87  InitStandardSchemes();
88  for (size_t i = 0; i < standard_schemes->size(); i++) {
89    if (LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()],
90                             standard_schemes->at(i)))
91      return true;
92  }
93  return false;
94}
95
96template<typename CHAR>
97bool DoFindAndCompareScheme(const CHAR* str,
98                            int str_len,
99                            const char* compare,
100                            Component* found_scheme) {
101  // Before extracting scheme, canonicalize the URL to remove any whitespace.
102  // This matches the canonicalization done in DoCanonicalize function.
103  RawCanonOutputT<CHAR> whitespace_buffer;
104  int spec_len;
105  const CHAR* spec = RemoveURLWhitespace(str, str_len,
106                                         &whitespace_buffer, &spec_len);
107
108  Component our_scheme;
109  if (!ExtractScheme(spec, spec_len, &our_scheme)) {
110    // No scheme.
111    if (found_scheme)
112      *found_scheme = Component();
113    return false;
114  }
115  if (found_scheme)
116    *found_scheme = our_scheme;
117  return DoCompareSchemeComponent(spec, our_scheme, compare);
118}
119
120template<typename CHAR>
121bool DoCanonicalize(const CHAR* in_spec,
122                    int in_spec_len,
123                    bool trim_path_end,
124                    CharsetConverter* charset_converter,
125                    CanonOutput* output,
126                    Parsed* output_parsed) {
127  // Remove any whitespace from the middle of the relative URL, possibly
128  // copying to the new buffer.
129  RawCanonOutputT<CHAR> whitespace_buffer;
130  int spec_len;
131  const CHAR* spec = RemoveURLWhitespace(in_spec, in_spec_len,
132                                         &whitespace_buffer, &spec_len);
133
134  Parsed parsed_input;
135#ifdef WIN32
136  // For Windows, we allow things that look like absolute Windows paths to be
137  // fixed up magically to file URLs. This is done for IE compatability. For
138  // example, this will change "c:/foo" into a file URL rather than treating
139  // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt").
140  // There is similar logic in url_canon_relative.cc for
141  //
142  // For Max & Unix, we don't do this (the equivalent would be "/foo/bar" which
143  // has no meaning as an absolute path name. This is because browsers on Mac
144  // & Unix don't generally do this, so there is no compatibility reason for
145  // doing so.
146  if (DoesBeginUNCPath(spec, 0, spec_len, false) ||
147      DoesBeginWindowsDriveSpec(spec, 0, spec_len)) {
148    ParseFileURL(spec, spec_len, &parsed_input);
149    return CanonicalizeFileURL(spec, spec_len, parsed_input, charset_converter,
150                               output, output_parsed);
151  }
152#endif
153
154  Component scheme;
155  if (!ExtractScheme(spec, spec_len, &scheme))
156    return false;
157
158  // This is the parsed version of the input URL, we have to canonicalize it
159  // before storing it in our object.
160  bool success;
161  if (DoCompareSchemeComponent(spec, scheme, url::kFileScheme)) {
162    // File URLs are special.
163    ParseFileURL(spec, spec_len, &parsed_input);
164    success = CanonicalizeFileURL(spec, spec_len, parsed_input,
165                                  charset_converter, output, output_parsed);
166  } else if (DoCompareSchemeComponent(spec, scheme, url::kFileSystemScheme)) {
167    // Filesystem URLs are special.
168    ParseFileSystemURL(spec, spec_len, &parsed_input);
169    success = CanonicalizeFileSystemURL(spec, spec_len, parsed_input,
170                                        charset_converter, output,
171                                        output_parsed);
172
173  } else if (DoIsStandard(spec, scheme)) {
174    // All "normal" URLs.
175    ParseStandardURL(spec, spec_len, &parsed_input);
176    success = CanonicalizeStandardURL(spec, spec_len, parsed_input,
177                                      charset_converter, output, output_parsed);
178
179  } else if (DoCompareSchemeComponent(spec, scheme, url::kMailToScheme)) {
180    // Mailto are treated like a standard url with only a scheme, path, query
181    ParseMailtoURL(spec, spec_len, &parsed_input);
182    success = CanonicalizeMailtoURL(spec, spec_len, parsed_input, output,
183                                    output_parsed);
184
185  } else {
186    // "Weird" URLs like data: and javascript:
187    ParsePathURL(spec, spec_len, trim_path_end, &parsed_input);
188    success = CanonicalizePathURL(spec, spec_len, parsed_input, output,
189                                  output_parsed);
190  }
191  return success;
192}
193
194template<typename CHAR>
195bool DoResolveRelative(const char* base_spec,
196                       int base_spec_len,
197                       const Parsed& base_parsed,
198                       const CHAR* in_relative,
199                       int in_relative_length,
200                       CharsetConverter* charset_converter,
201                       CanonOutput* output,
202                       Parsed* output_parsed) {
203  // Remove any whitespace from the middle of the relative URL, possibly
204  // copying to the new buffer.
205  RawCanonOutputT<CHAR> whitespace_buffer;
206  int relative_length;
207  const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length,
208                                             &whitespace_buffer,
209                                             &relative_length);
210  bool base_is_authority_based = false;
211  bool base_is_hierarchical = false;
212  if (base_spec &&
213      base_parsed.scheme.is_nonempty()) {
214    int after_scheme = base_parsed.scheme.end() + 1;  // Skip past the colon.
215    int num_slashes = CountConsecutiveSlashes(base_spec, after_scheme,
216                                              base_spec_len);
217    base_is_authority_based = num_slashes > 1;
218    base_is_hierarchical = num_slashes > 0;
219  }
220
221  bool standard_base_scheme =
222      base_parsed.scheme.is_nonempty() &&
223      DoIsStandard(base_spec, base_parsed.scheme);
224
225  bool is_relative;
226  Component relative_component;
227  if (!IsRelativeURL(base_spec, base_parsed, relative, relative_length,
228                     (base_is_hierarchical || standard_base_scheme),
229                     &is_relative, &relative_component)) {
230    // Error resolving.
231    return false;
232  }
233
234  // Pretend for a moment that |base_spec| is a standard URL. Normally
235  // non-standard URLs are treated as PathURLs, but if the base has an
236  // authority we would like to preserve it.
237  if (is_relative && base_is_authority_based && !standard_base_scheme) {
238    Parsed base_parsed_authority;
239    ParseStandardURL(base_spec, base_spec_len, &base_parsed_authority);
240    if (base_parsed_authority.host.is_nonempty()) {
241      bool did_resolve_succeed =
242          ResolveRelativeURL(base_spec, base_parsed_authority, false, relative,
243                             relative_component, charset_converter, output,
244                             output_parsed);
245      // The output_parsed is incorrect at this point (because it was built
246      // based on base_parsed_authority instead of base_parsed) and needs to be
247      // re-created.
248      ParsePathURL(output->data(), output->length(), true,
249                   output_parsed);
250      return did_resolve_succeed;
251    }
252  } else if (is_relative) {
253    // Relative, resolve and canonicalize.
254    bool file_base_scheme = base_parsed.scheme.is_nonempty() &&
255        DoCompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme);
256    return ResolveRelativeURL(base_spec, base_parsed, file_base_scheme, relative,
257                              relative_component, charset_converter, output,
258                              output_parsed);
259  }
260
261  // Not relative, canonicalize the input.
262  return DoCanonicalize(relative, relative_length, true, charset_converter,
263                        output, output_parsed);
264}
265
266template<typename CHAR>
267bool DoReplaceComponents(const char* spec,
268                         int spec_len,
269                         const Parsed& parsed,
270                         const Replacements<CHAR>& replacements,
271                         CharsetConverter* charset_converter,
272                         CanonOutput* output,
273                         Parsed* out_parsed) {
274  // If the scheme is overridden, just do a simple string substitution and
275  // reparse the whole thing. There are lots of edge cases that we really don't
276  // want to deal with. Like what happens if I replace "http://e:8080/foo"
277  // with a file. Does it become "file:///E:/8080/foo" where the port number
278  // becomes part of the path? Parsing that string as a file URL says "yes"
279  // but almost no sane rule for dealing with the components individually would
280  // come up with that.
281  //
282  // Why allow these crazy cases at all? Programatically, there is almost no
283  // case for replacing the scheme. The most common case for hitting this is
284  // in JS when building up a URL using the location object. In this case, the
285  // JS code expects the string substitution behavior:
286  //   http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3
287  if (replacements.IsSchemeOverridden()) {
288    // Canonicalize the new scheme so it is 8-bit and can be concatenated with
289    // the existing spec.
290    RawCanonOutput<128> scheme_replaced;
291    Component scheme_replaced_parsed;
292    CanonicalizeScheme(replacements.sources().scheme,
293                       replacements.components().scheme,
294                       &scheme_replaced, &scheme_replaced_parsed);
295
296    // We can assume that the input is canonicalized, which means it always has
297    // a colon after the scheme (or where the scheme would be).
298    int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1
299                                                    : 1;
300    if (spec_len - spec_after_colon > 0) {
301      scheme_replaced.Append(&spec[spec_after_colon],
302                             spec_len - spec_after_colon);
303    }
304
305    // We now need to completely re-parse the resulting string since its meaning
306    // may have changed with the different scheme.
307    RawCanonOutput<128> recanonicalized;
308    Parsed recanonicalized_parsed;
309    DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(), true,
310                   charset_converter,
311                   &recanonicalized, &recanonicalized_parsed);
312
313    // Recurse using the version with the scheme already replaced. This will now
314    // use the replacement rules for the new scheme.
315    //
316    // Warning: this code assumes that ReplaceComponents will re-check all
317    // components for validity. This is because we can't fail if DoCanonicalize
318    // failed above since theoretically the thing making it fail could be
319    // getting replaced here. If ReplaceComponents didn't re-check everything,
320    // we wouldn't know if something *not* getting replaced is a problem.
321    // If the scheme-specific replacers are made more intelligent so they don't
322    // re-check everything, we should instead recanonicalize the whole thing
323    // after this call to check validity (this assumes replacing the scheme is
324    // much much less common than other types of replacements, like clearing the
325    // ref).
326    Replacements<CHAR> replacements_no_scheme = replacements;
327    replacements_no_scheme.SetScheme(NULL, Component());
328    return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(),
329                               recanonicalized_parsed, replacements_no_scheme,
330                               charset_converter, output, out_parsed);
331  }
332
333  // If we get here, then we know the scheme doesn't need to be replaced, so can
334  // just key off the scheme in the spec to know how to do the replacements.
335  if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileScheme)) {
336    return ReplaceFileURL(spec, parsed, replacements, charset_converter, output,
337                          out_parsed);
338  }
339  if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileSystemScheme)) {
340    return ReplaceFileSystemURL(spec, parsed, replacements, charset_converter,
341                                output, out_parsed);
342  }
343  if (DoIsStandard(spec, parsed.scheme)) {
344    return ReplaceStandardURL(spec, parsed, replacements, charset_converter,
345                              output, out_parsed);
346  }
347  if (DoCompareSchemeComponent(spec, parsed.scheme, url::kMailToScheme)) {
348    return ReplaceMailtoURL(spec, parsed, replacements, output, out_parsed);
349  }
350
351  // Default is a path URL.
352  return ReplacePathURL(spec, parsed, replacements, output, out_parsed);
353}
354
355}  // namespace
356
// Eagerly creates the list of standard schemes. The list is otherwise created
// lazily, since the functions in this file that read or extend it call
// InitStandardSchemes() themselves.
void Initialize() {
  InitStandardSchemes();
}
360
361void Shutdown() {
362  if (standard_schemes) {
363    delete standard_schemes;
364    standard_schemes = NULL;
365  }
366}
367
368void AddStandardScheme(const char* new_scheme) {
369  // If this assert triggers, it means you've called AddStandardScheme after
370  // LockStandardSchemes have been called (see the header file for
371  // LockStandardSchemes for more).
372  //
373  // This normally means you're trying to set up a new standard scheme too late
374  // in your application's init process. Locate where your app does this
375  // initialization and calls LockStandardScheme, and add your new standard
376  // scheme there.
377  DCHECK(!standard_schemes_locked) <<
378      "Trying to add a standard scheme after the list has been locked.";
379
380  size_t scheme_len = strlen(new_scheme);
381  if (scheme_len == 0)
382    return;
383
384  // Dulicate the scheme into a new buffer and add it to the list of standard
385  // schemes. This pointer will be leaked on shutdown.
386  char* dup_scheme = new char[scheme_len + 1];
387  ANNOTATE_LEAKING_OBJECT_PTR(dup_scheme);
388  memcpy(dup_scheme, new_scheme, scheme_len + 1);
389
390  InitStandardSchemes();
391  standard_schemes->push_back(dup_scheme);
392}
393
// Marks the list of standard schemes as locked. After this call the DCHECK in
// AddStandardScheme() fires if another scheme is added. Nothing in this file
// clears the flag again.
void LockStandardSchemes() {
  standard_schemes_locked = true;
}
397
// 8-bit front-end for DoIsStandard: returns true if the scheme identified by
// |scheme| within |spec| is one of the registered standard schemes.
bool IsStandard(const char* spec, const Component& scheme) {
  return DoIsStandard(spec, scheme);
}
401
// 16-bit front-end for DoIsStandard: returns true if the scheme identified by
// |scheme| within |spec| is one of the registered standard schemes.
bool IsStandard(const base::char16* spec, const Component& scheme) {
  return DoIsStandard(spec, scheme);
}
405
// 8-bit front-end for DoFindAndCompareScheme: extracts the scheme of |str|
// and compares it to the lower-case |compare|. |found_scheme| may be NULL;
// otherwise it receives the extracted scheme component.
bool FindAndCompareScheme(const char* str,
                          int str_len,
                          const char* compare,
                          Component* found_scheme) {
  return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
}
412
// 16-bit front-end for DoFindAndCompareScheme: extracts the scheme of |str|
// and compares it to the lower-case |compare|. |found_scheme| may be NULL;
// otherwise it receives the extracted scheme component.
bool FindAndCompareScheme(const base::char16* str,
                          int str_len,
                          const char* compare,
                          Component* found_scheme) {
  return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
}
419
// 8-bit front-end for DoCanonicalize: canonicalizes |spec| into |output| and
// |output_parsed|, returning the success value reported by DoCanonicalize.
bool Canonicalize(const char* spec,
                  int spec_len,
                  bool trim_path_end,
                  CharsetConverter* charset_converter,
                  CanonOutput* output,
                  Parsed* output_parsed) {
  return DoCanonicalize(spec, spec_len, trim_path_end, charset_converter,
                        output, output_parsed);
}
429
// 16-bit front-end for DoCanonicalize: canonicalizes |spec| into |output| and
// |output_parsed|, returning the success value reported by DoCanonicalize.
bool Canonicalize(const base::char16* spec,
                  int spec_len,
                  bool trim_path_end,
                  CharsetConverter* charset_converter,
                  CanonOutput* output,
                  Parsed* output_parsed) {
  return DoCanonicalize(spec, spec_len, trim_path_end, charset_converter,
                        output, output_parsed);
}
439
// Front-end for DoResolveRelative taking an 8-bit |relative| string: resolves
// |relative| against the base URL given by |base_spec| and |base_parsed|,
// writing the result to |output| and |output_parsed|.
bool ResolveRelative(const char* base_spec,
                     int base_spec_len,
                     const Parsed& base_parsed,
                     const char* relative,
                     int relative_length,
                     CharsetConverter* charset_converter,
                     CanonOutput* output,
                     Parsed* output_parsed) {
  return DoResolveRelative(base_spec, base_spec_len, base_parsed,
                           relative, relative_length,
                           charset_converter, output, output_parsed);
}
452
// Front-end for DoResolveRelative taking a 16-bit |relative| string (the base
// spec is always 8-bit): resolves |relative| against the base URL given by
// |base_spec| and |base_parsed|, writing the result to |output| and
// |output_parsed|.
bool ResolveRelative(const char* base_spec,
                     int base_spec_len,
                     const Parsed& base_parsed,
                     const base::char16* relative,
                     int relative_length,
                     CharsetConverter* charset_converter,
                     CanonOutput* output,
                     Parsed* output_parsed) {
  return DoResolveRelative(base_spec, base_spec_len, base_parsed,
                           relative, relative_length,
                           charset_converter, output, output_parsed);
}
465
// Front-end for DoReplaceComponents with 8-bit replacement strings: applies
// |replacements| to the URL |spec| / |parsed| and writes the result to
// |output| and |out_parsed|.
bool ReplaceComponents(const char* spec,
                       int spec_len,
                       const Parsed& parsed,
                       const Replacements<char>& replacements,
                       CharsetConverter* charset_converter,
                       CanonOutput* output,
                       Parsed* out_parsed) {
  return DoReplaceComponents(spec, spec_len, parsed, replacements,
                             charset_converter, output, out_parsed);
}
476
// Front-end for DoReplaceComponents with 16-bit replacement strings (the spec
// being modified is always 8-bit): applies |replacements| to the URL |spec| /
// |parsed| and writes the result to |output| and |out_parsed|.
bool ReplaceComponents(const char* spec,
                       int spec_len,
                       const Parsed& parsed,
                       const Replacements<base::char16>& replacements,
                       CharsetConverter* charset_converter,
                       CanonOutput* output,
                       Parsed* out_parsed) {
  return DoReplaceComponents(spec, spec_len, parsed, replacements,
                             charset_converter, output, out_parsed);
}
487
// Front-ends for LowerCaseEqualsASCII.
//
// 8-bit range versus NUL-terminated string: returns true if [a_begin, a_end),
// with ASCII upper-case letters lower-cased, equals |b| exactly. |b| is
// compared as-is, so it is expected to already be lower-case.
bool LowerCaseEqualsASCII(const char* a_begin,
                          const char* a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}
494
495bool LowerCaseEqualsASCII(const char* a_begin,
496                          const char* a_end,
497                          const char* b_begin,
498                          const char* b_end) {
499  while (a_begin != a_end && b_begin != b_end &&
500         ToLowerASCII(*a_begin) == *b_begin) {
501    a_begin++;
502    b_begin++;
503  }
504  return a_begin == a_end && b_begin == b_end;
505}
506
// 16-bit range versus NUL-terminated 8-bit string: returns true if
// [a_begin, a_end), with ASCII upper-case letters lower-cased, equals |b|
// exactly. |b| is compared as-is, so it is expected to already be lower-case.
bool LowerCaseEqualsASCII(const base::char16* a_begin,
                          const base::char16* a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}
512
513void DecodeURLEscapeSequences(const char* input,
514                              int length,
515                              CanonOutputW* output) {
516  RawCanonOutputT<char> unescaped_chars;
517  for (int i = 0; i < length; i++) {
518    if (input[i] == '%') {
519      unsigned char ch;
520      if (DecodeEscaped(input, &i, length, &ch)) {
521        unescaped_chars.push_back(ch);
522      } else {
523        // Invalid escape sequence, copy the percent literal.
524        unescaped_chars.push_back('%');
525      }
526    } else {
527      // Regular non-escaped 8-bit character.
528      unescaped_chars.push_back(input[i]);
529    }
530  }
531
532  // Convert that 8-bit to UTF-16. It's not clear IE does this at all to
533  // JavaScript URLs, but Firefox and Safari do.
534  for (int i = 0; i < unescaped_chars.length(); i++) {
535    unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i));
536    if (uch < 0x80) {
537      // Non-UTF-8, just append directly
538      output->push_back(uch);
539    } else {
540      // next_ch will point to the last character of the decoded
541      // character.
542      int next_character = i;
543      unsigned code_point;
544      if (ReadUTFChar(unescaped_chars.data(), &next_character,
545                      unescaped_chars.length(), &code_point)) {
546        // Valid UTF-8 character, convert to UTF-16.
547        AppendUTF16Value(code_point, output);
548        i = next_character;
549      } else {
550        // If there are any sequences that are not valid UTF-8, we keep
551        // invalid code points and promote to UTF-16. We copy all characters
552        // from the current position to the end of the identified sequence.
553        while (i < next_character) {
554          output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
555          i++;
556        }
557        output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
558      }
559    }
560  }
561}
562
563void EncodeURIComponent(const char* input, int length, CanonOutput* output) {
564  for (int i = 0; i < length; ++i) {
565    unsigned char c = static_cast<unsigned char>(input[i]);
566    if (IsComponentChar(c))
567      output->push_back(c);
568    else
569      AppendEscapedChar(c, output);
570  }
571}
572
// 8-bit front-end for DoCompareSchemeComponent: compares the part of |spec|
// described by |component| to the lower-case |compare_to|, ignoring ASCII
// case in |spec|. A component that is not nonempty matches only "".
bool CompareSchemeComponent(const char* spec,
                            const Component& component,
                            const char* compare_to) {
  return DoCompareSchemeComponent(spec, component, compare_to);
}
578
// 16-bit front-end for DoCompareSchemeComponent: compares the part of |spec|
// described by |component| to the lower-case |compare_to|, ignoring ASCII
// case in |spec|. A component that is not nonempty matches only "".
bool CompareSchemeComponent(const base::char16* spec,
                            const Component& component,
                            const char* compare_to) {
  return DoCompareSchemeComponent(spec, component, compare_to);
}
584
585}  // namespace url
586