1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "net/base/net_util.h"
6
7#include <map>
8#include <vector>
9
10#include "base/i18n/time_formatting.h"
11#include "base/json/string_escape.h"
12#include "base/lazy_instance.h"
13#include "base/logging.h"
14#include "base/memory/singleton.h"
15#include "base/stl_util.h"
16#include "base/strings/string_tokenizer.h"
17#include "base/strings/string_util.h"
18#include "base/strings/utf_offset_string_conversions.h"
19#include "base/strings/utf_string_conversions.h"
20#include "base/time/time.h"
21#include "url/gurl.h"
22#include "third_party/icu/source/common/unicode/uidna.h"
23#include "third_party/icu/source/common/unicode/uniset.h"
24#include "third_party/icu/source/common/unicode/uscript.h"
25#include "third_party/icu/source/common/unicode/uset.h"
26#include "third_party/icu/source/i18n/unicode/datefmt.h"
27#include "third_party/icu/source/i18n/unicode/regex.h"
28#include "third_party/icu/source/i18n/unicode/ulocdata.h"
29
30using base::Time;
31
32namespace net {
33
34namespace {
35
// Shorthand for a list of string offsets. FormatUrl() uses it to hand its
// single optional offset to FormatUrlWithOffsets().
typedef std::vector<size_t> Offsets;
37
38// Does some simple normalization of scripts so we can allow certain scripts
39// to exist together.
40// TODO(brettw) bug 880223: we should allow some other languages to be
41// oombined such as Chinese and Latin. We will probably need a more
42// complicated system of language pairs to have more fine-grained control.
43UScriptCode NormalizeScript(UScriptCode code) {
44  switch (code) {
45    case USCRIPT_KATAKANA:
46    case USCRIPT_HIRAGANA:
47    case USCRIPT_KATAKANA_OR_HIRAGANA:
48    case USCRIPT_HANGUL:  // This one is arguable.
49      return USCRIPT_HAN;
50    default:
51      return code;
52  }
53}
54
55bool IsIDNComponentInSingleScript(const base::char16* str, int str_len) {
56  UScriptCode first_script = USCRIPT_INVALID_CODE;
57  bool is_first = true;
58
59  int i = 0;
60  while (i < str_len) {
61    unsigned code_point;
62    U16_NEXT(str, i, str_len, code_point);
63
64    UErrorCode err = U_ZERO_ERROR;
65    UScriptCode cur_script = uscript_getScript(code_point, &err);
66    if (err != U_ZERO_ERROR)
67      return false;  // Report mixed on error.
68    cur_script = NormalizeScript(cur_script);
69
70    // TODO(brettw) We may have to check for USCRIPT_INHERENT as well.
71    if (is_first && cur_script != USCRIPT_COMMON) {
72      first_script = cur_script;
73      is_first = false;
74    } else {
75      if (cur_script != USCRIPT_COMMON && cur_script != first_script)
76        return false;
77    }
78  }
79  return true;
80}
81
// Returns true if |lang| names a language whose script can be 'safely' mixed
// with Latin letters in the ASCII range. Only the first two characters of
// |lang| are examined, and the comparison is case-sensitive.
bool IsCompatibleWithASCIILetters(const std::string& lang) {
  // For now, just list Chinese, Japanese and Korean (positive list).
  // An alternative is negative-listing (languages using Greek and
  // Cyrillic letters), but it can be more dangerous.
  static const char* const kCompatiblePrefixes[] = {"zh", "ja", "ko"};
  const size_t kPrefixCount =
      sizeof(kCompatiblePrefixes) / sizeof(kCompatiblePrefixes[0]);
  for (size_t i = 0; i < kPrefixCount; ++i) {
    // Compares (at most) the first two characters of |lang| with the prefix.
    if (lang.compare(0, 2, kCompatiblePrefixes[i]) == 0)
      return true;
  }
  return false;
}
92
// Maps a language code to its cached exemplar character set. The pointed-to
// sets are deleted by ~LangToExemplarSet().
typedef std::map<std::string, icu::UnicodeSet*> LangToExemplarSetMap;
94
95class LangToExemplarSet {
96 public:
97  static LangToExemplarSet* GetInstance() {
98    return Singleton<LangToExemplarSet>::get();
99  }
100
101 private:
102  LangToExemplarSetMap map;
103  LangToExemplarSet() { }
104  ~LangToExemplarSet() {
105    STLDeleteContainerPairSecondPointers(map.begin(), map.end());
106  }
107
108  friend class Singleton<LangToExemplarSet>;
109  friend struct DefaultSingletonTraits<LangToExemplarSet>;
110  friend bool GetExemplarSetForLang(const std::string&, icu::UnicodeSet**);
111  friend void SetExemplarSetForLang(const std::string&, icu::UnicodeSet*);
112
113  DISALLOW_COPY_AND_ASSIGN(LangToExemplarSet);
114};
115
116bool GetExemplarSetForLang(const std::string& lang,
117                           icu::UnicodeSet** lang_set) {
118  const LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map;
119  LangToExemplarSetMap::const_iterator pos = map.find(lang);
120  if (pos != map.end()) {
121    *lang_set = pos->second;
122    return true;
123  }
124  return false;
125}
126
127void SetExemplarSetForLang(const std::string& lang,
128                           icu::UnicodeSet* lang_set) {
129  LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map;
130  map.insert(std::make_pair(lang, lang_set));
131}
132
// Held by IsComponentCoveredByLang() while it looks up or populates the
// exemplar set cache.
static base::LazyInstance<base::Lock>::Leaky
    g_lang_set_lock = LAZY_INSTANCE_INITIALIZER;
135
136// Returns true if all the characters in component_characters are used by
137// the language |lang|.
138bool IsComponentCoveredByLang(const icu::UnicodeSet& component_characters,
139                              const std::string& lang) {
140  CR_DEFINE_STATIC_LOCAL(
141      const icu::UnicodeSet, kASCIILetters, ('a', 'z'));
142  icu::UnicodeSet* lang_set = NULL;
143  // We're called from both the UI thread and the history thread.
144  {
145    base::AutoLock lock(g_lang_set_lock.Get());
146    if (!GetExemplarSetForLang(lang, &lang_set)) {
147      UErrorCode status = U_ZERO_ERROR;
148      ULocaleData* uld = ulocdata_open(lang.c_str(), &status);
149      // TODO(jungshik) Turn this check on when the ICU data file is
150      // rebuilt with the minimal subset of locale data for languages
151      // to which Chrome is not localized but which we offer in the list
152      // of languages selectable for Accept-Languages. With the rebuilt ICU
153      // data, ulocdata_open never should fall back to the default locale.
154      // (issue 2078)
155      // DCHECK(U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING);
156      if (U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING) {
157        lang_set = reinterpret_cast<icu::UnicodeSet *>(
158            ulocdata_getExemplarSet(uld, NULL, 0,
159                                    ULOCDATA_ES_STANDARD, &status));
160        // If |lang| is compatible with ASCII Latin letters, add them.
161        if (IsCompatibleWithASCIILetters(lang))
162          lang_set->addAll(kASCIILetters);
163      } else {
164        lang_set = new icu::UnicodeSet(1, 0);
165      }
166      lang_set->freeze();
167      SetExemplarSetForLang(lang, lang_set);
168      ulocdata_close(uld);
169    }
170  }
171  return !lang_set->isEmpty() && lang_set->containsAll(component_characters);
172}
173
174// Returns true if the given Unicode host component is safe to display to the
175// user.
176bool IsIDNComponentSafe(const base::char16* str,
177                        int str_len,
178                        const std::string& languages) {
179  // Most common cases (non-IDN) do not reach here so that we don't
180  // need a fast return path.
181  // TODO(jungshik) : Check if there's any character inappropriate
182  // (although allowed) for domain names.
183  // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and
184  // http://www.unicode.org/reports/tr39/data/xidmodifications.txt
185  // For now, we borrow the list from Mozilla and tweaked it slightly.
186  // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because
187  //  they're gonna be canonicalized to U+0020 and full stop before
188  //  reaching here.)
189  // The original list is available at
190  // http://kb.mozillazine.org/Network.IDN.blacklist_chars and
191  // at http://mxr.mozilla.org/seamonkey/source/modules/libpref/src/init/all.js#703
192
193  UErrorCode status = U_ZERO_ERROR;
194#ifdef U_WCHAR_IS_UTF16
195  icu::UnicodeSet dangerous_characters(icu::UnicodeString(
196      L"[[\\ \u00ad\u00bc\u00bd\u01c3\u0337\u0338"
197      L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]"
198      L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]"
199      L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae"
200      L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014"
201      L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14"
202      L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]"
203      L"[\ufffa-\ufffd]]"), status);
204  DCHECK(U_SUCCESS(status));
205  icu::RegexMatcher dangerous_patterns(icu::UnicodeString(
206      // Lone katakana no, so, or n
207      L"[^\\p{Katakana}][\u30ce\u30f3\u30bd][^\\p{Katakana}]"
208      // Repeating Japanese accent characters
209      L"|[\u3099\u309a\u309b\u309c][\u3099\u309a\u309b\u309c]"),
210      0, status);
211#else
212  icu::UnicodeSet dangerous_characters(icu::UnicodeString(
213      "[[\\u0020\\u00ad\\u00bc\\u00bd\\u01c3\\u0337\\u0338"
214      "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]"
215      "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]"
216      "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae"
217      "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014"
218      "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14"
219      "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]"
220      "[\\ufffa-\\ufffd]]", -1, US_INV), status);
221  DCHECK(U_SUCCESS(status));
222  icu::RegexMatcher dangerous_patterns(icu::UnicodeString(
223      // Lone katakana no, so, or n
224      "[^\\p{Katakana}][\\u30ce\\u30f3\u30bd][^\\p{Katakana}]"
225      // Repeating Japanese accent characters
226      "|[\\u3099\\u309a\\u309b\\u309c][\\u3099\\u309a\\u309b\\u309c]"),
227      0, status);
228#endif
229  DCHECK(U_SUCCESS(status));
230  icu::UnicodeSet component_characters;
231  icu::UnicodeString component_string(str, str_len);
232  component_characters.addAll(component_string);
233  if (dangerous_characters.containsSome(component_characters))
234    return false;
235
236  DCHECK(U_SUCCESS(status));
237  dangerous_patterns.reset(component_string);
238  if (dangerous_patterns.find())
239    return false;
240
241  // If the language list is empty, the result is completely determined
242  // by whether a component is a single script or not. This will block
243  // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are
244  // allowed with |languages| (while it blocks Chinese + Latin letters with
245  // an accent as should be the case), but we want to err on the safe side
246  // when |languages| is empty.
247  if (languages.empty())
248    return IsIDNComponentInSingleScript(str, str_len);
249
250  // |common_characters| is made up of  ASCII numbers, hyphen, plus and
251  // underscore that are used across scripts and allowed in domain names.
252  // (sync'd with characters allowed in url_canon_host with square
253  // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc.
254  icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"),
255                                    status);
256  DCHECK(U_SUCCESS(status));
257  // Subtract common characters because they're always allowed so that
258  // we just have to check if a language-specific set contains
259  // the remainder.
260  component_characters.removeAll(common_characters);
261
262  base::StringTokenizer t(languages, ",");
263  while (t.GetNext()) {
264    if (IsComponentCoveredByLang(component_characters, t.token()))
265      return true;
266  }
267  return false;
268}
269
270// A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to
271// a UTS46/IDNA 2008 handling object opened with uidna_openUTS46().
272//
273// We use UTS46 with BiDiCheck to migrate from IDNA 2003 to IDNA 2008 with
274// the backward compatibility in mind. What it does:
275//
276// 1. Use the up-to-date Unicode data.
277// 2. Define a case folding/mapping with the up-to-date Unicode data as
278//    in IDNA 2003.
279// 3. Use transitional mechanism for 4 deviation characters (sharp-s,
280//    final sigma, ZWJ and ZWNJ) for now.
281// 4. Continue to allow symbols and punctuations.
282// 5. Apply new BiDi check rules more permissive than the IDNA 2003 BiDI rules.
283// 6. Do not apply STD3 rules
284// 7. Do not allow unassigned code points.
285//
286// It also closely matches what IE 10 does except for the BiDi check (
287// http://goo.gl/3XBhqw ).
288// See http://http://unicode.org/reports/tr46/ and references therein
289// for more details.
290struct UIDNAWrapper {
291  UIDNAWrapper() {
292    UErrorCode err = U_ZERO_ERROR;
293    // TODO(jungshik): Change options as different parties (browsers,
294    // registrars, search engines) converge toward a consensus.
295    value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err);
296    if (U_FAILURE(err))
297      value = NULL;
298  }
299
300  UIDNA* value;
301};
302
// The UTS46 handle used by IDNToUnicodeOneComponent(). Its |value| is NULL if
// uidna_openUTS46() reported a failure.
static base::LazyInstance<UIDNAWrapper>::Leaky
    g_uidna = LAZY_INSTANCE_INITIALIZER;
305
306// Converts one component of a host (between dots) to IDN if safe. The result
307// will be APPENDED to the given output string and will be the same as the input
308// if it is not IDN or the IDN is unsafe to display.  Returns whether any
309// conversion was performed.
310bool IDNToUnicodeOneComponent(const base::char16* comp,
311                              size_t comp_len,
312                              const std::string& languages,
313                              base::string16* out) {
314  DCHECK(out);
315  if (comp_len == 0)
316    return false;
317
318  // Only transform if the input can be an IDN component.
319  static const base::char16 kIdnPrefix[] = {'x', 'n', '-', '-'};
320  if ((comp_len > arraysize(kIdnPrefix)) &&
321      !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(base::char16))) {
322    UIDNA* uidna = g_uidna.Get().value;
323    DCHECK(uidna != NULL);
324    size_t original_length = out->length();
325    int output_length = 64;
326    UIDNAInfo info = UIDNA_INFO_INITIALIZER;
327    UErrorCode status;
328    do {
329      out->resize(original_length + output_length);
330      status = U_ZERO_ERROR;
331      // This returns the actual length required. If this is more than 64
332      // code units, |status| will be U_BUFFER_OVERFLOW_ERROR and we'll try
333      // the conversion again, but with a sufficiently large buffer.
334      output_length = uidna_labelToUnicode(
335          uidna, comp, static_cast<int32_t>(comp_len), &(*out)[original_length],
336          output_length, &info, &status);
337    } while ((status == U_BUFFER_OVERFLOW_ERROR && info.errors == 0));
338
339    if (U_SUCCESS(status) && info.errors == 0) {
340      // Converted successfully. Ensure that the converted component
341      // can be safely displayed to the user.
342      out->resize(original_length + output_length);
343      if (IsIDNComponentSafe(out->data() + original_length, output_length,
344                             languages))
345        return true;
346    }
347
348    // Something went wrong. Revert to original string.
349    out->resize(original_length);
350  }
351
352  // We get here with no IDN or on error, in which case we just append the
353  // literal input.
354  out->append(comp, comp_len);
355  return false;
356}
357
358// TODO(brettw) bug 734373: check the scripts for each host component and
359// don't un-IDN-ize if there is more than one. Alternatively, only IDN for
360// scripts that the user has installed. For now, just put the entire
361// path through IDN. Maybe this feature can be implemented in ICU itself?
362//
363// We may want to skip this step in the case of file URLs to allow unicode
364// UNC hostnames regardless of encodings.
365base::string16 IDNToUnicodeWithAdjustments(
366    const std::string& host,
367    const std::string& languages,
368    base::OffsetAdjuster::Adjustments* adjustments) {
369  if (adjustments)
370    adjustments->clear();
371  // Convert the ASCII input to a base::string16 for ICU.
372  base::string16 input16;
373  input16.reserve(host.length());
374  input16.insert(input16.end(), host.begin(), host.end());
375
376  // Do each component of the host separately, since we enforce script matching
377  // on a per-component basis.
378  base::string16 out16;
379  {
380    for (size_t component_start = 0, component_end;
381         component_start < input16.length();
382         component_start = component_end + 1) {
383      // Find the end of the component.
384      component_end = input16.find('.', component_start);
385      if (component_end == base::string16::npos)
386        component_end = input16.length();  // For getting the last component.
387      size_t component_length = component_end - component_start;
388      size_t new_component_start = out16.length();
389      bool converted_idn = false;
390      if (component_end > component_start) {
391        // Add the substring that we just found.
392        converted_idn = IDNToUnicodeOneComponent(
393            input16.data() + component_start, component_length, languages,
394            &out16);
395      }
396      size_t new_component_length = out16.length() - new_component_start;
397
398      if (converted_idn && adjustments) {
399        adjustments->push_back(base::OffsetAdjuster::Adjustment(
400            component_start, component_length, new_component_length));
401      }
402
403      // Need to add the dot we just found (if we found one).
404      if (component_end < input16.length())
405        out16.push_back('.');
406    }
407  }
408  return out16;
409}
410
411// If |component| is valid, its begin is incremented by |delta|.
412void AdjustComponent(int delta, url::Component* component) {
413  if (!component->is_valid())
414    return;
415
416  DCHECK(delta >= 0 || component->begin >= -delta);
417  component->begin += delta;
418}
419
420// Adjusts all the components of |parsed| by |delta|, except for the scheme.
421void AdjustAllComponentsButScheme(int delta, url::Parsed* parsed) {
422  AdjustComponent(delta, &(parsed->username));
423  AdjustComponent(delta, &(parsed->password));
424  AdjustComponent(delta, &(parsed->host));
425  AdjustComponent(delta, &(parsed->port));
426  AdjustComponent(delta, &(parsed->path));
427  AdjustComponent(delta, &(parsed->query));
428  AdjustComponent(delta, &(parsed->ref));
429}
430
431// Helper for FormatUrlWithOffsets().
432base::string16 FormatViewSourceUrl(
433    const GURL& url,
434    const std::string& languages,
435    FormatUrlTypes format_types,
436    UnescapeRule::Type unescape_rules,
437    url::Parsed* new_parsed,
438    size_t* prefix_end,
439    base::OffsetAdjuster::Adjustments* adjustments) {
440  DCHECK(new_parsed);
441  const char kViewSource[] = "view-source:";
442  const size_t kViewSourceLength = arraysize(kViewSource) - 1;
443
444  // Format the underlying URL and record adjustments.
445  const std::string& url_str(url.possibly_invalid_spec());
446  adjustments->clear();
447  base::string16 result(base::ASCIIToUTF16(kViewSource) +
448      FormatUrlWithAdjustments(GURL(url_str.substr(kViewSourceLength)),
449                               languages, format_types, unescape_rules,
450                               new_parsed, prefix_end, adjustments));
451  // Revise |adjustments| by shifting to the offsets to prefix that the above
452  // call to FormatUrl didn't get to see.
453  for (base::OffsetAdjuster::Adjustments::iterator it = adjustments->begin();
454       it != adjustments->end(); ++it)
455    it->original_offset += kViewSourceLength;
456
457  // Adjust positions of the parsed components.
458  if (new_parsed->scheme.is_nonempty()) {
459    // Assume "view-source:real-scheme" as a scheme.
460    new_parsed->scheme.len += kViewSourceLength;
461  } else {
462    new_parsed->scheme.begin = 0;
463    new_parsed->scheme.len = kViewSourceLength - 1;
464  }
465  AdjustAllComponentsButScheme(kViewSourceLength, new_parsed);
466
467  if (prefix_end)
468    *prefix_end += kViewSourceLength;
469
470  return result;
471}
472
// Interface for the per-component transformation applied by
// AppendFormattedComponent().
class AppendComponentTransform {
 public:
  AppendComponentTransform() {}
  virtual ~AppendComponentTransform() {}

  // Returns the transformed form of |component_text|. |adjustments| is handed
  // to the underlying conversion routine, which records how offsets into
  // |component_text| map onto the returned string.
  virtual base::string16 Execute(
      const std::string& component_text,
      base::OffsetAdjuster::Adjustments* adjustments) const = 0;

  // NOTE: No DISALLOW_COPY_AND_ASSIGN here, since gcc < 4.3.0 requires an
  // accessible copy constructor in order to call AppendFormattedComponent()
  // with an inline temporary (see http://gcc.gnu.org/bugs/#cxx%5Frvalbind ).
};
486
// Transform for the host component: converts IDN components to Unicode via
// IDNToUnicodeWithAdjustments().
class HostComponentTransform : public AppendComponentTransform {
 public:
  // |languages| is stored by reference (see |languages_|), so the string must
  // outlive this object.
  explicit HostComponentTransform(const std::string& languages)
      : languages_(languages) {
  }

 private:
  // AppendComponentTransform implementation.
  virtual base::string16 Execute(
      const std::string& component_text,
      base::OffsetAdjuster::Adjustments* adjustments) const OVERRIDE {
    return IDNToUnicodeWithAdjustments(component_text, languages_,
                                       adjustments);
  }

  // Not a copy: refers to the string passed to the constructor.
  const std::string& languages_;
};
503
504class NonHostComponentTransform : public AppendComponentTransform {
505 public:
506  explicit NonHostComponentTransform(UnescapeRule::Type unescape_rules)
507      : unescape_rules_(unescape_rules) {
508  }
509
510 private:
511  virtual base::string16 Execute(
512      const std::string& component_text,
513      base::OffsetAdjuster::Adjustments* adjustments) const OVERRIDE {
514    return (unescape_rules_ == UnescapeRule::NONE) ?
515        base::UTF8ToUTF16WithAdjustments(component_text, adjustments) :
516        UnescapeAndDecodeUTF8URLComponentWithAdjustments(component_text,
517            unescape_rules_, adjustments);
518  }
519
520  const UnescapeRule::Type unescape_rules_;
521};
522
523// Transforms the portion of |spec| covered by |original_component| according to
524// |transform|.  Appends the result to |output|.  If |output_component| is
525// non-NULL, its start and length are set to the transformed component's new
526// start and length.  If |adjustments| is non-NULL, appends adjustments (if
527// any) that reflect the transformation the original component underwent to
528// become the transformed value appended to |output|.
529void AppendFormattedComponent(const std::string& spec,
530                              const url::Component& original_component,
531                              const AppendComponentTransform& transform,
532                              base::string16* output,
533                              url::Component* output_component,
534                              base::OffsetAdjuster::Adjustments* adjustments) {
535  DCHECK(output);
536  if (original_component.is_nonempty()) {
537    size_t original_component_begin =
538        static_cast<size_t>(original_component.begin);
539    size_t output_component_begin = output->length();
540    std::string component_str(spec, original_component_begin,
541                              static_cast<size_t>(original_component.len));
542
543    // Transform |component_str| and modify |adjustments| appropriately.
544    base::OffsetAdjuster::Adjustments component_transform_adjustments;
545    output->append(
546        transform.Execute(component_str, &component_transform_adjustments));
547
548    // Shift all the adjustments made for this component so the offsets are
549    // valid for the original string and add them to |adjustments|.
550    for (base::OffsetAdjuster::Adjustments::iterator comp_iter =
551         component_transform_adjustments.begin();
552         comp_iter != component_transform_adjustments.end(); ++comp_iter)
553      comp_iter->original_offset += original_component_begin;
554    if (adjustments) {
555      adjustments->insert(adjustments->end(),
556                          component_transform_adjustments.begin(),
557                          component_transform_adjustments.end());
558    }
559
560    // Set positions of the parsed component.
561    if (output_component) {
562      output_component->begin = static_cast<int>(output_component_begin);
563      output_component->len =
564          static_cast<int>(output->length() - output_component_begin);
565    }
566  } else if (output_component) {
567    output_component->reset();
568  }
569}
570
571}  // namespace
572
// Bit flags selecting which parts of a URL FormatUrlWithAdjustments() leaves
// out of its result. The flags occupy distinct bits and are tested with '&'.
const FormatUrlType kFormatUrlOmitNothing                     = 0;
const FormatUrlType kFormatUrlOmitUsernamePassword            = 1 << 0;
const FormatUrlType kFormatUrlOmitHTTP                        = 1 << 1;
const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname = 1 << 2;
// The union of the three omission flags above.
const FormatUrlType kFormatUrlOmitAll = kFormatUrlOmitUsernamePassword |
    kFormatUrlOmitHTTP | kFormatUrlOmitTrailingSlashOnBareHostname;
579
580base::string16 IDNToUnicode(const std::string& host,
581                            const std::string& languages) {
582  return IDNToUnicodeWithAdjustments(host, languages, NULL);
583}
584
585std::string GetDirectoryListingEntry(const base::string16& name,
586                                     const std::string& raw_bytes,
587                                     bool is_dir,
588                                     int64 size,
589                                     Time modified) {
590  std::string result;
591  result.append("<script>addRow(");
592  base::EscapeJSONString(name, true, &result);
593  result.append(",");
594  if (raw_bytes.empty()) {
595    base::EscapeJSONString(EscapePath(base::UTF16ToUTF8(name)), true, &result);
596  } else {
597    base::EscapeJSONString(EscapePath(raw_bytes), true, &result);
598  }
599  if (is_dir) {
600    result.append(",1,");
601  } else {
602    result.append(",0,");
603  }
604
605  // Negative size means unknown or not applicable (e.g. directory).
606  base::string16 size_string;
607  if (size >= 0)
608    size_string = FormatBytesUnlocalized(size);
609  base::EscapeJSONString(size_string, true, &result);
610
611  result.append(",");
612
613  base::string16 modified_str;
614  // |modified| can be NULL in FTP listings.
615  if (!modified.is_null()) {
616    modified_str = base::TimeFormatShortDateAndTime(modified);
617  }
618  base::EscapeJSONString(modified_str, true, &result);
619
620  result.append(");</script>\n");
621
622  return result;
623}
624
625void AppendFormattedHost(const GURL& url,
626                         const std::string& languages,
627                         base::string16* output) {
628  AppendFormattedComponent(url.possibly_invalid_spec(),
629      url.parsed_for_possibly_invalid_spec().host,
630      HostComponentTransform(languages), output, NULL, NULL);
631}
632
633base::string16 FormatUrlWithOffsets(
634    const GURL& url,
635    const std::string& languages,
636    FormatUrlTypes format_types,
637    UnescapeRule::Type unescape_rules,
638    url::Parsed* new_parsed,
639    size_t* prefix_end,
640    std::vector<size_t>* offsets_for_adjustment) {
641  base::OffsetAdjuster::Adjustments adjustments;
642  const base::string16& format_url_return_value =
643      FormatUrlWithAdjustments(url, languages, format_types, unescape_rules,
644                               new_parsed, prefix_end, &adjustments);
645  base::OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
646  if (offsets_for_adjustment) {
647    std::for_each(
648        offsets_for_adjustment->begin(),
649        offsets_for_adjustment->end(),
650        base::LimitOffset<std::string>(format_url_return_value.length()));
651  }
652  return format_url_return_value;
653}
654
// Builds a display string for |url| and returns it.
//
// |languages| is used when formatting the host. |format_types| is a bitmask
// of kFormatUrl* flags selecting parts to omit. |unescape_rules| is applied
// to the username, password, path and query; the ref is always converted
// with UnescapeRule::NONE.
//
// |new_parsed|, if non-NULL, is reset and then filled with the positions of
// the components in the returned string. |prefix_end|, if non-NULL, receives
// the length of the text that precedes the host in the returned string.
// |adjustments| must be non-NULL; it is cleared and then filled with one
// entry for each piece of the spec that was removed or transformed.
//
// "view-source:" URLs are delegated to FormatViewSourceUrl(), except when the
// spec starts with "view-source:view-source:".
base::string16 FormatUrlWithAdjustments(
    const GURL& url,
    const std::string& languages,
    FormatUrlTypes format_types,
    UnescapeRule::Type unescape_rules,
    url::Parsed* new_parsed,
    size_t* prefix_end,
    base::OffsetAdjuster::Adjustments* adjustments) {
  DCHECK(adjustments != NULL);
  adjustments->clear();
  // Write into a scratch Parsed when the caller did not supply one, so the
  // code below can use |new_parsed| unconditionally.
  url::Parsed parsed_temp;
  if (!new_parsed)
    new_parsed = &parsed_temp;
  else
    *new_parsed = url::Parsed();

  // Special handling for view-source:.  Don't use content::kViewSourceScheme
  // because this library shouldn't depend on chrome.
  const char* const kViewSource = "view-source";
  // Reject "view-source:view-source:..." to avoid deep recursion.
  const char* const kViewSourceTwice = "view-source:view-source:";
  if (url.SchemeIs(kViewSource) &&
      !StartsWithASCII(url.possibly_invalid_spec(), kViewSourceTwice, false)) {
    return FormatViewSourceUrl(url, languages, format_types,
                               unescape_rules, new_parsed, prefix_end,
                               adjustments);
  }

  // We handle both valid and invalid URLs (this will give us the spec
  // regardless of validity).
  const std::string& spec = url.possibly_invalid_spec();
  const url::Parsed& parsed = url.parsed_for_possibly_invalid_spec();

  // Scheme & separators.  These are ASCII.
  base::string16 url_string;
  url_string.insert(
      url_string.end(), spec.begin(),
      spec.begin() + parsed.CountCharactersBefore(url::Parsed::USERNAME, true));
  const char kHTTP[] = "http://";
  const char kFTP[] = "ftp.";
  // url_fixer::FixupURL() treats "ftp.foo.com" as ftp://ftp.foo.com.  This
  // means that if we trim "http://" off a URL whose host starts with "ftp." and
  // the user inputs this into any field subject to fixup (which is basically
  // all input fields), the meaning would be changed.  (In fact, often the
  // formatted URL is directly pre-filled into an input field.)  For this reason
  // we avoid stripping "http://" in this case.
  bool omit_http = (format_types & kFormatUrlOmitHTTP) &&
      EqualsASCII(url_string, kHTTP) &&
      !StartsWithASCII(url.host(), kFTP, true);
  new_parsed->scheme = parsed.scheme;

  // Username & password.
  if ((format_types & kFormatUrlOmitUsernamePassword) != 0) {
    // Remove the username and password fields. We don't want to display those
    // to the user since they can be used for attacks,
    // e.g. "http://google.com:search@evil.ru/"
    new_parsed->username.reset();
    new_parsed->password.reset();
    // Update the adjustments based on removed username and/or password.
    if (parsed.username.is_nonempty() || parsed.password.is_nonempty()) {
      if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) {
        // The seeming off-by-two is to account for the ':' after the username
        // and '@' after the password.
        adjustments->push_back(base::OffsetAdjuster::Adjustment(
            static_cast<size_t>(parsed.username.begin),
            static_cast<size_t>(parsed.username.len + parsed.password.len + 2),
            0));
      } else {
        const url::Component* nonempty_component =
            parsed.username.is_nonempty() ? &parsed.username : &parsed.password;
        // The seeming off-by-one is to account for the '@' after the
        // username/password.
        adjustments->push_back(base::OffsetAdjuster::Adjustment(
            static_cast<size_t>(nonempty_component->begin),
            static_cast<size_t>(nonempty_component->len + 1),
            0));
      }
    }
  } else {
    AppendFormattedComponent(spec, parsed.username,
                             NonHostComponentTransform(unescape_rules),
                             &url_string, &new_parsed->username, adjustments);
    if (parsed.password.is_valid())
      url_string.push_back(':');
    AppendFormattedComponent(spec, parsed.password,
                             NonHostComponentTransform(unescape_rules),
                             &url_string, &new_parsed->password, adjustments);
    if (parsed.username.is_valid() || parsed.password.is_valid())
      url_string.push_back('@');
  }
  // Everything appended so far precedes the host.
  if (prefix_end)
    *prefix_end = static_cast<size_t>(url_string.length());

  // Host.
  AppendFormattedComponent(spec, parsed.host, HostComponentTransform(languages),
                           &url_string, &new_parsed->host, adjustments);

  // Port.
  if (parsed.port.is_nonempty()) {
    url_string.push_back(':');
    new_parsed->port.begin = url_string.length();
    url_string.insert(url_string.end(),
                      spec.begin() + parsed.port.begin,
                      spec.begin() + parsed.port.end());
    new_parsed->port.len = url_string.length() - new_parsed->port.begin;
  } else {
    new_parsed->port.reset();
  }

  // Path & query.  Both get the same general unescape & convert treatment.
  if (!(format_types & kFormatUrlOmitTrailingSlashOnBareHostname) ||
      !CanStripTrailingSlash(url)) {
    AppendFormattedComponent(spec, parsed.path,
                             NonHostComponentTransform(unescape_rules),
                             &url_string, &new_parsed->path, adjustments);
  } else {
    // The path is dropped from the output; record that its characters were
    // removed.
    if (parsed.path.len > 0) {
      adjustments->push_back(base::OffsetAdjuster::Adjustment(
          parsed.path.begin, parsed.path.len, 0));
    }
  }
  if (parsed.query.is_valid())
    url_string.push_back('?');
  AppendFormattedComponent(spec, parsed.query,
                           NonHostComponentTransform(unescape_rules),
                           &url_string, &new_parsed->query, adjustments);

  // Ref.  This is valid, unescaped UTF-8, so we can just convert.
  if (parsed.ref.is_valid())
    url_string.push_back('#');
  AppendFormattedComponent(spec, parsed.ref,
                           NonHostComponentTransform(UnescapeRule::NONE),
                           &url_string, &new_parsed->ref, adjustments);

  // If we need to strip out http do it after the fact.
  if (omit_http && StartsWith(url_string, base::ASCIIToUTF16(kHTTP), true)) {
    const size_t kHTTPSize = arraysize(kHTTP) - 1;
    url_string = url_string.substr(kHTTPSize);
    // Because offsets in the |adjustments| are already calculated with respect
    // to the string with the http:// prefix in it, those offsets remain correct
    // after stripping the prefix.  The only thing necessary is to add an
    // adjustment to reflect the stripped prefix.
    adjustments->insert(adjustments->begin(),
        base::OffsetAdjuster::Adjustment(0, kHTTPSize, 0));

    if (prefix_end)
      *prefix_end -= kHTTPSize;

    // Adjust new_parsed.
    DCHECK(new_parsed->scheme.is_valid());
    int delta = -(new_parsed->scheme.len + 3);  // +3 for ://.
    new_parsed->scheme.reset();
    AdjustAllComponentsButScheme(delta, new_parsed);
  }

  return url_string;
}
812
813base::string16 FormatUrl(const GURL& url,
814                         const std::string& languages,
815                         FormatUrlTypes format_types,
816                         UnescapeRule::Type unescape_rules,
817                         url::Parsed* new_parsed,
818                         size_t* prefix_end,
819                         size_t* offset_for_adjustment) {
820  Offsets offsets;
821  if (offset_for_adjustment)
822    offsets.push_back(*offset_for_adjustment);
823  base::string16 result = FormatUrlWithOffsets(url, languages, format_types,
824      unescape_rules, new_parsed, prefix_end, &offsets);
825  if (offset_for_adjustment)
826    *offset_for_adjustment = offsets[0];
827  return result;
828}
829
830}  // namespace net
831