1a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// Use of this source code is governed by a BSD-style license that can be 3a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// found in the LICENSE file. 4a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 5a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "net/base/net_util.h" 6a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 7a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include <map> 8a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include <vector> 9a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 10a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "base/i18n/time_formatting.h" 11a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "base/json/string_escape.h" 12a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "base/lazy_instance.h" 13a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "base/logging.h" 14a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "base/memory/singleton.h" 15a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "base/stl_util.h" 16a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "base/strings/string_tokenizer.h" 17a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "base/strings/string_util.h" 18a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "base/strings/utf_offset_string_conversions.h" 19a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "base/strings/utf_string_conversions.h" 20a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "base/time/time.h" 21a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "url/gurl.h" 22a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "third_party/icu/source/common/unicode/uidna.h" 23a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "third_party/icu/source/common/unicode/uniset.h" 
24a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "third_party/icu/source/common/unicode/uscript.h" 25a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "third_party/icu/source/common/unicode/uset.h" 26a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "third_party/icu/source/i18n/unicode/datefmt.h" 27a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "third_party/icu/source/i18n/unicode/regex.h" 28a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#include "third_party/icu/source/i18n/unicode/ulocdata.h" 29a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 30a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochusing base::Time; 31a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 32a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochnamespace net { 33a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 34a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochnamespace { 35a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 36a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochtypedef std::vector<size_t> Offsets; 37a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 38a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// Does some simple normalization of scripts so we can allow certain scripts 39a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// to exist together. 40a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// TODO(brettw) bug 880223: we should allow some other languages to be 41a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// combined such as Chinese and Latin. We will probably need a more 42a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// complicated system of language pairs to have more fine-grained control. 
43a02191e04bc25c4935f804f2c080ae28663d096dBen MurdochUScriptCode NormalizeScript(UScriptCode code) { 44a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch switch (code) { 45a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch case USCRIPT_KATAKANA: 46a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch case USCRIPT_HIRAGANA: 47a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch case USCRIPT_KATAKANA_OR_HIRAGANA: 48a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch case USCRIPT_HANGUL: // This one is arguable. 49a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return USCRIPT_HAN; 50a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch default: 51a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return code; 52a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 53a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 54a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 55a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochbool IsIDNComponentInSingleScript(const base::char16* str, int str_len) { 56a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch UScriptCode first_script = USCRIPT_INVALID_CODE; 57a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch bool is_first = true; 58a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 59a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch int i = 0; 60a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch while (i < str_len) { 61a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch unsigned code_point; 62a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch U16_NEXT(str, i, str_len, code_point); 63a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 64a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch UErrorCode err = U_ZERO_ERROR; 65a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch UScriptCode cur_script = uscript_getScript(code_point, &err); 66a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (err != U_ZERO_ERROR) 67a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return false; // Report mixed on error. 
68a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch cur_script = NormalizeScript(cur_script); 69a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 70a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // TODO(brettw) We may have to check for USCRIPT_INHERENT as well. 71a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (is_first && cur_script != USCRIPT_COMMON) { 72a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch first_script = cur_script; 73a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch is_first = false; 74a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } else { 75a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (cur_script != USCRIPT_COMMON && cur_script != first_script) 76a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return false; 77a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 78a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 79a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return true; 80a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 81a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 82a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// Check if the script of a language can be 'safely' mixed with 83a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// Latin letters in the ASCII range. 84a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochbool IsCompatibleWithASCIILetters(const std::string& lang) { 85a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // For now, just list Chinese, Japanese and Korean (positive list). 86a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // An alternative is negative-listing (languages using Greek and 87a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Cyrillic letters), but it can be more dangerous. 
88a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return !lang.substr(0, 2).compare("zh") || 89a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch !lang.substr(0, 2).compare("ja") || 90a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch !lang.substr(0, 2).compare("ko"); 91a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 92a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 93a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochtypedef std::map<std::string, icu::UnicodeSet*> LangToExemplarSetMap; 94a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 95a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochclass LangToExemplarSet { 96a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch public: 97a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch static LangToExemplarSet* GetInstance() { 98a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return Singleton<LangToExemplarSet>::get(); 99a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 100a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 101a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch private: 102a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch LangToExemplarSetMap map; 103a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch LangToExemplarSet() { } 104a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch ~LangToExemplarSet() { 105a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch STLDeleteContainerPairSecondPointers(map.begin(), map.end()); 106a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 107a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 108a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch friend class Singleton<LangToExemplarSet>; 109a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch friend struct DefaultSingletonTraits<LangToExemplarSet>; 110a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch friend bool GetExemplarSetForLang(const std::string&, icu::UnicodeSet**); 111a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch friend void SetExemplarSetForLang(const std::string&, 
icu::UnicodeSet*); 112a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 113a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch DISALLOW_COPY_AND_ASSIGN(LangToExemplarSet); 114a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch}; 115a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 116a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochbool GetExemplarSetForLang(const std::string& lang, 117a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch icu::UnicodeSet** lang_set) { 118a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map; 119a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch LangToExemplarSetMap::const_iterator pos = map.find(lang); 120a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (pos != map.end()) { 121a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch *lang_set = pos->second; 122a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return true; 123a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 124a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return false; 125a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 126a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 127a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochvoid SetExemplarSetForLang(const std::string& lang, 128a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch icu::UnicodeSet* lang_set) { 129a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map; 130a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch map.insert(std::make_pair(lang, lang_set)); 131a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 132a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 133a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochstatic base::LazyInstance<base::Lock>::Leaky 134a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch g_lang_set_lock = LAZY_INSTANCE_INITIALIZER; 135a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 
136a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// Returns true if all the characters in component_characters are used by 137a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// the language |lang|. 138a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochbool IsComponentCoveredByLang(const icu::UnicodeSet& component_characters, 139a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const std::string& lang) { 140a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch CR_DEFINE_STATIC_LOCAL( 141a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const icu::UnicodeSet, kASCIILetters, ('a', 'z')); 142a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch icu::UnicodeSet* lang_set = NULL; 143a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // We're called from both the UI thread and the history thread. 144a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch { 145a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::AutoLock lock(g_lang_set_lock.Get()); 146a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (!GetExemplarSetForLang(lang, &lang_set)) { 147a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch UErrorCode status = U_ZERO_ERROR; 148a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch ULocaleData* uld = ulocdata_open(lang.c_str(), &status); 149a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // TODO(jungshik) Turn this check on when the ICU data file is 150a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // rebuilt with the minimal subset of locale data for languages 151a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // to which Chrome is not localized but which we offer in the list 152a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // of languages selectable for Accept-Languages. With the rebuilt ICU 153a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // data, ulocdata_open never should fall back to the default locale. 
154a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // (issue 2078) 155a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // DCHECK(U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING); 156a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING) { 157a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch lang_set = reinterpret_cast<icu::UnicodeSet *>( 158a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch ulocdata_getExemplarSet(uld, NULL, 0, 159a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch ULOCDATA_ES_STANDARD, &status)); 160a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // If |lang| is compatible with ASCII Latin letters, add them. 161a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (IsCompatibleWithASCIILetters(lang)) 162a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch lang_set->addAll(kASCIILetters); 163a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } else { 164a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch lang_set = new icu::UnicodeSet(1, 0); 165a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 166a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch lang_set->freeze(); 167a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch SetExemplarSetForLang(lang, lang_set); 168a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch ulocdata_close(uld); 169a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 170a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 171a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return !lang_set->isEmpty() && lang_set->containsAll(component_characters); 172a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 173a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 174a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// Returns true if the given Unicode host component is safe to display to the 175a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// user. 
176a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochbool IsIDNComponentSafe(const base::char16* str, 177a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch int str_len, 178a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const std::string& languages) { 179a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Most common cases (non-IDN) do not reach here so that we don't 180a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // need a fast return path. 181a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // TODO(jungshik) : Check if there's any character inappropriate 182a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // (although allowed) for domain names. 183a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and 184a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // http://www.unicode.org/reports/tr39/data/xidmodifications.txt 185a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // For now, we borrow the list from Mozilla and tweaked it slightly. 186a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because 187a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // they're gonna be canonicalized to U+0020 and full stop before 188a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // reaching here.) 
189a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // The original list is available at 190a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // http://kb.mozillazine.org/Network.IDN.blacklist_chars and 191a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // at http://mxr.mozilla.org/seamonkey/source/modules/libpref/src/init/all.js#703 192a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 193a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch UErrorCode status = U_ZERO_ERROR; 194a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#ifdef U_WCHAR_IS_UTF16 195a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch icu::UnicodeSet dangerous_characters(icu::UnicodeString( 196a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch L"[[\\ \u00ad\u00bc\u00bd\u01c3\u0337\u0338" 197a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]" 198a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]" 199a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae" 200a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014" 201a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14" 202a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]" 203a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch L"[\ufffa-\ufffd]]"), status); 204a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch DCHECK(U_SUCCESS(status)); 205a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch icu::RegexMatcher dangerous_patterns(icu::UnicodeString( 206a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Lone katakana no, so, or n 207a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch L"[^\\p{Katakana}][\u30ce\u30f3\u30bd][^\\p{Katakana}]" 208a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Repeating 
Japanese accent characters 209a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch L"|[\u3099\u309a\u309b\u309c][\u3099\u309a\u309b\u309c]"), 210a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 0, status); 211a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#else 212a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch icu::UnicodeSet dangerous_characters(icu::UnicodeString( 213a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "[[\\u0020\\u00ad\\u00bc\\u00bd\\u01c3\\u0337\\u0338" 214a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]" 215a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]" 216a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae" 217a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014" 218a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14" 219a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]" 220a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "[\\ufffa-\\ufffd]]", -1, US_INV), status); 221a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch DCHECK(U_SUCCESS(status)); 222a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch icu::RegexMatcher dangerous_patterns(icu::UnicodeString( 223a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Lone katakana no, so, or n 224a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "[^\\p{Katakana}][\\u30ce\\u30f3\u30bd][^\\p{Katakana}]" 225a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Repeating Japanese accent characters 226a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "|[\\u3099\\u309a\\u309b\\u309c][\\u3099\\u309a\\u309b\\u309c]"), 227a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 0, status); 
228a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#endif 229a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch DCHECK(U_SUCCESS(status)); 230a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch icu::UnicodeSet component_characters; 231a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch icu::UnicodeString component_string(str, str_len); 232a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch component_characters.addAll(component_string); 233a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (dangerous_characters.containsSome(component_characters)) 234a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return false; 235a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 236a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch DCHECK(U_SUCCESS(status)); 237a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch dangerous_patterns.reset(component_string); 238a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (dangerous_patterns.find()) 239a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return false; 240a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 241a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // If the language list is empty, the result is completely determined 242a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // by whether a component is a single script or not. This will block 243a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are 244a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // allowed with |languages| (while it blocks Chinese + Latin letters with 245a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // an accent as should be the case), but we want to err on the safe side 246a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // when |languages| is empty. 
247a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (languages.empty()) 248a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return IsIDNComponentInSingleScript(str, str_len); 249a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 250a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // |common_characters| is made up of ASCII numbers, hyphen, plus and 251a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // underscore that are used across scripts and allowed in domain names. 252a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // (sync'd with characters allowed in url_canon_host with square 253a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc. 254a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"), 255a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch status); 256a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch DCHECK(U_SUCCESS(status)); 257a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Subtract common characters because they're always allowed so that 258a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // we just have to check if a language-specific set contains 259a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // the remainder. 
260a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch component_characters.removeAll(common_characters); 261a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 262a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::StringTokenizer t(languages, ","); 263a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch while (t.GetNext()) { 264a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (IsComponentCoveredByLang(component_characters, t.token())) 265a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return true; 266a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 267a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return false; 268a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 269a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 270a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to 271a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// a UTS46/IDNA 2008 handling object opened with uidna_openUTS46(). 272a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// 273a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// We use UTS46 with BiDiCheck to migrate from IDNA 2003 to IDNA 2008 with 274a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// the backward compatibility in mind. What it does: 275a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// 276a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// 1. Use the up-to-date Unicode data. 277a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// 2. Define a case folding/mapping with the up-to-date Unicode data as 278a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// in IDNA 2003. 279a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// 3. Use transitional mechanism for 4 deviation characters (sharp-s, 280a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// final sigma, ZWJ and ZWNJ) for now. 281a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// 4. Continue to allow symbols and punctuations. 
282a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// 5. Apply new BiDi check rules more permissive than the IDNA 2003 BiDI rules. 283a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// 6. Do not apply STD3 rules 284a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// 7. Do not allow unassigned code points. 285a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// 286a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// It also closely matches what IE 10 does except for the BiDi check ( 287a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// http://goo.gl/3XBhqw ). 288a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// See http://http://unicode.org/reports/tr46/ and references therein 289a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// for more details. 290a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochstruct UIDNAWrapper { 291a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch UIDNAWrapper() { 292a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch UErrorCode err = U_ZERO_ERROR; 293a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // TODO(jungshik): Change options as different parties (browsers, 294a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // registrars, search engines) converge toward a consensus. 
295a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err); 296a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (U_FAILURE(err)) 297a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch value = NULL; 298a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 299a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 300a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch UIDNA* value; 301a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch}; 302a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 303a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochstatic base::LazyInstance<UIDNAWrapper>::Leaky 304a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch g_uidna = LAZY_INSTANCE_INITIALIZER; 305a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 306a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// Converts one component of a host (between dots) to IDN if safe. The result 307a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// will be APPENDED to the given output string and will be the same as the input 308a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// if it is not IDN or the IDN is unsafe to display. Returns whether any 309a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// conversion was performed. 310a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochbool IDNToUnicodeOneComponent(const base::char16* comp, 311a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch size_t comp_len, 312a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const std::string& languages, 313a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::string16* out) { 314a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch DCHECK(out); 315a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (comp_len == 0) 316a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return false; 317a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 318a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Only transform if the input can be an IDN component. 
319a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch static const base::char16 kIdnPrefix[] = {'x', 'n', '-', '-'}; 320a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if ((comp_len > arraysize(kIdnPrefix)) && 321a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(base::char16))) { 322a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch UIDNA* uidna = g_uidna.Get().value; 323a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch DCHECK(uidna != NULL); 324a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch size_t original_length = out->length(); 325a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch int output_length = 64; 326a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch UIDNAInfo info = UIDNA_INFO_INITIALIZER; 327a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch UErrorCode status; 328a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch do { 329a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch out->resize(original_length + output_length); 330a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch status = U_ZERO_ERROR; 331a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // This returns the actual length required. If this is more than 64 332a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // code units, |status| will be U_BUFFER_OVERFLOW_ERROR and we'll try 333a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // the conversion again, but with a sufficiently large buffer. 
334a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch output_length = uidna_labelToUnicode( 335a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch uidna, comp, static_cast<int32_t>(comp_len), &(*out)[original_length], 336a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch output_length, &info, &status); 337a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } while ((status == U_BUFFER_OVERFLOW_ERROR && info.errors == 0)); 338a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 339a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (U_SUCCESS(status) && info.errors == 0) { 340a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Converted successfully. Ensure that the converted component 341a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // can be safely displayed to the user. 342a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch out->resize(original_length + output_length); 343a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (IsIDNComponentSafe(out->data() + original_length, output_length, 344a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch languages)) 345a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return true; 346a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 347a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 348a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Something went wrong. Revert to original string. 349a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch out->resize(original_length); 350a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 351a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 352a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // We get here with no IDN or on error, in which case we just append the 353a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // literal input. 
354a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch out->append(comp, comp_len); 355a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return false; 356a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 357a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 358a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// TODO(brettw) bug 734373: check the scripts for each host component and 359a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// don't un-IDN-ize if there is more than one. Alternatively, only IDN for 360a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// scripts that the user has installed. For now, just put the entire 361a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// path through IDN. Maybe this feature can be implemented in ICU itself? 362a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// 363a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// We may want to skip this step in the case of file URLs to allow unicode 364a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// UNC hostnames regardless of encodings. 3650529e5d033099cbfc42635f6f6183833b09dff6eBen Murdochbase::string16 IDNToUnicodeWithAdjustments( 3660529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch const std::string& host, 3670529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch const std::string& languages, 3680529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch base::OffsetAdjuster::Adjustments* adjustments) { 3690529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch if (adjustments) 3700529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch adjustments->clear(); 371a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Convert the ASCII input to a base::string16 for ICU. 
372a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::string16 input16; 373a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch input16.reserve(host.length()); 374a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch input16.insert(input16.end(), host.begin(), host.end()); 375a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 376a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Do each component of the host separately, since we enforce script matching 377a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // on a per-component basis. 378a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::string16 out16; 379a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch { 380a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch for (size_t component_start = 0, component_end; 381a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch component_start < input16.length(); 382a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch component_start = component_end + 1) { 383a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Find the end of the component. 384a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch component_end = input16.find('.', component_start); 385a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (component_end == base::string16::npos) 386a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch component_end = input16.length(); // For getting the last component. 387a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch size_t component_length = component_end - component_start; 388a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch size_t new_component_start = out16.length(); 389a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch bool converted_idn = false; 390a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (component_end > component_start) { 391a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Add the substring that we just found. 
392a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch converted_idn = IDNToUnicodeOneComponent( 393a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch input16.data() + component_start, component_length, languages, 394a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch &out16); 395a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 396a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch size_t new_component_length = out16.length() - new_component_start; 397a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 3980529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch if (converted_idn && adjustments) { 3990529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch adjustments->push_back(base::OffsetAdjuster::Adjustment( 4000529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch component_start, component_length, new_component_length)); 401a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 402a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 403a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Need to add the dot we just found (if we found one). 404a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (component_end < input16.length()) 405a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch out16.push_back('.'); 406a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 407a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 408a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return out16; 409a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 410a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 411a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// If |component| is valid, its begin is incremented by |delta|. 
4125c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liuvoid AdjustComponent(int delta, url::Component* component) { 413a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (!component->is_valid()) 414a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return; 415a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 416a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch DCHECK(delta >= 0 || component->begin >= -delta); 417a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch component->begin += delta; 418a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 419a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 420a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// Adjusts all the components of |parsed| by |delta|, except for the scheme. 4215c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liuvoid AdjustAllComponentsButScheme(int delta, url::Parsed* parsed) { 422a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch AdjustComponent(delta, &(parsed->username)); 423a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch AdjustComponent(delta, &(parsed->password)); 424a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch AdjustComponent(delta, &(parsed->host)); 425a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch AdjustComponent(delta, &(parsed->port)); 426a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch AdjustComponent(delta, &(parsed->path)); 427a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch AdjustComponent(delta, &(parsed->query)); 428a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch AdjustComponent(delta, &(parsed->ref)); 429a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 430a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 431a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// Helper for FormatUrlWithOffsets(). 
4320529e5d033099cbfc42635f6f6183833b09dff6eBen Murdochbase::string16 FormatViewSourceUrl( 4330529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch const GURL& url, 4340529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch const std::string& languages, 4350529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch FormatUrlTypes format_types, 4360529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch UnescapeRule::Type unescape_rules, 4375c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu url::Parsed* new_parsed, 4380529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch size_t* prefix_end, 4390529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch base::OffsetAdjuster::Adjustments* adjustments) { 440a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch DCHECK(new_parsed); 441a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const char kViewSource[] = "view-source:"; 442a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const size_t kViewSourceLength = arraysize(kViewSource) - 1; 443a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 4440529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch // Format the underlying URL and record adjustments. 445a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const std::string& url_str(url.possibly_invalid_spec()); 4460529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch adjustments->clear(); 447a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::string16 result(base::ASCIIToUTF16(kViewSource) + 4480529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch FormatUrlWithAdjustments(GURL(url_str.substr(kViewSourceLength)), 4490529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch languages, format_types, unescape_rules, 4500529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch new_parsed, prefix_end, adjustments)); 4510529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch // Revise |adjustments| by shifting to the offsets to prefix that the above 4520529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch // call to FormatUrl didn't get to see. 
4530529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch for (base::OffsetAdjuster::Adjustments::iterator it = adjustments->begin(); 4540529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch it != adjustments->end(); ++it) 4550529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch it->original_offset += kViewSourceLength; 456a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 457a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Adjust positions of the parsed components. 458a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (new_parsed->scheme.is_nonempty()) { 459a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Assume "view-source:real-scheme" as a scheme. 460a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch new_parsed->scheme.len += kViewSourceLength; 461a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } else { 462a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch new_parsed->scheme.begin = 0; 463a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch new_parsed->scheme.len = kViewSourceLength - 1; 464a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 465a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch AdjustAllComponentsButScheme(kViewSourceLength, new_parsed); 466a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 467a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (prefix_end) 468a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch *prefix_end += kViewSourceLength; 469a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 470a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return result; 471a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 472a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 473a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochclass AppendComponentTransform { 474a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch public: 475a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch AppendComponentTransform() {} 476a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch virtual ~AppendComponentTransform() {} 
477a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 4780529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch virtual base::string16 Execute( 4790529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch const std::string& component_text, 4800529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch base::OffsetAdjuster::Adjustments* adjustments) const = 0; 481a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 482a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // NOTE: No DISALLOW_COPY_AND_ASSIGN here, since gcc < 4.3.0 requires an 483a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // accessible copy constructor in order to call AppendFormattedComponent() 484a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // with an inline temporary (see http://gcc.gnu.org/bugs/#cxx%5Frvalbind ). 485a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch}; 486a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 487a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochclass HostComponentTransform : public AppendComponentTransform { 488a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch public: 489a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch explicit HostComponentTransform(const std::string& languages) 490a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch : languages_(languages) { 491a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 492a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 493a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch private: 494a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch virtual base::string16 Execute( 495a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const std::string& component_text, 4960529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch base::OffsetAdjuster::Adjustments* adjustments) const OVERRIDE { 4970529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch return IDNToUnicodeWithAdjustments(component_text, languages_, 4980529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch adjustments); 499a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 
500a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 501a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const std::string& languages_; 502a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch}; 503a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 504a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochclass NonHostComponentTransform : public AppendComponentTransform { 505a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch public: 506a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch explicit NonHostComponentTransform(UnescapeRule::Type unescape_rules) 507a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch : unescape_rules_(unescape_rules) { 508a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 509a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 510a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch private: 511a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch virtual base::string16 Execute( 512a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const std::string& component_text, 5130529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch base::OffsetAdjuster::Adjustments* adjustments) const OVERRIDE { 514a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return (unescape_rules_ == UnescapeRule::NONE) ? 
5150529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch base::UTF8ToUTF16WithAdjustments(component_text, adjustments) : 5160529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch UnescapeAndDecodeUTF8URLComponentWithAdjustments(component_text, 5170529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch unescape_rules_, adjustments); 518a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 519a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 520a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const UnescapeRule::Type unescape_rules_; 521a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch}; 522a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 523a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// Transforms the portion of |spec| covered by |original_component| according to 524a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// |transform|. Appends the result to |output|. If |output_component| is 525a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// non-NULL, its start and length are set to the transformed component's new 5260529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// start and length. If |adjustments| is non-NULL, appends adjustments (if 5270529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// any) that reflect the transformation the original component underwent to 5280529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// become the transformed value appended to |output|. 
529a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochvoid AppendFormattedComponent(const std::string& spec, 5305c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu const url::Component& original_component, 531a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const AppendComponentTransform& transform, 532a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::string16* output, 5335c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu url::Component* output_component, 5340529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch base::OffsetAdjuster::Adjustments* adjustments) { 535a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch DCHECK(output); 536a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (original_component.is_nonempty()) { 537a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch size_t original_component_begin = 538a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch static_cast<size_t>(original_component.begin); 539a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch size_t output_component_begin = output->length(); 540a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch std::string component_str(spec, original_component_begin, 541a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch static_cast<size_t>(original_component.len)); 542a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 5430529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch // Transform |component_str| and modify |adjustments| appropriately. 
5440529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch base::OffsetAdjuster::Adjustments component_transform_adjustments; 5450529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch output->append( 5460529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch transform.Execute(component_str, &component_transform_adjustments)); 5470529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch 5480529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch // Shift all the adjustments made for this component so the offsets are 5490529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch // valid for the original string and add them to |adjustments|. 5500529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch for (base::OffsetAdjuster::Adjustments::iterator comp_iter = 5510529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch component_transform_adjustments.begin(); 5520529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch comp_iter != component_transform_adjustments.end(); ++comp_iter) 5530529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch comp_iter->original_offset += original_component_begin; 5540529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch if (adjustments) { 5550529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch adjustments->insert(adjustments->end(), 5560529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch component_transform_adjustments.begin(), 5570529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch component_transform_adjustments.end()); 558a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 559a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 560a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Set positions of the parsed component. 
561a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (output_component) { 562a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch output_component->begin = static_cast<int>(output_component_begin); 563a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch output_component->len = 564a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch static_cast<int>(output->length() - output_component_begin); 565a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 566a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } else if (output_component) { 567a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch output_component->reset(); 568a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 569a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 570a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 571a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} // namespace 572a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 573a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochconst FormatUrlType kFormatUrlOmitNothing = 0; 574a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochconst FormatUrlType kFormatUrlOmitUsernamePassword = 1 << 0; 575a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochconst FormatUrlType kFormatUrlOmitHTTP = 1 << 1; 576a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochconst FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname = 1 << 2; 577a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochconst FormatUrlType kFormatUrlOmitAll = kFormatUrlOmitUsernamePassword | 578a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch kFormatUrlOmitHTTP | kFormatUrlOmitTrailingSlashOnBareHostname; 579a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 580a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochbase::string16 IDNToUnicode(const std::string& host, 581a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const std::string& languages) { 5820529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch return IDNToUnicodeWithAdjustments(host, languages, NULL); 
583a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 584a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 585a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochstd::string GetDirectoryListingEntry(const base::string16& name, 586a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const std::string& raw_bytes, 587a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch bool is_dir, 588a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch int64 size, 589a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch Time modified) { 590a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch std::string result; 591a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch result.append("<script>addRow("); 592a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::EscapeJSONString(name, true, &result); 593a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch result.append(","); 594a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (raw_bytes.empty()) { 595a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::EscapeJSONString(EscapePath(base::UTF16ToUTF8(name)), true, &result); 596a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } else { 597a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::EscapeJSONString(EscapePath(raw_bytes), true, &result); 598a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 599a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (is_dir) { 600a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch result.append(",1,"); 601a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } else { 602a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch result.append(",0,"); 603a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 604a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 605a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Negative size means unknown or not applicable (e.g. directory). 
606a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::string16 size_string; 607a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (size >= 0) 608a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch size_string = FormatBytesUnlocalized(size); 609a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::EscapeJSONString(size_string, true, &result); 610a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 611a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch result.append(","); 612a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 613a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::string16 modified_str; 614a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // |modified| can be NULL in FTP listings. 615a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (!modified.is_null()) { 616a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch modified_str = base::TimeFormatShortDateAndTime(modified); 617a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 618a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::EscapeJSONString(modified_str, true, &result); 619a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 620a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch result.append(");</script>\n"); 621a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 622a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return result; 623a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 624a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 625a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochvoid AppendFormattedHost(const GURL& url, 626a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const std::string& languages, 627a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::string16* output) { 628a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch AppendFormattedComponent(url.possibly_invalid_spec(), 6290529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch url.parsed_for_possibly_invalid_spec().host, 630a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 
HostComponentTransform(languages), output, NULL, NULL); 631a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 632a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 633a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochbase::string16 FormatUrlWithOffsets( 634a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const GURL& url, 635a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const std::string& languages, 636a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch FormatUrlTypes format_types, 637a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch UnescapeRule::Type unescape_rules, 6385c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu url::Parsed* new_parsed, 639a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch size_t* prefix_end, 6400529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch std::vector<size_t>* offsets_for_adjustment) { 6410529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch base::OffsetAdjuster::Adjustments adjustments; 6420529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch const base::string16& format_url_return_value = 6430529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch FormatUrlWithAdjustments(url, languages, format_types, unescape_rules, 6440529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch new_parsed, prefix_end, &adjustments); 6450529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch base::OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); 6460529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch if (offsets_for_adjustment) { 6470529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch std::for_each( 6480529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch offsets_for_adjustment->begin(), 6490529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch offsets_for_adjustment->end(), 6500529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch base::LimitOffset<std::string>(format_url_return_value.length())); 6510529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch } 6520529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch return format_url_return_value; 
6530529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch} 6540529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch 6550529e5d033099cbfc42635f6f6183833b09dff6eBen Murdochbase::string16 FormatUrlWithAdjustments( 6560529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch const GURL& url, 6570529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch const std::string& languages, 6580529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch FormatUrlTypes format_types, 6590529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch UnescapeRule::Type unescape_rules, 6605c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu url::Parsed* new_parsed, 6610529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch size_t* prefix_end, 6620529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch base::OffsetAdjuster::Adjustments* adjustments) { 6630529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch DCHECK(adjustments != NULL); 6640529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch adjustments->clear(); 6655c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu url::Parsed parsed_temp; 666a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (!new_parsed) 667a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch new_parsed = &parsed_temp; 668a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch else 6695c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu *new_parsed = url::Parsed(); 670a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 671a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Special handling for view-source:. Don't use content::kViewSourceScheme 672a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // because this library shouldn't depend on chrome. 673a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const char* const kViewSource = "view-source"; 674a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Reject "view-source:view-source:..." to avoid deep recursion. 
675a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const char* const kViewSourceTwice = "view-source:view-source:"; 676a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (url.SchemeIs(kViewSource) && 677a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch !StartsWithASCII(url.possibly_invalid_spec(), kViewSourceTwice, false)) { 6780529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch return FormatViewSourceUrl(url, languages, format_types, 679a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch unescape_rules, new_parsed, prefix_end, 6800529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch adjustments); 681a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 682a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 683a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // We handle both valid and invalid URLs (this will give us the spec 684a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // regardless of validity). 685a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const std::string& spec = url.possibly_invalid_spec(); 6865c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu const url::Parsed& parsed = url.parsed_for_possibly_invalid_spec(); 687a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 688a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Scheme & separators. These are ASCII. 689a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::string16 url_string; 6900529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch url_string.insert( 6910529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch url_string.end(), spec.begin(), 6925c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu spec.begin() + parsed.CountCharactersBefore(url::Parsed::USERNAME, true)); 693a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const char kHTTP[] = "http://"; 694a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const char kFTP[] = "ftp."; 695f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) // url_fixer::FixupURL() treats "ftp.foo.com" as ftp://ftp.foo.com. 
This 696a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // means that if we trim "http://" off a URL whose host starts with "ftp." and 697a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // the user inputs this into any field subject to fixup (which is basically 698a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // all input fields), the meaning would be changed. (In fact, often the 699a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // formatted URL is directly pre-filled into an input field.) For this reason 700a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // we avoid stripping "http://" in this case. 701a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch bool omit_http = (format_types & kFormatUrlOmitHTTP) && 702a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch EqualsASCII(url_string, kHTTP) && 703a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch !StartsWithASCII(url.host(), kFTP, true); 704a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch new_parsed->scheme = parsed.scheme; 705a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 706a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Username & password. 707a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if ((format_types & kFormatUrlOmitUsernamePassword) != 0) { 708a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Remove the username and password fields. We don't want to display those 709a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // to the user since they can be used for attacks, 710a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // e.g. "http://google.com:search@evil.ru/" 711a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch new_parsed->username.reset(); 712a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch new_parsed->password.reset(); 7130529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch // Update the adjustments based on removed username and/or password. 
7140529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch if (parsed.username.is_nonempty() || parsed.password.is_nonempty()) { 715a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) { 7160529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch // The seeming off-by-two is to account for the ':' after the username 7170529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch // and '@' after the password. 7180529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch adjustments->push_back(base::OffsetAdjuster::Adjustment( 719a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch static_cast<size_t>(parsed.username.begin), 720a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch static_cast<size_t>(parsed.username.len + parsed.password.len + 2), 721a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 0)); 722a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } else { 7235c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu const url::Component* nonempty_component = 724a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch parsed.username.is_nonempty() ? &parsed.username : &parsed.password; 7250529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch // The seeming off-by-one is to account for the '@' after the 726a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // username/password. 
7270529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch adjustments->push_back(base::OffsetAdjuster::Adjustment( 728a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch static_cast<size_t>(nonempty_component->begin), 7290529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch static_cast<size_t>(nonempty_component->len + 1), 7300529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch 0)); 731a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 732a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 733a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } else { 7340529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch AppendFormattedComponent(spec, parsed.username, 7350529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch NonHostComponentTransform(unescape_rules), 7360529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch &url_string, &new_parsed->username, adjustments); 737a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (parsed.password.is_valid()) 738a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch url_string.push_back(':'); 7390529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch AppendFormattedComponent(spec, parsed.password, 7400529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch NonHostComponentTransform(unescape_rules), 7410529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch &url_string, &new_parsed->password, adjustments); 742a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (parsed.username.is_valid() || parsed.password.is_valid()) 743a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch url_string.push_back('@'); 744a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 745a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (prefix_end) 746a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch *prefix_end = static_cast<size_t>(url_string.length()); 747a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 748a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Host. 
7490529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch AppendFormattedComponent(spec, parsed.host, HostComponentTransform(languages), 7500529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch &url_string, &new_parsed->host, adjustments); 751a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 752a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Port. 753a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (parsed.port.is_nonempty()) { 754a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch url_string.push_back(':'); 755a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch new_parsed->port.begin = url_string.length(); 756a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch url_string.insert(url_string.end(), 757a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch spec.begin() + parsed.port.begin, 758a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch spec.begin() + parsed.port.end()); 759a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch new_parsed->port.len = url_string.length() - new_parsed->port.begin; 760a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } else { 761a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch new_parsed->port.reset(); 762a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 763a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 764a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Path & query. Both get the same general unescape & convert treatment. 
765a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (!(format_types & kFormatUrlOmitTrailingSlashOnBareHostname) || 766a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch !CanStripTrailingSlash(url)) { 7670529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch AppendFormattedComponent(spec, parsed.path, 7680529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch NonHostComponentTransform(unescape_rules), 7690529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch &url_string, &new_parsed->path, adjustments); 770a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } else { 7710529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch if (parsed.path.len > 0) { 7720529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch adjustments->push_back(base::OffsetAdjuster::Adjustment( 7730529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch parsed.path.begin, parsed.path.len, 0)); 7740529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch } 775a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 776a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (parsed.query.is_valid()) 777a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch url_string.push_back('?'); 7780529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch AppendFormattedComponent(spec, parsed.query, 7790529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch NonHostComponentTransform(unescape_rules), 7800529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch &url_string, &new_parsed->query, adjustments); 781a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 782a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Ref. This is valid, unescaped UTF-8, so we can just convert. 
783a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (parsed.ref.is_valid()) 784a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch url_string.push_back('#'); 7850529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch AppendFormattedComponent(spec, parsed.ref, 7860529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch NonHostComponentTransform(UnescapeRule::NONE), 7870529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch &url_string, &new_parsed->ref, adjustments); 788a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 7890529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch // If we need to strip out http do it after the fact. 790a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (omit_http && StartsWith(url_string, base::ASCIIToUTF16(kHTTP), true)) { 791a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const size_t kHTTPSize = arraysize(kHTTP) - 1; 792a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch url_string = url_string.substr(kHTTPSize); 7930529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch // Because offsets in the |adjustments| are already calculated with respect 7940529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch // to the string with the http:// prefix in it, those offsets remain correct 7950529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch // after stripping the prefix. The only thing necessary is to add an 7960529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch // adjustment to reflect the stripped prefix. 7970529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch adjustments->insert(adjustments->begin(), 7980529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch base::OffsetAdjuster::Adjustment(0, kHTTPSize, 0)); 7990529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch 800a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (prefix_end) 801a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch *prefix_end -= kHTTPSize; 802a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 803a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Adjust new_parsed. 
804a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch DCHECK(new_parsed->scheme.is_valid()); 805a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch int delta = -(new_parsed->scheme.len + 3); // +3 for ://. 806a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch new_parsed->scheme.reset(); 807a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch AdjustAllComponentsButScheme(delta, new_parsed); 808a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch } 809a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 810a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch return url_string; 811a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 812a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 813a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochbase::string16 FormatUrl(const GURL& url, 814a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch const std::string& languages, 815a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch FormatUrlTypes format_types, 816a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch UnescapeRule::Type unescape_rules, 8175c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu url::Parsed* new_parsed, 818a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch size_t* prefix_end, 819a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch size_t* offset_for_adjustment) { 820a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch Offsets offsets; 821a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (offset_for_adjustment) 822a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch offsets.push_back(*offset_for_adjustment); 823a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch base::string16 result = FormatUrlWithOffsets(url, languages, format_types, 824a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch unescape_rules, new_parsed, prefix_end, &offsets); 825a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch if (offset_for_adjustment) 826a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch *offset_for_adjustment = offsets[0]; 827a02191e04bc25c4935f804f2c080ae28663d096dBen 
Murdoch return result; 828a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} 829a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 830a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch} // namespace net 831