1// Copyright (c) 2011 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "chrome/browser/safe_browsing/safe_browsing_util.h" 6 7#include "base/base64.h" 8#include "base/string_util.h" 9#include "crypto/hmac.h" 10#include "crypto/sha2.h" 11#include "chrome/browser/google/google_util.h" 12#include "googleurl/src/gurl.h" 13#include "googleurl/src/url_util.h" 14#include "net/base/escape.h" 15#include "unicode/locid.h" 16 17#if defined(OS_WIN) 18#include "chrome/installer/util/browser_distribution.h" 19#endif 20 21static const int kSafeBrowsingMacDigestSize = 20; 22 23// Continue to this URL after submitting the phishing report form. 24// TODO(paulg): Change to a Chrome specific URL. 25static const char kContinueUrlFormat[] = 26 "http://www.google.com/tools/firefox/toolbar/FT2/intl/%s/submit_success.html"; 27 28static const char kReportParams[] = "?tpl=%s&continue=%s&url=%s"; 29 30// SBChunk --------------------------------------------------------------------- 31 32SBChunk::SBChunk() 33 : chunk_number(0), 34 list_id(0), 35 is_add(false) { 36} 37 38SBChunk::~SBChunk() {} 39 40// SBChunkList ----------------------------------------------------------------- 41 42SBChunkList::SBChunkList() {} 43 44SBChunkList::~SBChunkList() { 45 clear(); 46} 47 48void SBChunkList::clear() { 49 for (std::vector<SBChunk>::iterator citer = chunks_.begin(); 50 citer != chunks_.end(); ++citer) { 51 for (std::deque<SBChunkHost>::iterator hiter = citer->hosts.begin(); 52 hiter != citer->hosts.end(); ++hiter) { 53 if (hiter->entry) { 54 hiter->entry->Destroy(); 55 hiter->entry = NULL; 56 } 57 } 58 } 59 chunks_.clear(); 60} 61 62// SBListChunkRanges ----------------------------------------------------------- 63 64SBListChunkRanges::SBListChunkRanges(const std::string& n) : name(n) {} 65 66// SBChunkDelete --------------------------------------------------------------- 67 68SBChunkDelete::SBChunkDelete() : is_sub_del(false) {} 69 70SBChunkDelete::~SBChunkDelete() {} 71 72// SBEntry --------------------------------------------------------------------- 73 74// static 75SBEntry* SBEntry::Create(Type type, int prefix_count) { 76 int size = Size(type, prefix_count); 77 SBEntry *rv = static_cast<SBEntry*>(malloc(size)); 78 memset(rv, 0, size); 79 rv->set_type(type); 80 rv->set_prefix_count(prefix_count); 81 return rv; 82} 83 84void SBEntry::Destroy() { 85 free(this); 86} 87 88// static 89int SBEntry::PrefixSize(Type type) { 90 switch (type) { 91 case ADD_PREFIX: 92 return sizeof(SBPrefix); 93 case ADD_FULL_HASH: 94 return sizeof(SBFullHash); 95 case SUB_PREFIX: 96 return sizeof(SBSubPrefix); 97 case SUB_FULL_HASH: 98 return sizeof(SBSubFullHash); 99 default: 100 NOTREACHED(); 101 return 0; 102 } 103} 104 105int SBEntry::Size() const { 106 return Size(type(), prefix_count()); 107} 108 109// static 110int SBEntry::Size(Type type, int prefix_count) { 111 return sizeof(Data) + prefix_count * PrefixSize(type); 112} 113 114int SBEntry::ChunkIdAtPrefix(int index) const { 115 if (type() == SUB_PREFIX) 116 return sub_prefixes_[index].add_chunk; 117 return (type() == SUB_FULL_HASH) ? 118 sub_full_hashes_[index].add_chunk : chunk_id(); 119} 120 121void SBEntry::SetChunkIdAtPrefix(int index, int chunk_id) { 122 DCHECK(IsSub()); 123 124 if (type() == SUB_PREFIX) 125 sub_prefixes_[index].add_chunk = chunk_id; 126 else 127 sub_full_hashes_[index].add_chunk = chunk_id; 128} 129 130const SBPrefix& SBEntry::PrefixAt(int index) const { 131 DCHECK(IsPrefix()); 132 133 return IsAdd() ? add_prefixes_[index] : sub_prefixes_[index].prefix; 134} 135 136const SBFullHash& SBEntry::FullHashAt(int index) const { 137 DCHECK(!IsPrefix()); 138 139 return IsAdd() ? add_full_hashes_[index] : sub_full_hashes_[index].prefix; 140} 141 142void SBEntry::SetPrefixAt(int index, const SBPrefix& prefix) { 143 DCHECK(IsPrefix()); 144 145 if (IsAdd()) 146 add_prefixes_[index] = prefix; 147 else 148 sub_prefixes_[index].prefix = prefix; 149} 150 151void SBEntry::SetFullHashAt(int index, const SBFullHash& full_hash) { 152 DCHECK(!IsPrefix()); 153 154 if (IsAdd()) 155 add_full_hashes_[index] = full_hash; 156 else 157 sub_full_hashes_[index].prefix = full_hash; 158} 159 160 161// Utility functions ----------------------------------------------------------- 162 163namespace safe_browsing_util { 164 165// Listnames that browser can process. 166const char kMalwareList[] = "goog-malware-shavar"; 167const char kPhishingList[] = "goog-phish-shavar"; 168const char kBinUrlList[] = "goog-badbinurl-shavar"; 169const char kBinHashList[] = "goog-badbin-digestvar"; 170const char kCsdWhiteList[] = "goog-csdwhite-sha256"; 171 172int GetListId(const std::string& name) { 173 int id; 174 if (name == safe_browsing_util::kMalwareList) { 175 id = MALWARE; 176 } else if (name == safe_browsing_util::kPhishingList) { 177 id = PHISH; 178 } else if (name == safe_browsing_util::kBinUrlList) { 179 id = BINURL; 180 } else if (name == safe_browsing_util::kBinHashList) { 181 id = BINHASH; 182 } else if (name == safe_browsing_util::kCsdWhiteList) { 183 id = CSDWHITELIST; 184 } else { 185 id = INVALID; 186 } 187 return id; 188} 189 190bool GetListName(int list_id, std::string* list) { 191 switch (list_id) { 192 case MALWARE: 193 *list = safe_browsing_util::kMalwareList; 194 break; 195 case PHISH: 196 *list = safe_browsing_util::kPhishingList; 197 break; 198 case BINURL: 199 *list = safe_browsing_util::kBinUrlList; 200 break; 201 case BINHASH: 202 *list = safe_browsing_util::kBinHashList; 203 break; 204 case CSDWHITELIST: 205 *list = safe_browsing_util::kCsdWhiteList; 206 break; 207 default: 208 return false; 209 } 210 return true; 211} 212 213std::string Unescape(const std::string& url) { 214 std::string unescaped_str(url); 215 std::string old_unescaped_str; 216 const int kMaxLoopIterations = 1024; 217 int loop_var = 0; 218 do { 219 old_unescaped_str = unescaped_str; 220 unescaped_str = UnescapeURLComponent(old_unescaped_str, 221 UnescapeRule::CONTROL_CHARS | UnescapeRule::SPACES | 222 UnescapeRule::URL_SPECIAL_CHARS); 223 } while (unescaped_str != old_unescaped_str && ++loop_var <= 224 kMaxLoopIterations); 225 226 return unescaped_str; 227} 228 229std::string Escape(const std::string& url) { 230 std::string escaped_str; 231 const char* kHexString = "0123456789ABCDEF"; 232 for (size_t i = 0; i < url.length(); i++) { 233 unsigned char c = static_cast<unsigned char>(url[i]); 234 if (c <= ' ' || c > '~' || c == '#' || c == '%') { 235 escaped_str.push_back('%'); 236 escaped_str.push_back(kHexString[c >> 4]); 237 escaped_str.push_back(kHexString[c & 0xf]); 238 } else { 239 escaped_str.push_back(c); 240 } 241 } 242 243 return escaped_str; 244} 245 246std::string RemoveConsecutiveChars(const std::string& str, const char c) { 247 std::string output(str); 248 std::string string_to_find; 249 std::string::size_type loc = 0; 250 string_to_find.append(2, c); 251 while ((loc = output.find(string_to_find, loc)) != std::string::npos) { 252 output.erase(loc, 1); 253 } 254 255 return output; 256} 257 258// Canonicalizes url as per Google Safe Browsing Specification. 259// See section 6.1 in 260// http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec. 261void CanonicalizeUrl(const GURL& url, 262 std::string* canonicalized_hostname, 263 std::string* canonicalized_path, 264 std::string* canonicalized_query) { 265 DCHECK(url.is_valid()); 266 267 // We only canonicalize "normal" URLs. 268 if (!url.IsStandard()) 269 return; 270 271 // Following canonicalization steps are excluded since url parsing takes care 272 // of those :- 273 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url. 274 // (Exclude escaped version of these chars). 275 // 2. Normalize hostname to 4 dot-seperated decimal values. 276 // 3. Lowercase hostname. 277 // 4. Resolve path sequences "/../" and "/./". 278 279 // That leaves us with the following :- 280 // 1. Remove fragment in URL. 281 GURL url_without_fragment; 282 GURL::Replacements f_replacements; 283 f_replacements.ClearRef(); 284 f_replacements.ClearUsername(); 285 f_replacements.ClearPassword(); 286 url_without_fragment = url.ReplaceComponents(f_replacements); 287 288 // 2. Do URL unescaping until no more hex encoded characters exist. 289 std::string url_unescaped_str(Unescape(url_without_fragment.spec())); 290 url_parse::Parsed parsed; 291 url_parse::ParseStandardURL(url_unescaped_str.data(), 292 url_unescaped_str.length(), &parsed); 293 294 // 3. In hostname, remove all leading and trailing dots. 295 const std::string host = (parsed.host.len > 0) ? url_unescaped_str.substr( 296 parsed.host.begin, parsed.host.len) : ""; 297 const char kCharsToTrim[] = "."; 298 std::string host_without_end_dots; 299 TrimString(host, kCharsToTrim, &host_without_end_dots); 300 301 // 4. In hostname, replace consecutive dots with a single dot. 302 std::string host_without_consecutive_dots(RemoveConsecutiveChars( 303 host_without_end_dots, '.')); 304 305 // 5. In path, replace runs of consecutive slashes with a single slash. 306 std::string path = (parsed.path.len > 0) ? url_unescaped_str.substr( 307 parsed.path.begin, parsed.path.len): ""; 308 std::string path_without_consecutive_slash(RemoveConsecutiveChars( 309 path, '/')); 310 311 url_canon::Replacements<char> hp_replacements; 312 hp_replacements.SetHost(host_without_consecutive_dots.data(), 313 url_parse::Component(0, host_without_consecutive_dots.length())); 314 hp_replacements.SetPath(path_without_consecutive_slash.data(), 315 url_parse::Component(0, path_without_consecutive_slash.length())); 316 317 std::string url_unescaped_with_can_hostpath; 318 url_canon::StdStringCanonOutput output(&url_unescaped_with_can_hostpath); 319 url_parse::Parsed temp_parsed; 320 url_util::ReplaceComponents(url_unescaped_str.data(), 321 url_unescaped_str.length(), parsed, 322 hp_replacements, NULL, &output, &temp_parsed); 323 output.Complete(); 324 325 // 6. Step needed to revert escaping done in url_util::ReplaceComponents. 326 url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath); 327 328 // 7. After performing all above steps, percent-escape all chars in url which 329 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters. 330 std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath)); 331 url_parse::Parsed final_parsed; 332 url_parse::ParseStandardURL(escaped_canon_url_str.data(), 333 escaped_canon_url_str.length(), &final_parsed); 334 335 if (canonicalized_hostname && final_parsed.host.len > 0) { 336 *canonicalized_hostname = 337 escaped_canon_url_str.substr(final_parsed.host.begin, 338 final_parsed.host.len); 339 } 340 if (canonicalized_path && final_parsed.path.len > 0) { 341 *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin, 342 final_parsed.path.len); 343 } 344 if (canonicalized_query && final_parsed.query.len > 0) { 345 *canonicalized_query = escaped_canon_url_str.substr( 346 final_parsed.query.begin, final_parsed.query.len); 347 } 348} 349 350void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) { 351 hosts->clear(); 352 353 std::string canon_host; 354 CanonicalizeUrl(url, &canon_host, NULL, NULL); 355 356 const std::string host = canon_host; // const sidesteps GCC bugs below! 357 if (host.empty()) 358 return; 359 360 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4 361 // hostnames formed by starting with the last 5 components and successively 362 // removing the leading component. The last component isn't examined alone, 363 // since it's the TLD or a subcomponent thereof. 364 // 365 // Note that we don't need to be clever about stopping at the "real" eTLD -- 366 // the data on the server side has been filtered to ensure it will not 367 // blacklist a whole TLD, and it's not significantly slower on our side to 368 // just check too much. 369 // 370 // Also note that because we have a simple blacklist, not some sort of complex 371 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check 372 // these in. 373 const size_t kMaxHostsToCheck = 4; 374 bool skipped_last_component = false; 375 for (std::string::const_reverse_iterator i(host.rbegin()); 376 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) { 377 if (*i == '.') { 378 if (skipped_last_component) 379 hosts->push_back(std::string(i.base(), host.end())); 380 else 381 skipped_last_component = true; 382 } 383 } 384 hosts->push_back(host); 385} 386 387void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) { 388 paths->clear(); 389 390 std::string canon_path; 391 std::string canon_query; 392 CanonicalizeUrl(url, NULL, &canon_path, &canon_query); 393 394 const std::string path = canon_path; // const sidesteps GCC bugs below! 395 const std::string query = canon_query; 396 if (path.empty()) 397 return; 398 399 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without 400 // the query parameters, and also up to 4 paths formed by starting at the root 401 // and adding more path components. 402 // 403 // As with the hosts above, it doesn't matter what order we check these in. 404 const size_t kMaxPathsToCheck = 4; 405 for (std::string::const_iterator i(path.begin()); 406 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) { 407 if (*i == '/') 408 paths->push_back(std::string(path.begin(), i + 1)); 409 } 410 411 if (!paths->empty() && paths->back() != path) 412 paths->push_back(path); 413 414 if (!query.empty()) 415 paths->push_back(path + "?" + query); 416} 417 418int GetHashIndex(const SBFullHash& hash, 419 const std::vector<SBFullHashResult>& full_hashes) { 420 for (size_t i = 0; i < full_hashes.size(); ++i) { 421 if (hash == full_hashes[i].hash) 422 return static_cast<int>(i); 423 } 424 return -1; 425} 426 427int GetUrlHashIndex(const GURL& url, 428 const std::vector<SBFullHashResult>& full_hashes) { 429 if (full_hashes.empty()) 430 return -1; 431 432 std::vector<std::string> hosts, paths; 433 GenerateHostsToCheck(url, &hosts); 434 GeneratePathsToCheck(url, &paths); 435 436 for (size_t h = 0; h < hosts.size(); ++h) { 437 for (size_t p = 0; p < paths.size(); ++p) { 438 SBFullHash key; 439 crypto::SHA256HashString(hosts[h] + paths[p], 440 key.full_hash, 441 sizeof(SBFullHash)); 442 int index = GetHashIndex(key, full_hashes); 443 if (index != -1) return index; 444 } 445 } 446 447 return -1; 448} 449 450bool IsPhishingList(const std::string& list_name) { 451 return list_name.compare(kPhishingList) == 0; 452} 453 454bool IsMalwareList(const std::string& list_name) { 455 return list_name.compare(kMalwareList) == 0; 456} 457 458bool IsBadbinurlList(const std::string& list_name) { 459 return list_name.compare(kBinUrlList) == 0; 460} 461 462bool IsBadbinhashList(const std::string& list_name) { 463 return list_name.compare(kBinHashList) == 0; 464} 465 466static void DecodeWebSafe(std::string* decoded) { 467 DCHECK(decoded); 468 for (std::string::iterator i(decoded->begin()); i != decoded->end(); ++i) { 469 if (*i == '_') 470 *i = '/'; 471 else if (*i == '-') 472 *i = '+'; 473 } 474} 475 476bool VerifyMAC(const std::string& key, const std::string& mac, 477 const char* data, int data_length) { 478 std::string key_copy = key; 479 DecodeWebSafe(&key_copy); 480 std::string decoded_key; 481 base::Base64Decode(key_copy, &decoded_key); 482 483 std::string mac_copy = mac; 484 DecodeWebSafe(&mac_copy); 485 std::string decoded_mac; 486 base::Base64Decode(mac_copy, &decoded_mac); 487 488 crypto::HMAC hmac(crypto::HMAC::SHA1); 489 if (!hmac.Init(decoded_key)) 490 return false; 491 const std::string data_str(data, data_length); 492 unsigned char digest[kSafeBrowsingMacDigestSize]; 493 if (!hmac.Sign(data_str, digest, kSafeBrowsingMacDigestSize)) 494 return false; 495 496 return !memcmp(digest, decoded_mac.data(), kSafeBrowsingMacDigestSize); 497} 498 499GURL GeneratePhishingReportUrl(const std::string& report_page, 500 const std::string& url_to_report) { 501 icu::Locale locale = icu::Locale::getDefault(); 502 const char* lang = locale.getLanguage(); 503 if (!lang) 504 lang = "en"; // fallback 505 const std::string continue_esc = 506 EscapeQueryParamValue(StringPrintf(kContinueUrlFormat, lang), true); 507 const std::string current_esc = EscapeQueryParamValue(url_to_report, true); 508 509#if defined(OS_WIN) 510 BrowserDistribution* dist = BrowserDistribution::GetDistribution(); 511 std::string client_name(dist->GetSafeBrowsingName()); 512#else 513 std::string client_name("googlechrome"); 514#endif 515 516 GURL report_url(report_page + 517 StringPrintf(kReportParams, client_name.c_str(), continue_esc.c_str(), 518 current_esc.c_str())); 519 return google_util::AppendGoogleLocaleParam(report_url); 520} 521 522void StringToSBFullHash(const std::string& hash_in, SBFullHash* hash_out) { 523 DCHECK_EQ(static_cast<size_t>(crypto::SHA256_LENGTH), hash_in.size()); 524 memcpy(hash_out->full_hash, hash_in.data(), crypto::SHA256_LENGTH); 525} 526 527std::string SBFullHashToString(const SBFullHash& hash) { 528 DCHECK_EQ(static_cast<size_t>(crypto::SHA256_LENGTH), sizeof(hash.full_hash)); 529 return std::string(hash.full_hash, sizeof(hash.full_hash)); 530} 531} // namespace safe_browsing_util 532