1// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "chrome/browser/safe_browsing/safe_browsing_util.h" 6 7#include "base/logging.h" 8#include "base/strings/string_util.h" 9#include "base/strings/stringprintf.h" 10#include "chrome/browser/google/google_util.h" 11#include "crypto/sha2.h" 12#include "net/base/escape.h" 13#include "url/gurl.h" 14#include "url/url_util.h" 15 16#if defined(OS_WIN) 17#include "chrome/installer/util/browser_distribution.h" 18#endif 19 20static const char kReportParams[] = "?tpl=%s&url=%s"; 21 22// SBChunk --------------------------------------------------------------------- 23 24SBChunk::SBChunk() 25 : chunk_number(0), 26 list_id(0), 27 is_add(false) { 28} 29 30SBChunk::~SBChunk() {} 31 32// SBChunkList ----------------------------------------------------------------- 33 34SBChunkList::SBChunkList() {} 35 36SBChunkList::~SBChunkList() { 37 clear(); 38} 39 40void SBChunkList::clear() { 41 for (std::vector<SBChunk>::iterator citer = chunks_.begin(); 42 citer != chunks_.end(); ++citer) { 43 for (std::deque<SBChunkHost>::iterator hiter = citer->hosts.begin(); 44 hiter != citer->hosts.end(); ++hiter) { 45 if (hiter->entry) { 46 hiter->entry->Destroy(); 47 hiter->entry = NULL; 48 } 49 } 50 } 51 chunks_.clear(); 52} 53 54// SBListChunkRanges ----------------------------------------------------------- 55 56SBListChunkRanges::SBListChunkRanges(const std::string& n) : name(n) {} 57 58// SBChunkDelete --------------------------------------------------------------- 59 60SBChunkDelete::SBChunkDelete() : is_sub_del(false) {} 61 62SBChunkDelete::~SBChunkDelete() {} 63 64// SBEntry --------------------------------------------------------------------- 65 66// static 67SBEntry* SBEntry::Create(Type type, int prefix_count) { 68 int size = Size(type, prefix_count); 69 SBEntry *rv = static_cast<SBEntry*>(malloc(size)); 70 memset(rv, 0, size); 71 rv->set_type(type); 72 rv->set_prefix_count(prefix_count); 73 return rv; 74} 75 76void SBEntry::Destroy() { 77 free(this); 78} 79 80// static 81int SBEntry::PrefixSize(Type type) { 82 switch (type) { 83 case ADD_PREFIX: 84 return sizeof(SBPrefix); 85 case ADD_FULL_HASH: 86 return sizeof(SBFullHash); 87 case SUB_PREFIX: 88 return sizeof(SBSubPrefix); 89 case SUB_FULL_HASH: 90 return sizeof(SBSubFullHash); 91 default: 92 NOTREACHED(); 93 return 0; 94 } 95} 96 97int SBEntry::Size() const { 98 return Size(type(), prefix_count()); 99} 100 101// static 102int SBEntry::Size(Type type, int prefix_count) { 103 return sizeof(Data) + prefix_count * PrefixSize(type); 104} 105 106int SBEntry::ChunkIdAtPrefix(int index) const { 107 if (type() == SUB_PREFIX) 108 return sub_prefixes_[index].add_chunk; 109 return (type() == SUB_FULL_HASH) ? 110 sub_full_hashes_[index].add_chunk : chunk_id(); 111} 112 113void SBEntry::SetChunkIdAtPrefix(int index, int chunk_id) { 114 DCHECK(IsSub()); 115 116 if (type() == SUB_PREFIX) 117 sub_prefixes_[index].add_chunk = chunk_id; 118 else 119 sub_full_hashes_[index].add_chunk = chunk_id; 120} 121 122const SBPrefix& SBEntry::PrefixAt(int index) const { 123 DCHECK(IsPrefix()); 124 125 return IsAdd() ? add_prefixes_[index] : sub_prefixes_[index].prefix; 126} 127 128const SBFullHash& SBEntry::FullHashAt(int index) const { 129 DCHECK(!IsPrefix()); 130 131 return IsAdd() ? add_full_hashes_[index] : sub_full_hashes_[index].prefix; 132} 133 134void SBEntry::SetPrefixAt(int index, const SBPrefix& prefix) { 135 DCHECK(IsPrefix()); 136 137 if (IsAdd()) 138 add_prefixes_[index] = prefix; 139 else 140 sub_prefixes_[index].prefix = prefix; 141} 142 143void SBEntry::SetFullHashAt(int index, const SBFullHash& full_hash) { 144 DCHECK(!IsPrefix()); 145 146 if (IsAdd()) 147 add_full_hashes_[index] = full_hash; 148 else 149 sub_full_hashes_[index].prefix = full_hash; 150} 151 152 153// Utility functions ----------------------------------------------------------- 154 155namespace { 156bool IsKnownList(const std::string& name) { 157 for (size_t i = 0; i < arraysize(safe_browsing_util::kAllLists); ++i) { 158 if (!strcmp(safe_browsing_util::kAllLists[i], name.c_str())) { 159 return true; 160 } 161 } 162 return false; 163} 164} // namespace 165 166namespace safe_browsing_util { 167 168// Listnames that browser can process. 169const char kMalwareList[] = "goog-malware-shavar"; 170const char kPhishingList[] = "goog-phish-shavar"; 171const char kBinUrlList[] = "goog-badbinurl-shavar"; 172// We don't use the bad binary digest list anymore. Use a fake listname to be 173// sure we don't request it accidentally. 174const char kBinHashList[] = "goog-badbin-digestvar-disabled"; 175const char kCsdWhiteList[] = "goog-csdwhite-sha256"; 176const char kDownloadWhiteList[] = "goog-downloadwhite-digest256"; 177const char kExtensionBlacklist[] = "goog-badcrxids-digestvar"; 178const char kSideEffectFreeWhitelist[] = "goog-sideeffectfree-shavar"; 179const char kIPBlacklist[] = "goog-badip-digest256"; 180 181const char* kAllLists[10] = { 182 kMalwareList, 183 kPhishingList, 184 kBinUrlList, 185 kBinHashList, 186 kCsdWhiteList, 187 kDownloadWhiteList, 188 kDownloadWhiteList, 189 kExtensionBlacklist, 190 kSideEffectFreeWhitelist, 191 kIPBlacklist, 192}; 193 194ListType GetListId(const std::string& name) { 195 ListType id; 196 if (name == safe_browsing_util::kMalwareList) { 197 id = MALWARE; 198 } else if (name == safe_browsing_util::kPhishingList) { 199 id = PHISH; 200 } else if (name == safe_browsing_util::kBinUrlList) { 201 id = BINURL; 202 } else if (name == safe_browsing_util::kBinHashList) { 203 id = BINHASH; 204 } else if (name == safe_browsing_util::kCsdWhiteList) { 205 id = CSDWHITELIST; 206 } else if (name == safe_browsing_util::kDownloadWhiteList) { 207 id = DOWNLOADWHITELIST; 208 } else if (name == safe_browsing_util::kExtensionBlacklist) { 209 id = EXTENSIONBLACKLIST; 210 } else if (name == safe_browsing_util::kSideEffectFreeWhitelist) { 211 id = SIDEEFFECTFREEWHITELIST; 212 } else if (name == safe_browsing_util::kIPBlacklist) { 213 id = IPBLACKLIST; 214 } else { 215 id = INVALID; 216 } 217 return id; 218} 219 220bool GetListName(ListType list_id, std::string* list) { 221 switch (list_id) { 222 case MALWARE: 223 *list = safe_browsing_util::kMalwareList; 224 break; 225 case PHISH: 226 *list = safe_browsing_util::kPhishingList; 227 break; 228 case BINURL: 229 *list = safe_browsing_util::kBinUrlList; 230 break; 231 case BINHASH: 232 *list = safe_browsing_util::kBinHashList; 233 break; 234 case CSDWHITELIST: 235 *list = safe_browsing_util::kCsdWhiteList; 236 break; 237 case DOWNLOADWHITELIST: 238 *list = safe_browsing_util::kDownloadWhiteList; 239 break; 240 case EXTENSIONBLACKLIST: 241 *list = safe_browsing_util::kExtensionBlacklist; 242 break; 243 case SIDEEFFECTFREEWHITELIST: 244 *list = safe_browsing_util::kSideEffectFreeWhitelist; 245 break; 246 case IPBLACKLIST: 247 *list = safe_browsing_util::kIPBlacklist; 248 break; 249 default: 250 return false; 251 } 252 DCHECK(IsKnownList(*list)); 253 return true; 254} 255 256std::string Unescape(const std::string& url) { 257 std::string unescaped_str(url); 258 std::string old_unescaped_str; 259 const int kMaxLoopIterations = 1024; 260 int loop_var = 0; 261 do { 262 old_unescaped_str = unescaped_str; 263 unescaped_str = net::UnescapeURLComponent(old_unescaped_str, 264 net::UnescapeRule::CONTROL_CHARS | net::UnescapeRule::SPACES | 265 net::UnescapeRule::URL_SPECIAL_CHARS); 266 } while (unescaped_str != old_unescaped_str && ++loop_var <= 267 kMaxLoopIterations); 268 269 return unescaped_str; 270} 271 272std::string Escape(const std::string& url) { 273 std::string escaped_str; 274 const char* kHexString = "0123456789ABCDEF"; 275 for (size_t i = 0; i < url.length(); i++) { 276 unsigned char c = static_cast<unsigned char>(url[i]); 277 if (c <= ' ' || c > '~' || c == '#' || c == '%') { 278 escaped_str.push_back('%'); 279 escaped_str.push_back(kHexString[c >> 4]); 280 escaped_str.push_back(kHexString[c & 0xf]); 281 } else { 282 escaped_str.push_back(c); 283 } 284 } 285 286 return escaped_str; 287} 288 289std::string RemoveConsecutiveChars(const std::string& str, const char c) { 290 std::string output(str); 291 std::string string_to_find; 292 std::string::size_type loc = 0; 293 string_to_find.append(2, c); 294 while ((loc = output.find(string_to_find, loc)) != std::string::npos) { 295 output.erase(loc, 1); 296 } 297 298 return output; 299} 300 301// Canonicalizes url as per Google Safe Browsing Specification. 302// See section 6.1 in 303// http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec. 304void CanonicalizeUrl(const GURL& url, 305 std::string* canonicalized_hostname, 306 std::string* canonicalized_path, 307 std::string* canonicalized_query) { 308 DCHECK(url.is_valid()); 309 310 // We only canonicalize "normal" URLs. 311 if (!url.IsStandard()) 312 return; 313 314 // Following canonicalization steps are excluded since url parsing takes care 315 // of those :- 316 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url. 317 // (Exclude escaped version of these chars). 318 // 2. Normalize hostname to 4 dot-seperated decimal values. 319 // 3. Lowercase hostname. 320 // 4. Resolve path sequences "/../" and "/./". 321 322 // That leaves us with the following :- 323 // 1. Remove fragment in URL. 324 GURL url_without_fragment; 325 GURL::Replacements f_replacements; 326 f_replacements.ClearRef(); 327 f_replacements.ClearUsername(); 328 f_replacements.ClearPassword(); 329 url_without_fragment = url.ReplaceComponents(f_replacements); 330 331 // 2. Do URL unescaping until no more hex encoded characters exist. 332 std::string url_unescaped_str(Unescape(url_without_fragment.spec())); 333 url_parse::Parsed parsed; 334 url_parse::ParseStandardURL(url_unescaped_str.data(), 335 url_unescaped_str.length(), &parsed); 336 337 // 3. In hostname, remove all leading and trailing dots. 338 const std::string host = 339 (parsed.host.len > 0) 340 ? url_unescaped_str.substr(parsed.host.begin, parsed.host.len) 341 : std::string(); 342 const char kCharsToTrim[] = "."; 343 std::string host_without_end_dots; 344 base::TrimString(host, kCharsToTrim, &host_without_end_dots); 345 346 // 4. In hostname, replace consecutive dots with a single dot. 347 std::string host_without_consecutive_dots(RemoveConsecutiveChars( 348 host_without_end_dots, '.')); 349 350 // 5. In path, replace runs of consecutive slashes with a single slash. 351 std::string path = 352 (parsed.path.len > 0) 353 ? url_unescaped_str.substr(parsed.path.begin, parsed.path.len) 354 : std::string(); 355 std::string path_without_consecutive_slash(RemoveConsecutiveChars(path, '/')); 356 357 url_canon::Replacements<char> hp_replacements; 358 hp_replacements.SetHost(host_without_consecutive_dots.data(), 359 url_parse::Component(0, host_without_consecutive_dots.length())); 360 hp_replacements.SetPath(path_without_consecutive_slash.data(), 361 url_parse::Component(0, path_without_consecutive_slash.length())); 362 363 std::string url_unescaped_with_can_hostpath; 364 url_canon::StdStringCanonOutput output(&url_unescaped_with_can_hostpath); 365 url_parse::Parsed temp_parsed; 366 url_util::ReplaceComponents(url_unescaped_str.data(), 367 url_unescaped_str.length(), parsed, 368 hp_replacements, NULL, &output, &temp_parsed); 369 output.Complete(); 370 371 // 6. Step needed to revert escaping done in url_util::ReplaceComponents. 372 url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath); 373 374 // 7. After performing all above steps, percent-escape all chars in url which 375 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters. 376 std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath)); 377 url_parse::Parsed final_parsed; 378 url_parse::ParseStandardURL(escaped_canon_url_str.data(), 379 escaped_canon_url_str.length(), &final_parsed); 380 381 if (canonicalized_hostname && final_parsed.host.len > 0) { 382 *canonicalized_hostname = 383 escaped_canon_url_str.substr(final_parsed.host.begin, 384 final_parsed.host.len); 385 } 386 if (canonicalized_path && final_parsed.path.len > 0) { 387 *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin, 388 final_parsed.path.len); 389 } 390 if (canonicalized_query && final_parsed.query.len > 0) { 391 *canonicalized_query = escaped_canon_url_str.substr( 392 final_parsed.query.begin, final_parsed.query.len); 393 } 394} 395 396void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) { 397 hosts->clear(); 398 399 std::string canon_host; 400 CanonicalizeUrl(url, &canon_host, NULL, NULL); 401 402 const std::string host = canon_host; // const sidesteps GCC bugs below! 403 if (host.empty()) 404 return; 405 406 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4 407 // hostnames formed by starting with the last 5 components and successively 408 // removing the leading component. The last component isn't examined alone, 409 // since it's the TLD or a subcomponent thereof. 410 // 411 // Note that we don't need to be clever about stopping at the "real" eTLD -- 412 // the data on the server side has been filtered to ensure it will not 413 // blacklist a whole TLD, and it's not significantly slower on our side to 414 // just check too much. 415 // 416 // Also note that because we have a simple blacklist, not some sort of complex 417 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check 418 // these in. 419 const size_t kMaxHostsToCheck = 4; 420 bool skipped_last_component = false; 421 for (std::string::const_reverse_iterator i(host.rbegin()); 422 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) { 423 if (*i == '.') { 424 if (skipped_last_component) 425 hosts->push_back(std::string(i.base(), host.end())); 426 else 427 skipped_last_component = true; 428 } 429 } 430 hosts->push_back(host); 431} 432 433void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) { 434 paths->clear(); 435 436 std::string canon_path; 437 std::string canon_query; 438 CanonicalizeUrl(url, NULL, &canon_path, &canon_query); 439 440 const std::string path = canon_path; // const sidesteps GCC bugs below! 441 const std::string query = canon_query; 442 if (path.empty()) 443 return; 444 445 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without 446 // the query parameters, and also up to 4 paths formed by starting at the root 447 // and adding more path components. 448 // 449 // As with the hosts above, it doesn't matter what order we check these in. 450 const size_t kMaxPathsToCheck = 4; 451 for (std::string::const_iterator i(path.begin()); 452 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) { 453 if (*i == '/') 454 paths->push_back(std::string(path.begin(), i + 1)); 455 } 456 457 if (!paths->empty() && paths->back() != path) 458 paths->push_back(path); 459 460 if (!query.empty()) 461 paths->push_back(path + "?" + query); 462} 463 464void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) { 465 std::vector<std::string> hosts, paths; 466 GenerateHostsToCheck(url, &hosts); 467 GeneratePathsToCheck(url, &paths); 468 for (size_t h = 0; h < hosts.size(); ++h) { 469 for (size_t p = 0; p < paths.size(); ++p) { 470 urls->push_back(hosts[h] + paths[p]); 471 } 472 } 473} 474 475int GetHashIndex(const SBFullHash& hash, 476 const std::vector<SBFullHashResult>& full_hashes) { 477 for (size_t i = 0; i < full_hashes.size(); ++i) { 478 if (hash == full_hashes[i].hash) 479 return static_cast<int>(i); 480 } 481 return -1; 482} 483 484int GetUrlHashIndex(const GURL& url, 485 const std::vector<SBFullHashResult>& full_hashes) { 486 if (full_hashes.empty()) 487 return -1; 488 489 std::vector<std::string> patterns; 490 GeneratePatternsToCheck(url, &patterns); 491 492 for (size_t i = 0; i < patterns.size(); ++i) { 493 SBFullHash key; 494 crypto::SHA256HashString(patterns[i], key.full_hash, sizeof(SBFullHash)); 495 int index = GetHashIndex(key, full_hashes); 496 if (index != -1) 497 return index; 498 } 499 return -1; 500} 501 502bool IsPhishingList(const std::string& list_name) { 503 return list_name.compare(kPhishingList) == 0; 504} 505 506bool IsMalwareList(const std::string& list_name) { 507 return list_name.compare(kMalwareList) == 0; 508} 509 510bool IsBadbinurlList(const std::string& list_name) { 511 return list_name.compare(kBinUrlList) == 0; 512} 513 514bool IsBadbinhashList(const std::string& list_name) { 515 return list_name.compare(kBinHashList) == 0; 516} 517 518bool IsExtensionList(const std::string& list_name) { 519 return list_name.compare(kExtensionBlacklist) == 0; 520} 521 522GURL GeneratePhishingReportUrl(const std::string& report_page, 523 const std::string& url_to_report, 524 bool is_client_side_detection) { 525 const std::string current_esc = net::EscapeQueryParamValue(url_to_report, 526 true); 527 528#if defined(OS_WIN) 529 BrowserDistribution* dist = BrowserDistribution::GetDistribution(); 530 std::string client_name(dist->GetSafeBrowsingName()); 531#else 532 std::string client_name("googlechrome"); 533#endif 534 if (is_client_side_detection) 535 client_name.append("_csd"); 536 537 GURL report_url(report_page + base::StringPrintf(kReportParams, 538 client_name.c_str(), 539 current_esc.c_str())); 540 return google_util::AppendGoogleLocaleParam(report_url); 541} 542 543SBFullHash StringToSBFullHash(const std::string& hash_in) { 544 DCHECK_EQ(crypto::kSHA256Length, hash_in.size()); 545 SBFullHash hash_out; 546 memcpy(hash_out.full_hash, hash_in.data(), crypto::kSHA256Length); 547 return hash_out; 548} 549 550std::string SBFullHashToString(const SBFullHash& hash) { 551 DCHECK_EQ(crypto::kSHA256Length, sizeof(hash.full_hash)); 552 return std::string(hash.full_hash, sizeof(hash.full_hash)); 553} 554 555} // namespace safe_browsing_util 556