1// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h" 6 7#include <algorithm> 8#include <string> 9#include <vector> 10 11#include "base/logging.h" 12#include "base/metrics/histogram.h" 13#include "base/strings/string_split.h" 14#include "base/strings/string_util.h" 15#include "base/timer/elapsed_timer.h" 16#include "chrome/renderer/safe_browsing/features.h" 17#include "net/base/registry_controlled_domains/registry_controlled_domain.h" 18#include "url/gurl.h" 19 20namespace safe_browsing { 21 22PhishingUrlFeatureExtractor::PhishingUrlFeatureExtractor() {} 23 24PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {} 25 26bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL& url, 27 FeatureMap* features) { 28 base::ElapsedTimer timer; 29 if (url.HostIsIPAddress()) { 30 if (!features->AddBooleanFeature(features::kUrlHostIsIpAddress)) 31 return false; 32 } else { 33 // Remove any leading/trailing dots. 34 std::string host; 35 base::TrimString(url.host(), ".", &host); 36 37 // TODO(bryner): Ensure that the url encoding is consistent with 38 // the features in the model. 39 40 // Disallow unknown registries so that we don't classify 41 // partial hostnames (e.g. "www.subdomain"). 42 size_t registry_length = 43 net::registry_controlled_domains::GetRegistryLength( 44 host, 45 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES, 46 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); 47 48 if (registry_length == 0 || registry_length == std::string::npos) { 49 DVLOG(1) << "Could not find TLD for host: " << host; 50 return false; 51 } 52 DCHECK_LT(registry_length, host.size()) << "Non-zero registry length, but " 53 "host is only a TLD: " << host; 54 size_t tld_start = host.size() - registry_length; 55 if (!features->AddBooleanFeature(features::kUrlTldToken + 56 host.substr(tld_start))) 57 return false; 58 59 // Pull off the TLD and the preceeding dot. 60 host.erase(tld_start - 1); 61 std::vector<std::string> host_tokens; 62 base::SplitStringDontTrim(host, '.', &host_tokens); 63 // Get rid of any empty components. 64 std::vector<std::string>::iterator new_end = 65 std::remove(host_tokens.begin(), host_tokens.end(), ""); 66 host_tokens.erase(new_end, host_tokens.end()); 67 if (host_tokens.empty()) { 68 DVLOG(1) << "Could not find domain for host: " << host; 69 return false; 70 } 71 if (!features->AddBooleanFeature(features::kUrlDomainToken + 72 host_tokens.back())) 73 return false; 74 host_tokens.pop_back(); 75 76 // Now we're just left with the "other" host tokens. 77 for (std::vector<std::string>::iterator it = host_tokens.begin(); 78 it != host_tokens.end(); ++it) { 79 if (!features->AddBooleanFeature(features::kUrlOtherHostToken + *it)) 80 return false; 81 } 82 83 if (host_tokens.size() > 1) { 84 if (!features->AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne)) 85 return false; 86 if (host_tokens.size() > 3) { 87 if (!features->AddBooleanFeature( 88 features::kUrlNumOtherHostTokensGTThree)) 89 return false; 90 } 91 } 92 } 93 94 std::vector<std::string> long_tokens; 95 SplitStringIntoLongAlphanumTokens(url.path(), &long_tokens); 96 for (std::vector<std::string>::iterator it = long_tokens.begin(); 97 it != long_tokens.end(); ++it) { 98 if (!features->AddBooleanFeature(features::kUrlPathToken + *it)) 99 return false; 100 } 101 102 UMA_HISTOGRAM_TIMES("SBClientPhishing.URLFeatureTime", timer.Elapsed()); 103 return true; 104} 105 106// static 107void PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens( 108 const std::string& full, 109 std::vector<std::string>* tokens) { 110 // Split on common non-alphanumerics. 111 // TODO(bryner): Split on all(?) non-alphanumerics and handle %XX properly. 112 static const char kTokenSeparators[] = ".,\\/_-|=%:!&"; 113 std::vector<std::string> raw_splits; 114 Tokenize(full, kTokenSeparators, &raw_splits); 115 116 // Copy over only the splits that are 3 or more chars long. 117 // TODO(bryner): Determine a meaningful min size. 118 for (std::vector<std::string>::iterator it = raw_splits.begin(); 119 it != raw_splits.end(); ++it) { 120 if (it->length() >= kMinPathComponentLength) 121 tokens->push_back(*it); 122 } 123} 124 125} // namespace safe_browsing 126