phishing_url_feature_extractor.cc revision 2a99a7e74a7f215066514fe81d2bfa6639d9eddd
1// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h" 6 7#include <algorithm> 8#include <string> 9#include <vector> 10 11#include "base/logging.h" 12#include "base/metrics/histogram.h" 13#include "base/perftimer.h" 14#include "base/string_util.h" 15#include "base/strings/string_split.h" 16#include "chrome/renderer/safe_browsing/features.h" 17#include "googleurl/src/gurl.h" 18#include "net/base/registry_controlled_domains/registry_controlled_domain.h" 19 20namespace safe_browsing { 21 22PhishingUrlFeatureExtractor::PhishingUrlFeatureExtractor() {} 23 24PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {} 25 26bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL& url, 27 FeatureMap* features) { 28 PerfTimer timer; 29 if (url.HostIsIPAddress()) { 30 if (!features->AddBooleanFeature(features::kUrlHostIsIpAddress)) 31 return false; 32 } else { 33 std::string host; 34 TrimString(url.host(), ".", &host); // Remove any leading/trailing dots. 35 36 // TODO(bryner): Ensure that the url encoding is consistent with 37 // the features in the model. 38 39 // Disallow unknown registries so that we don't classify 40 // partial hostnames (e.g. "www.subdomain"). 41 size_t registry_length = 42 net::RegistryControlledDomainService::GetRegistryLength(host, false); 43 44 if (registry_length == 0 || registry_length == std::string::npos) { 45 DVLOG(1) << "Could not find TLD for host: " << host; 46 return false; 47 } 48 DCHECK_LT(registry_length, host.size()) << "Non-zero registry length, but " 49 "host is only a TLD: " << host; 50 size_t tld_start = host.size() - registry_length; 51 if (!features->AddBooleanFeature(features::kUrlTldToken + 52 host.substr(tld_start))) 53 return false; 54 55 // Pull off the TLD and the preceeding dot. 56 host.erase(tld_start - 1); 57 std::vector<std::string> host_tokens; 58 base::SplitStringDontTrim(host, '.', &host_tokens); 59 // Get rid of any empty components. 60 std::vector<std::string>::iterator new_end = 61 std::remove(host_tokens.begin(), host_tokens.end(), ""); 62 host_tokens.erase(new_end, host_tokens.end()); 63 if (host_tokens.empty()) { 64 DVLOG(1) << "Could not find domain for host: " << host; 65 return false; 66 } 67 if (!features->AddBooleanFeature(features::kUrlDomainToken + 68 host_tokens.back())) 69 return false; 70 host_tokens.pop_back(); 71 72 // Now we're just left with the "other" host tokens. 73 for (std::vector<std::string>::iterator it = host_tokens.begin(); 74 it != host_tokens.end(); ++it) { 75 if (!features->AddBooleanFeature(features::kUrlOtherHostToken + *it)) 76 return false; 77 } 78 79 if (host_tokens.size() > 1) { 80 if (!features->AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne)) 81 return false; 82 if (host_tokens.size() > 3) { 83 if (!features->AddBooleanFeature( 84 features::kUrlNumOtherHostTokensGTThree)) 85 return false; 86 } 87 } 88 } 89 90 std::vector<std::string> long_tokens; 91 SplitStringIntoLongAlphanumTokens(url.path(), &long_tokens); 92 for (std::vector<std::string>::iterator it = long_tokens.begin(); 93 it != long_tokens.end(); ++it) { 94 if (!features->AddBooleanFeature(features::kUrlPathToken + *it)) 95 return false; 96 } 97 98 UMA_HISTOGRAM_TIMES("SBClientPhishing.URLFeatureTime", timer.Elapsed()); 99 return true; 100} 101 102// static 103void PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens( 104 const std::string& full, 105 std::vector<std::string>* tokens) { 106 // Split on common non-alphanumerics. 107 // TODO(bryner): Split on all(?) non-alphanumerics and handle %XX properly. 108 static const char kTokenSeparators[] = ".,\\/_-|=%:!&"; 109 std::vector<std::string> raw_splits; 110 Tokenize(full, kTokenSeparators, &raw_splits); 111 112 // Copy over only the splits that are 3 or more chars long. 113 // TODO(bryner): Determine a meaningful min size. 114 for (std::vector<std::string>::iterator it = raw_splits.begin(); 115 it != raw_splits.end(); ++it) { 116 if (it->length() >= kMinPathComponentLength) 117 tokens->push_back(*it); 118 } 119} 120 121} // namespace safe_browsing 122