phishing_url_feature_extractor.cc revision 2a99a7e74a7f215066514fe81d2bfa6639d9eddd
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
6
7#include <algorithm>
8#include <string>
9#include <vector>
10
11#include "base/logging.h"
12#include "base/metrics/histogram.h"
13#include "base/perftimer.h"
14#include "base/string_util.h"
15#include "base/strings/string_split.h"
16#include "chrome/renderer/safe_browsing/features.h"
17#include "googleurl/src/gurl.h"
18#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
19
20namespace safe_browsing {
21
22PhishingUrlFeatureExtractor::PhishingUrlFeatureExtractor() {}
23
24PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {}
25
26bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL& url,
27                                                  FeatureMap* features) {
28  PerfTimer timer;
29  if (url.HostIsIPAddress()) {
30    if (!features->AddBooleanFeature(features::kUrlHostIsIpAddress))
31      return false;
32  } else {
33    std::string host;
34    TrimString(url.host(), ".", &host);  // Remove any leading/trailing dots.
35
36    // TODO(bryner): Ensure that the url encoding is consistent with
37    // the features in the model.
38
39    // Disallow unknown registries so that we don't classify
40    // partial hostnames (e.g. "www.subdomain").
41    size_t registry_length =
42        net::RegistryControlledDomainService::GetRegistryLength(host, false);
43
44    if (registry_length == 0 || registry_length == std::string::npos) {
45      DVLOG(1) << "Could not find TLD for host: " << host;
46      return false;
47    }
48    DCHECK_LT(registry_length, host.size()) << "Non-zero registry length, but "
49        "host is only a TLD: " << host;
50    size_t tld_start = host.size() - registry_length;
51    if (!features->AddBooleanFeature(features::kUrlTldToken +
52                                     host.substr(tld_start)))
53      return false;
54
55    // Pull off the TLD and the preceeding dot.
56    host.erase(tld_start - 1);
57    std::vector<std::string> host_tokens;
58    base::SplitStringDontTrim(host, '.', &host_tokens);
59    // Get rid of any empty components.
60    std::vector<std::string>::iterator new_end =
61        std::remove(host_tokens.begin(), host_tokens.end(), "");
62    host_tokens.erase(new_end, host_tokens.end());
63    if (host_tokens.empty()) {
64      DVLOG(1) << "Could not find domain for host: " << host;
65      return false;
66    }
67    if (!features->AddBooleanFeature(features::kUrlDomainToken +
68                                     host_tokens.back()))
69      return false;
70    host_tokens.pop_back();
71
72    // Now we're just left with the "other" host tokens.
73    for (std::vector<std::string>::iterator it = host_tokens.begin();
74         it != host_tokens.end(); ++it) {
75      if (!features->AddBooleanFeature(features::kUrlOtherHostToken + *it))
76        return false;
77    }
78
79    if (host_tokens.size() > 1) {
80      if (!features->AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne))
81        return false;
82      if (host_tokens.size() > 3) {
83        if (!features->AddBooleanFeature(
84                features::kUrlNumOtherHostTokensGTThree))
85          return false;
86      }
87    }
88  }
89
90  std::vector<std::string> long_tokens;
91  SplitStringIntoLongAlphanumTokens(url.path(), &long_tokens);
92  for (std::vector<std::string>::iterator it = long_tokens.begin();
93       it != long_tokens.end(); ++it) {
94    if (!features->AddBooleanFeature(features::kUrlPathToken + *it))
95      return false;
96  }
97
98  UMA_HISTOGRAM_TIMES("SBClientPhishing.URLFeatureTime", timer.Elapsed());
99  return true;
100}
101
102// static
103void PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(
104    const std::string& full,
105    std::vector<std::string>* tokens) {
106  // Split on common non-alphanumerics.
107  // TODO(bryner): Split on all(?) non-alphanumerics and handle %XX properly.
108  static const char kTokenSeparators[] = ".,\\/_-|=%:!&";
109  std::vector<std::string> raw_splits;
110  Tokenize(full, kTokenSeparators, &raw_splits);
111
112  // Copy over only the splits that are 3 or more chars long.
113  // TODO(bryner): Determine a meaningful min size.
114  for (std::vector<std::string>::iterator it = raw_splits.begin();
115       it != raw_splits.end(); ++it) {
116    if (it->length() >= kMinPathComponentLength)
117      tokens->push_back(*it);
118  }
119}
120
121}  // namespace safe_browsing
122