1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4//
5// Common types and constants for extracting and evaluating features in the
6// client-side phishing detection model.  A feature is simply a string and an
7// associated floating-point value between 0 and 1.  The phishing
8// classification model contains rules which give an appropriate weight to each
9// feature or combination of features.  These values can then be summed to
10// compute a final phishiness score.
11//
12// Some features are boolean features.  If these features are set, they always
13// have a value of 0.0 or 1.0.  In practice, the features are only set if the
14// value is true (1.0).
15//
16// We also use token features.  These features have a unique name that is
17// constructed from the URL or page contents that we are classifying, for
18// example, "UrlDomain=chromium".  These features are also always set to 1.0
19// if they are present.
20//
21// The intermediate storage of the features for a URL is a FeatureMap, which is
22// just a thin wrapper around a map of feature name to value.  The entire set
23// of features for a URL is extracted before we do any scoring.
24
25#ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
26#define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
27
28#include <string>
29#include "base/basictypes.h"
30#include "base/containers/hash_tables.h"
31
32namespace safe_browsing {
33
34// Container for a map of features to values, which enforces behavior
35// such as a maximum number of features in the map.
36class FeatureMap {
37 public:
38  FeatureMap();
39  ~FeatureMap();
40
41  // Adds a boolean feature to a FeatureMap with a value of 1.0.
42  // Returns true on success, or false if the feature map exceeds
43  // kMaxFeatureMapSize.
44  bool AddBooleanFeature(const std::string& name);
45
46  // Adds a real-valued feature to a FeatureMap with the given value.
47  // Values must always be in the range [0.0, 1.0].  Returns true on
48  // success, or false if the feature map exceeds kMaxFeatureMapSize
49  // or the value is outside of the allowed range.
50  bool AddRealFeature(const std::string& name, double value);
51
52  // Provides read-only access to the current set of features.
53  const base::hash_map<std::string, double>& features() const {
54    return features_;
55  }
56
57  // Clears the set of features in the map.
58  void Clear();
59
60  // This is an upper bound on the number of features that will be extracted.
61  // We should never hit this cap; it is intended as a sanity check to prevent
62  // the FeatureMap from growing too large.
63  static const size_t kMaxFeatureMapSize;
64
65 private:
66  base::hash_map<std::string, double> features_;
67
68  DISALLOW_COPY_AND_ASSIGN(FeatureMap);
69};
70
namespace features {
// Names of the individual features used by the phishing classifier.
//
// IMPORTANT: when adding new features, you must update kAllowedFeatures in
// chrome/browser/safe_browsing/client_side_detection_service.cc if the feature
// should be sent in sanitized pingbacks.

// ---------------------------------------------------------------------------
// URL host features
// ---------------------------------------------------------------------------

// Boolean feature: the hostname of the URL is an IP address.
extern const char kUrlHostIsIpAddress[];
// Token feature: the part of the hostname that is controlled by a registrar,
// e.g. "com" or "co.uk".
extern const char kUrlTldToken[];
// Token feature: the first host component below the registrar.  In
// "www.google.com" the domain is "google".
extern const char kUrlDomainToken[];
// Token feature: each host component below the domain.  In
// "www.host.example.com", "www" and "host" are both "other host tokens".
extern const char kUrlOtherHostToken[];

// ---------------------------------------------------------------------------
// Aggregate features for URL host tokens
// ---------------------------------------------------------------------------

// Boolean feature: the URL has more than one "other" host token.  A longer
// hostname can be a signal that the URL is phishy, regardless of which
// specific tokens it contains.
extern const char kUrlNumOtherHostTokensGTOne[];
// Boolean feature: the URL has more than three "other" host tokens.
extern const char kUrlNumOtherHostTokensGTThree[];

// ---------------------------------------------------------------------------
// URL path token features
// ---------------------------------------------------------------------------

// Token feature: each alphanumeric string in the path that is at least 3
// characters long.  "/abc/d/efg" yields 2 path token features, "abc" and
// "efg".  Query parameters are not included.
extern const char kUrlPathToken[];

// ---------------------------------------------------------------------------
// DOM HTML form features
// ---------------------------------------------------------------------------

// Boolean feature: the page contains at least one <form> element.
extern const char kPageHasForms[];
// Fraction of form elements whose |action| attribute points to a URL on a
// different domain from the document URL.
extern const char kPageActionOtherDomainFreq[];

// Boolean feature: the page contains at least one <input type="text">
// element (this includes inputs with missing or unknown types).
extern const char kPageHasTextInputs[];
// Boolean feature: the page contains at least one <input type="password">
// element.
extern const char kPageHasPswdInputs[];
// Boolean feature: the page contains at least one <input type="radio">
// element.
extern const char kPageHasRadioInputs[];
// Boolean feature: the page contains at least one <input type="checkbox">
// element.
extern const char kPageHasCheckInputs[];

// ---------------------------------------------------------------------------
// DOM HTML link features
// ---------------------------------------------------------------------------

// Fraction of links in the page which point to a domain other than the
// domain of the document.  See "URL host features" above for a discussion
// of how the domain is computed.
extern const char kPageExternalLinksFreq[];
// Token feature: each external domain that the page links to.
extern const char kPageLinkDomain[];
// Fraction of links in the page that use https.
extern const char kPageSecureLinksFreq[];

// ---------------------------------------------------------------------------
// DOM HTML script features
// ---------------------------------------------------------------------------

// Boolean feature: the page contains more than 1 <script> element.
extern const char kPageNumScriptTagsGTOne[];
// Boolean feature: the page contains more than 6 <script> elements.
extern const char kPageNumScriptTagsGTSix[];

// ---------------------------------------------------------------------------
// Other DOM HTML features
// ---------------------------------------------------------------------------

// Fraction of images whose src attribute points to an external domain.
extern const char kPageImgOtherDomainFreq[];

// ---------------------------------------------------------------------------
// Page term features
// ---------------------------------------------------------------------------

// Token feature: a (whitespace-delimited) term on a page.  A term can be a
// single word or a multi-word n-gram.  This feature is not added for every
// possible token on a page; only the terms that are mentioned in the
// classification model are added.
extern const char kPageTerm[];

}  // namespace features
175}  // namespace safe_browsing
176
177#endif  // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
178