1// Copyright (c) 2011 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4// 5// Common types and constants for extracting and evaluating features in the 6// client-side phishing detection model. A feature is simply a string and an 7// associated floating-point value between 0 and 1. The phishing 8// classification model contains rules which give an appropriate weight to each 9// feature or combination of features. These values can then be summed to 10// compute a final phishiness score. 11// 12// Some features are boolean features. If these features are set, they always 13// have a value of 0.0 or 1.0. In practice, the features are only set if the 14// value is true (1.0). 15// 16// We also use token features. These features have a unique name that is 17// constructed from the URL or page contents that we are classifying, for 18// example, "UrlDomain=chromium". These features are also always set to 1.0 19// if they are present. 20// 21// The intermediate storage of the features for a URL is a FeatureMap, which is 22// just a thin wrapper around a map of feature name to value. The entire set 23// of features for a URL is extracted before we do any scoring. 24 25#ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ 26#define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ 27 28#include <string> 29#include "base/basictypes.h" 30#include "base/containers/hash_tables.h" 31 32namespace safe_browsing { 33 34// Container for a map of features to values, which enforces behavior 35// such as a maximum number of features in the map. 36class FeatureMap { 37 public: 38 FeatureMap(); 39 ~FeatureMap(); 40 41 // Adds a boolean feature to a FeatureMap with a value of 1.0. 42 // Returns true on success, or false if the feature map exceeds 43 // kMaxFeatureMapSize. 44 bool AddBooleanFeature(const std::string& name); 45 46 // Adds a real-valued feature to a FeatureMap with the given value. 47 // Values must always be in the range [0.0, 1.0]. Returns true on 48 // success, or false if the feature map exceeds kMaxFeatureMapSize 49 // or the value is outside of the allowed range. 50 bool AddRealFeature(const std::string& name, double value); 51 52 // Provides read-only access to the current set of features. 53 const base::hash_map<std::string, double>& features() const { 54 return features_; 55 } 56 57 // Clears the set of features in the map. 58 void Clear(); 59 60 // This is an upper bound on the number of features that will be extracted. 61 // We should never hit this cap; it is intended as a sanity check to prevent 62 // the FeatureMap from growing too large. 63 static const size_t kMaxFeatureMapSize; 64 65 private: 66 base::hash_map<std::string, double> features_; 67 68 DISALLOW_COPY_AND_ASSIGN(FeatureMap); 69}; 70 71namespace features { 72// Constants for the various feature names that we use. 73// 74// IMPORTANT: when adding new features, you must update kAllowedFeatures in 75// chrome/browser/safe_browsing/client_side_detection_service.cc if the feature 76// should be sent in sanitized pingbacks. 77 78//////////////////////////////////////////////////// 79// URL host features 80//////////////////////////////////////////////////// 81 82// Set if the URL's hostname is an IP address. 83extern const char kUrlHostIsIpAddress[]; 84// Token feature containing the portion of the hostname controlled by a 85// registrar, for example "com" or "co.uk". 86extern const char kUrlTldToken[]; 87// Token feature containing the first host component below the registrar. 88// For example, in "www.google.com", the domain would be "google". 89extern const char kUrlDomainToken[]; 90// Token feature containing each host component below the domain. 91// For example, in "www.host.example.com", both "www" and "host" would be 92// "other host tokens". 93extern const char kUrlOtherHostToken[]; 94 95//////////////////////////////////////////////////// 96// Aggregate features for URL host tokens 97//////////////////////////////////////////////////// 98 99// Set if the number of "other" host tokens for a URL is greater than one. 100// Longer hostnames, regardless of the specific tokens, can be a signal that 101// the URL is phishy. 102extern const char kUrlNumOtherHostTokensGTOne[]; 103// Set if the number of "other" host tokens for a URL is greater than three. 104extern const char kUrlNumOtherHostTokensGTThree[]; 105 106//////////////////////////////////////////////////// 107// URL path token features 108//////////////////////////////////////////////////// 109 110// Token feature containing each alphanumeric string in the path that is at 111// least 3 characters long. For example, "/abc/d/efg" would have 2 path 112// token features, "abc" and "efg". Query parameters are not included. 113extern const char kUrlPathToken[]; 114 115//////////////////////////////////////////////////// 116// DOM HTML form features 117//////////////////////////////////////////////////// 118 119// Set if the page has any <form> elements. 120extern const char kPageHasForms[]; 121// The fraction of form elements whose |action| attribute points to a 122// URL on a different domain from the document URL. 123extern const char kPageActionOtherDomainFreq[]; 124 125// Set if the page has any <input type="text"> elements 126// (includes inputs with missing or unknown types). 127extern const char kPageHasTextInputs[]; 128// Set if the page has any <input type="password"> elements. 129extern const char kPageHasPswdInputs[]; 130// Set if the page has any <input type="radio"> elements. 131extern const char kPageHasRadioInputs[]; 132// Set if the page has any <input type="checkbox"> elements. 133extern const char kPageHasCheckInputs[]; 134 135//////////////////////////////////////////////////// 136// DOM HTML link features 137//////////////////////////////////////////////////// 138 139// The fraction of links in the page which point to a domain other than the 140// domain of the document. See "URL host features" above for a discussion 141// of how the doamin is computed. 142extern const char kPageExternalLinksFreq[]; 143// Token feature containing each external domain that is linked to. 144extern const char kPageLinkDomain[]; 145// Fraction of links in the page that use https. 146extern const char kPageSecureLinksFreq[]; 147 148//////////////////////////////////////////////////// 149// DOM HTML script features 150//////////////////////////////////////////////////// 151 152// Set if the number of <script> elements in the page is greater than 1. 153extern const char kPageNumScriptTagsGTOne[]; 154// Set if the number of <script> elements in the page is greater than 6. 155extern const char kPageNumScriptTagsGTSix[]; 156 157//////////////////////////////////////////////////// 158// Other DOM HTML features 159//////////////////////////////////////////////////// 160 161// The fraction of images whose src attribute points to an external domain. 162extern const char kPageImgOtherDomainFreq[]; 163 164//////////////////////////////////////////////////// 165// Page term features 166//////////////////////////////////////////////////// 167 168// Token feature for a term (whitespace-delimited) on a page. Terms can be 169// single words or multi-word n-grams. Rather than adding this feature for 170// every possible token on a page, only the terms that are mentioned in the 171// classification model are added. 172extern const char kPageTerm[]; 173 174} // namespace features 175} // namepsace safe_browsing 176 177#endif // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ 178