// features.h revision 5821806d5e7f356e8fa4b058a389a808ea183019
15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2011 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Common types and constants for extracting and evaluating features in the 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// client-side phishing detection model. A feature is simply a string and an 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// associated floating-point value between 0 and 1. The phishing 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// classification model contains rules which give an appropriate weight to each 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// feature or combination of features. These values can then be summed to 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// compute a final phishiness score. 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Some features are boolean features. If these features are set, they always 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// have a value of 0.0 or 1.0. In practice, the features are only set if the 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// value is true (1.0). 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// We also use token features. 
These features have a unique name that is 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// constructed from the URL or page contents that we are classifying, for 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// example, "UrlDomain=chromium". These features are also always set to 1.0 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// if they are present. 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The intermediate storage of the features for a URL is a FeatureMap, which is 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// just a thin wrapper around a map of feature name to value. The entire set 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of features for a URL is extracted before we do any scoring. 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string> 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h" 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/hash_tables.h" 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing { 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Container for a map of features to values, which enforces behavior 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// such as a maximum number of features in the map. 
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class FeatureMap { 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FeatureMap(); 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~FeatureMap(); 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Adds a boolean feature to a FeatureMap with a value of 1.0. 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns true on success, or false if the feature map exceeds 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // kMaxFeatureMapSize. 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool AddBooleanFeature(const std::string& name); 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Adds a real-valued feature to a FeatureMap with the given value. 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Values must always be in the range [0.0, 1.0]. Returns true on 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // success, or false if the feature map exceeds kMaxFeatureMapSize 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // or the value is outside of the allowed range. 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool AddRealFeature(const std::string& name, double value); 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Provides read-only access to the current set of features. 
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const base::hash_map<std::string, double>& features() const { 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return features_; 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Clears the set of features in the map. 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void Clear(); 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This is an upper bound on the number of features that will be extracted. 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We should never hit this cap; it is intended as a sanity check to prevent 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the FeatureMap from growing too large. 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static const size_t kMaxFeatureMapSize; 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::hash_map<std::string, double> features_; 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_COPY_AND_ASSIGN(FeatureMap); 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace features { 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Constants for the various feature names that we use. 
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// IMPORTANT: when adding new features, you must update kAllowedFeatures in 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// chrome/browser/safe_browsing/client_side_detection_service.cc if the feature 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// should be sent in sanitized pingbacks. 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// URL host features 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the URL's hostname is an IP address. 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kUrlHostIsIpAddress[]; 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Token feature containing the portion of the hostname controlled by a 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// registrar, for example "com" or "co.uk". 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kUrlTldToken[]; 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Token feature containing the first host component below the registrar. 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For example, in "www.google.com", the domain would be "google". 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kUrlDomainToken[]; 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Token feature containing each host component below the domain. 
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For example, in "www.host.example.com", both "www" and "host" would be 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// "other host tokens". 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kUrlOtherHostToken[]; 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Aggregate features for URL host tokens 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the number of "other" host tokens for a URL is greater than one. 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Longer hostnames, regardless of the specific tokens, can be a signal that 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the URL is phishy. 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kUrlNumOtherHostTokensGTOne[]; 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the number of "other" host tokens for a URL is greater than three. 
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kUrlNumOtherHostTokensGTThree[]; 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// URL path token features 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Token feature containing each alphanumeric string in the path that is at 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// least 3 characters long. For example, "/abc/d/efg" would have 2 path 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// token features, "abc" and "efg". Query parameters are not included. 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kUrlPathToken[]; 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// DOM HTML form features 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the page has any <form> elements. 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageHasForms[]; 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The fraction of form elements whose |action| attribute points to a 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// URL on a different domain from the document URL. 
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageActionOtherDomainFreq[]; 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the page has any <input type="text"> elements 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (includes inputs with missing or unknown types). 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageHasTextInputs[]; 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the page has any <input type="password"> elements. 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageHasPswdInputs[]; 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the page has any <input type="radio"> elements. 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageHasRadioInputs[]; 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the page has any <input type="checkbox"> elements. 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageHasCheckInputs[]; 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// DOM HTML link features 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The fraction of links in the page which point to a domain other than the 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// domain of the document. 
See "URL host features" above for a discussion 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of how the doamin is computed. 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageExternalLinksFreq[]; 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Token feature containing each external domain that is linked to. 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageLinkDomain[]; 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Fraction of links in the page that use https. 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageSecureLinksFreq[]; 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// DOM HTML script features 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the number of <script> elements in the page is greater than 1. 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageNumScriptTagsGTOne[]; 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the number of <script> elements in the page is greater than 6. 
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageNumScriptTagsGTSix[]; 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Other DOM HTML features 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The fraction of images whose src attribute points to an external domain. 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageImgOtherDomainFreq[]; 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Page term features 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//////////////////////////////////////////////////// 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Token feature for a term (whitespace-delimited) on a page. Terms can be 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// single words or multi-word n-grams. Rather than adding this feature for 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// every possible token on a page, only the terms that are mentioned in the 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// classification model are added. 
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageTerm[]; 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace features 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namepsace safe_browsing 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ 178