features.h revision 5821806d5e7f356e8fa4b058a389a808ea183019
15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2011 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Common types and constants for extracting and evaluating features in the
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// client-side phishing detection model.  A feature is simply a string and an
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// associated floating-point value between 0 and 1.  The phishing
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// classification model contains rules which give an appropriate weight to each
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// feature or combination of features.  These values can then be summed to
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// compute a final phishiness score.
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Some features are boolean features.  If these features are set, they always
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// have a value of 0.0 or 1.0.  In practice, the features are only set if the
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// value is true (1.0).
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// We also use token features.  These features have a unique name that is
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// constructed from the URL or page contents that we are classifying, for
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// example, "UrlDomain=chromium".  These features are also always set to 1.0
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// if they are present.
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The intermediate storage of the features for a URL is a FeatureMap, which is
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// just a thin wrapper around a map of feature name to value.  The entire set
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of features for a URL is extracted before we do any scoring.
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string>
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h"
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/hash_tables.h"
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing {
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Container mapping feature names (strings) to double values.  It enforces
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// limits such as the maximum number of features (kMaxFeatureMapSize).
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class FeatureMap {
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FeatureMap();
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~FeatureMap();
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Adds a boolean feature to a FeatureMap with a value of 1.0.
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns true on success, or false if the feature map exceeds
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // kMaxFeatureMapSize.
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool AddBooleanFeature(const std::string& name);
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Adds a real-valued feature to a FeatureMap with the given value.
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Values must always be in the range [0.0, 1.0].  Returns true on
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // success, or false if the feature map exceeds kMaxFeatureMapSize
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // or the value is outside of the allowed range.
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool AddRealFeature(const std::string& name, double value);
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Provides read-only access to the current set of features (by reference).
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const base::hash_map<std::string, double>& features() const {
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return features_;
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Clears the set of features in the map.
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void Clear();
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This is an upper bound on the number of features that will be extracted.
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // We should never hit this cap; it is intended as a sanity check to prevent
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the FeatureMap from growing too large.  The value is defined out of line.
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const size_t kMaxFeatureMapSize;
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  base::hash_map<std::string, double> features_;  // Feature name -> value.
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(FeatureMap);
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace features {
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Constants for the various feature names that we use (declared extern here).
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// IMPORTANT: when adding new features, you must update kAllowedFeatures in
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// chrome/browser/safe_browsing/client_side_detection_service.cc if the feature
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// should be sent in sanitized pingbacks.
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// URL host features
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the URL's hostname is an IP address.
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kUrlHostIsIpAddress[];
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Token feature containing the portion of the hostname controlled by a
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// registrar, for example "com" or "co.uk".
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kUrlTldToken[];
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Token feature containing the first host component below the registrar.
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For example, in "www.google.com", the domain would be "google".
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kUrlDomainToken[];
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Token feature containing each host component below the domain.
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For example, in "www.host.example.com", both "www" and "host" would be
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// "other host tokens".
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kUrlOtherHostToken[];
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Aggregate features for URL host tokens
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the number of "other" host tokens for a URL is greater than one.
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Longer hostnames, regardless of the specific tokens, can be a signal that
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the URL is phishy.
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kUrlNumOtherHostTokensGTOne[];
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the number of "other" host tokens for a URL is greater than three.
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kUrlNumOtherHostTokensGTThree[];
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// URL path token features
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Token feature containing each alphanumeric string in the path that is at
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// least 3 characters long.  For example, "/abc/d/efg" would have 2 path
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// token features, "abc" and "efg".  Query parameters are not included.
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kUrlPathToken[];
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// DOM HTML form features
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the page has any <form> elements.
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageHasForms[];
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The fraction of form elements whose |action| attribute points to a
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// URL on a different domain from the document URL.
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageActionOtherDomainFreq[];
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the page has any <input type="text"> elements
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (includes inputs with missing or unknown types).
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageHasTextInputs[];
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the page has any <input type="password"> elements.
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageHasPswdInputs[];
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the page has any <input type="radio"> elements.
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageHasRadioInputs[];
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the page has any <input type="checkbox"> elements.
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageHasCheckInputs[];
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// DOM HTML link features
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The fraction of links in the page which point to a domain other than the
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// domain of the document.  See "URL host features" above for a discussion
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of how the domain is computed.
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageExternalLinksFreq[];
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Token feature containing each external domain that is linked to.
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageLinkDomain[];
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Fraction of links in the page that use https.
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageSecureLinksFreq[];
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// DOM HTML script features
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the number of <script> elements in the page is greater than 1.
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageNumScriptTagsGTOne[];
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Set if the number of <script> elements in the page is greater than 6.
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageNumScriptTagsGTSix[];
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Other DOM HTML features
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The fraction of images whose src attribute points to an external domain.
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageImgOtherDomainFreq[];
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Page term features
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Token feature for a term (whitespace-delimited) on a page.  Terms can be
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// single words or multi-word n-grams.  Rather than adding this feature for
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// every possible token on a page, only the terms that are mentioned in the
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// classification model are added.
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern const char kPageTerm[];
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace features
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace safe_browsing
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
178