// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/features.h"
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h"
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/metrics/histogram.h"
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing {
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const size_t FeatureMap::kMaxFeatureMapSize = 10000;
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)FeatureMap::FeatureMap() {}
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)FeatureMap::~FeatureMap() {}
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool FeatureMap::AddBooleanFeature(const std::string& name) {
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return AddRealFeature(name, 1.0);
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool FeatureMap::AddRealFeature(const std::string& name, double value) {
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (features_.size() >= kMaxFeatureMapSize) {
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // If we hit this case, it indicates that either kMaxFeatureMapSize is
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // too small, or there is a bug causing too many features to be added.
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // In this case, we'll log to a histogram so we can see that this is
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // happening, and make phishing classification fail silently.
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    LOG(ERROR) << "Not adding feature: " << name << " because the "
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)               << "feature map is too large.";
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TooManyFeatures", 1);
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // We only expect features in the range [0.0, 1.0], so fail if the feature is
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // outside this range.
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (value < 0.0 || value > 1.0) {
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    LOG(ERROR) << "Not adding feature: " << name << " because the value "
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)               << value << " is not in the range [0.0, 1.0].";
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    UMA_HISTOGRAM_COUNTS("SBClientPhishing.IllegalFeatureValue", 1);
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  features_[name] = value;
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void FeatureMap::Clear() {
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  features_.clear();
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// String names for the features used by this file's feature map. The
// constants are grouped by the part of the page they describe.
// NOTE(review): names that end in '=' look like prefixes to which a token
// (TLD, domain, term, ...) is appended by the feature extractors -- confirm
// against the callers.
namespace features {
// URL host features
const char kUrlHostIsIpAddress[] = "UrlHostIsIpAddress";
const char kUrlTldToken[] = "UrlTld=";
const char kUrlDomainToken[] = "UrlDomain=";
const char kUrlOtherHostToken[] = "UrlOtherHostToken=";

// URL host aggregate features
const char kUrlNumOtherHostTokensGTOne[] = "UrlNumOtherHostTokens>1";
const char kUrlNumOtherHostTokensGTThree[] = "UrlNumOtherHostTokens>3";

// URL path features
const char kUrlPathToken[] = "UrlPathToken=";

// DOM HTML form features
const char kPageHasForms[] = "PageHasForms";
const char kPageActionOtherDomainFreq[] = "PageActionOtherDomainFreq";
const char kPageHasTextInputs[] = "PageHasTextInputs";
const char kPageHasPswdInputs[] = "PageHasPswdInputs";
const char kPageHasRadioInputs[] = "PageHasRadioInputs";
const char kPageHasCheckInputs[] = "PageHasCheckInputs";

// DOM HTML link features
const char kPageExternalLinksFreq[] = "PageExternalLinksFreq";
const char kPageLinkDomain[] = "PageLinkDomain=";
const char kPageSecureLinksFreq[] = "PageSecureLinksFreq";

// DOM HTML script features
const char kPageNumScriptTagsGTOne[] = "PageNumScriptTags>1";
const char kPageNumScriptTagsGTSix[] = "PageNumScriptTags>6";

// Other DOM HTML features
const char kPageImgOtherDomainFreq[] = "PageImgOtherDomainFreq";

// Page term features
const char kPageTerm[] = "PageTerm=";

}  // namespace features
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace safe_browsing
88