15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/bind.h"
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/compiler_specific.h"
97d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/containers/hash_tables.h"
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h"
119ab5563a3196760eb381d102cbb2bc0f7abc6a50Ben Murdoch#include "base/message_loop/message_loop.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/metrics/histogram.h"
137d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/strings/string_util.h"
14eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch#include "base/time/time.h"
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/features.h"
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "content/public/renderer/render_view.h"
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
19eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch#include "third_party/WebKit/public/platform/WebString.h"
207d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "third_party/WebKit/public/web/WebElement.h"
215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "third_party/WebKit/public/web/WebElementCollection.h"
220529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch#include "third_party/WebKit/public/web/WebLocalFrame.h"
237d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "third_party/WebKit/public/web/WebView.h"
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing {
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Upper bound, in milliseconds, on the time spent in a single chunk of
// extraction work.  Once a chunk has run at least this long, the rest of the
// traversal is deferred to a newly posted task.  This time should be short
// enough that it doesn't noticeably disrupt the user's interaction with the
// page.
const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10;
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Number of elements processed between consecutive reads of the clock during
// extraction.  Experimenting shows that we get a reasonable gain in
// performance by increasing this up to around 10, but there's not much
// benefit in increasing it past that.
const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Upper bound, in milliseconds, on the time elapsed since extraction started
// for the current page.  When it is reached, extraction gives up and reports
// failure.  This should be longer than we expect feature extraction to take
// on any actual phishing page.
const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Intermediate state used for computing features.  See features.h for
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// descriptions of the DOM features that are computed.
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct PhishingDOMFeatureExtractor::PageFeatureState {
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Link related features
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int external_links;
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  base::hash_set<std::string> external_domains;
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int secure_links;
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int total_links;
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Form related features
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int num_forms;
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int num_text_inputs;
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int num_pswd_inputs;
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int num_radio_inputs;
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int num_check_inputs;
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int action_other_domain;
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int total_actions;
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Image related features
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int img_other_domain;
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int total_imgs;
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // How many script tags
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int num_script_tags;
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The time at which we started feature extraction for the current page.
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  base::TimeTicks start_time;
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The number of iterations we've done for the current extraction.
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int num_iterations;
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  explicit PageFeatureState(base::TimeTicks start_time_ticks)
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      : external_links(0),
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        secure_links(0),
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        total_links(0),
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        num_forms(0),
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        num_text_inputs(0),
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        num_pswd_inputs(0),
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        num_radio_inputs(0),
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        num_check_inputs(0),
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        action_other_domain(0),
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        total_actions(0),
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        img_other_domain(0),
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        total_imgs(0),
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        num_script_tags(0),
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        start_time(start_time_ticks),
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        num_iterations(0) {}
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~PageFeatureState() {}
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Per-frame state
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct PhishingDOMFeatureExtractor::FrameData {
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This is our reference to document.all, which is an iterator over all
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // of the elements in the document.  It keeps track of our current position.
955d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  blink::WebElementCollection elements;
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The domain of the document URL, stored here so that we don't need to
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // recompute it every time it's needed.
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::string domain;
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    content::RenderView* render_view,
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    FeatureExtractorClock* clock)
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : render_view_(render_view),
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      clock_(clock),
106c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)      weak_factory_(this) {
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Clear();
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The RenderView should have called CancelPendingExtraction() before
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // we are destroyed.
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  CheckNoPendingExtraction();
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::ExtractFeatures(
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    FeatureMap* features,
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const DoneCallback& done_callback) {
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The RenderView should have called CancelPendingExtraction() before
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // starting a new extraction, so DCHECK this.
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  CheckNoPendingExtraction();
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // However, in an opt build, we will go ahead and clean up the pending
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // extraction so that we can start in a known state.
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  CancelPendingExtraction();
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  features_ = features;
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  done_callback_ = done_callback;
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  page_feature_state_.reset(new PageFeatureState(clock_->Now()));
130f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  blink::WebView* web_view = render_view_->GetWebView();
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (web_view && web_view->mainFrame()) {
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    cur_document_ = web_view->mainFrame()->document();
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
135a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)  base::MessageLoop::current()->PostTask(
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      FROM_HERE,
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 weak_factory_.GetWeakPtr()));
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Cancel any pending callbacks, and clear our state.
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  weak_factory_.InvalidateWeakPtrs();
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Clear();
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(page_feature_state_.get());
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ++page_feature_state_->num_iterations;
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  base::TimeTicks current_chunk_start_time = clock_->Now();
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (cur_document_.isNull()) {
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // This will only happen if we weren't able to get the document for the
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // main frame.  We'll treat this as an extraction failure.
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    RunCallback(false);
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int num_elements = 0;
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) {
1615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    blink::WebElement cur_element;
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (cur_frame_data_.get()) {
1635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      // We're resuming traversal of a frame, so just advance to the next
1645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      // element.
1655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      cur_element = cur_frame_data_->elements.nextItem();
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // When we resume the traversal, the first call to nextItem() potentially
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // has to walk through the document again from the beginning, if it was
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // modified between our chunks of work.  Log how long this takes, so we
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // can tell if it's too slow.
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          clock_->Now() - current_chunk_start_time);
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    } else {
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // We just moved to a new frame, so update our frame state
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // and advance to the first element.
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ResetFrameData();
1765d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      cur_element = cur_frame_data_->elements.firstItem();
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1795d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    for (; !cur_element.isNull();
1805d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)         cur_element = cur_frame_data_->elements.nextItem()) {
1815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      if (cur_element.hasTagName("a")) {
1825d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        HandleLink(cur_element);
1835d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      } else if (cur_element.hasTagName("form")) {
1845d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        HandleForm(cur_element);
1855d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      } else if (cur_element.hasTagName("img")) {
1865d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        HandleImage(cur_element);
1875d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      } else if (cur_element.hasTagName("input")) {
1885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        HandleInput(cur_element);
1895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      } else if (cur_element.hasTagName("script")) {
1905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        HandleScript(cur_element);
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (++num_elements >= kClockCheckGranularity) {
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        num_elements = 0;
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        base::TimeTicks now = clock_->Now();
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (now - page_feature_state_->start_time >=
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          DLOG(ERROR) << "Feature extraction took too long, giving up";
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // We expect this to happen infrequently, so record when it does.
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1);
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          RunCallback(false);
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          return;
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (chunk_elapsed >=
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // The time limit for the current chunk is up, so post a task to
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // continue extraction.
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          //
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // Record how much time we actually spent on the chunk. If this is
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // much higher than kMaxTimePerChunkMs, we may need to adjust the
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // clock granularity.
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime",
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                              chunk_elapsed);
215a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)          base::MessageLoop::current()->PostTask(
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              FROM_HERE,
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              base::Bind(
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                  &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                  weak_factory_.GetWeakPtr()));
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          return;
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Otherwise, continue.
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // We're done with this frame, recalculate the FrameData when we
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // advance to the next frame.
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    cur_frame_data_.reset();
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  InsertFeatures();
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  RunCallback(true);
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::HandleLink(
236f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    const blink::WebElement& element) {
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Count the number of times we link to a different host.
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!element.hasAttribute("href")) {
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DVLOG(1) << "Skipping anchor tag with no href";
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Retrieve the link and resolve the link in case it's relative.
244f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  blink::WebURL full_url = element.document().completeURL(
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      element.getAttribute("href"));
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::string domain;
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool is_external = IsExternalDomain(full_url, &domain);
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (domain.empty()) {
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DVLOG(1) << "Could not extract domain from link: " << full_url;
2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (is_external) {
2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++page_feature_state_->external_links;
2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Record each unique domain that we link to.
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    page_feature_state_->external_domains.insert(domain);
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Check how many are https links.
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (GURL(full_url).SchemeIs("https")) {
2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++page_feature_state_->secure_links;
2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ++page_feature_state_->total_links;
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::HandleForm(
270f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    const blink::WebElement& element) {
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Increment the number of forms on this page.
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ++page_feature_state_->num_forms;
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Record whether the action points to a different domain.
2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!element.hasAttribute("action")) {
2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
279f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  blink::WebURL full_url = element.document().completeURL(
2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      element.getAttribute("action"));
2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::string domain;
2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool is_external = IsExternalDomain(full_url, &domain);
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (domain.empty()) {
2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DVLOG(1) << "Could not extract domain from form action: " << full_url;
2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (is_external) {
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++page_feature_state_->action_other_domain;
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ++page_feature_state_->total_actions;
2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::HandleImage(
296f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    const blink::WebElement& element) {
2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!element.hasAttribute("src")) {
2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DVLOG(1) << "Skipping img tag with no src";
2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Record whether the image points to a different domain.
302f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  blink::WebURL full_url = element.document().completeURL(
3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      element.getAttribute("src"));
3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::string domain;
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool is_external = IsExternalDomain(full_url, &domain);
3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (domain.empty()) {
3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DVLOG(1) << "Could not extract domain from image src: " << full_url;
3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (is_external) {
3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++page_feature_state_->img_other_domain;
3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ++page_feature_state_->total_imgs;
3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::HandleInput(
318f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    const blink::WebElement& element) {
3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The HTML spec says that if the type is unspecified, it defaults to text.
3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // In addition, any unrecognized type will be treated as a text input.
3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //
3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Note that we use the attribute value rather than
3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // WebFormControlElement::formControlType() for consistency with the
3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // way the phishing classification model is created.
3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::string type = element.getAttribute("type").utf8();
3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  StringToLowerASCII(&type);
3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (type == "password") {
3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++page_feature_state_->num_pswd_inputs;
3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else if (type == "radio") {
3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++page_feature_state_->num_radio_inputs;
3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else if (type == "checkbox") {
3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++page_feature_state_->num_check_inputs;
3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else if (type != "submit" && type != "reset" && type != "file" &&
3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)             type != "hidden" && type != "image" && type != "button") {
3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Note that there are a number of new input types in HTML5 that are not
3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // handled above.  For now, we will consider these as text inputs since
3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // they could be used to capture user input.
3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++page_feature_state_->num_text_inputs;
3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::HandleScript(
343f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    const blink::WebElement& element) {
3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ++page_feature_state_->num_script_tags;
3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(done_callback_.is_null());
3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(!cur_frame_data_.get());
3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(cur_document_.isNull());
3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!done_callback_.is_null() || cur_frame_data_.get() ||
3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      !cur_document_.isNull()) {
3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    LOG(ERROR) << "Extraction in progress, missing call to "
3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)               << "CancelPendingExtraction";
3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::RunCallback(bool success) {
3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Record some timing stats that we can use to evaluate feature extraction
3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // performance.  These include both successful and failed extractions.
3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(page_feature_state_.get());
3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations",
3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       page_feature_state_->num_iterations);
3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                      clock_->Now() - page_feature_state_->start_time);
3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(!done_callback_.is_null());
3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  done_callback_.Run(success);
3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Clear();
3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::Clear() {
3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  features_ = NULL;
3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  done_callback_.Reset();
3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  cur_frame_data_.reset(NULL);
3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  cur_document_.reset();
3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::ResetFrameData() {
3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(!cur_document_.isNull());
3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(!cur_frame_data_.get());
3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  cur_frame_data_.reset(new FrameData());
3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  cur_frame_data_->elements = cur_document_.all();
3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  cur_frame_data_->domain =
386a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      net::registry_controlled_domains::GetDomainAndRegistry(
387a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)          cur_document_.url(),
388a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)          net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
391f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() {
3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(!cur_document_.isNull());
393f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  blink::WebFrame* frame = cur_document_.frame();
3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Advance to the next frame that contains a document, with no wrapping.
3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (frame) {
3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    while ((frame = frame->traverseNext(false))) {
3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (!frame->document().isNull()) {
3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return frame->document();
3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else {
4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Keep track of how often frame traversal got "stuck" due to the
4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // current subdocument getting removed from the frame tree.
4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1);
4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
406f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  return blink::WebDocument();
4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                                   std::string* domain) const {
4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(domain);
4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(cur_frame_data_.get());
4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (cur_frame_data_->domain.empty()) {
4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // TODO(bryner): Ensure that the url encoding is consistent with the features
4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // in the model.
4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (url.HostIsIPAddress()) {
4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    domain->assign(url.host());
4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else {
423a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)    domain->assign(net::registry_controlled_domains::GetDomainAndRegistry(
424a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)        url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES));
4255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return !domain->empty() && *domain != cur_frame_data_->domain;
4285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::InsertFeatures() {
4315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(page_feature_state_.get());
4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (page_feature_state_->total_links > 0) {
4345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Add a feature for the fraction of times the page links to an external
4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // domain vs. an internal domain.
4365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    double link_freq = static_cast<double>(
4375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        page_feature_state_->external_links) /
4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        page_feature_state_->total_links;
4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq);
4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Add a feature for each unique domain that we're linking to
4425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    for (base::hash_set<std::string>::iterator it =
4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)             page_feature_state_->external_domains.begin();
4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)         it != page_feature_state_->external_domains.end(); ++it) {
4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      features_->AddBooleanFeature(features::kPageLinkDomain + *it);
4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
4475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Fraction of links that use https.
4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    double secure_freq = static_cast<double>(
4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        page_feature_state_->secure_links) / page_feature_state_->total_links;
4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq);
4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Record whether forms appear and whether various form elements appear.
4555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (page_feature_state_->num_forms > 0) {
4565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    features_->AddBooleanFeature(features::kPageHasForms);
4575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (page_feature_state_->num_text_inputs > 0) {
4595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    features_->AddBooleanFeature(features::kPageHasTextInputs);
4605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (page_feature_state_->num_pswd_inputs > 0) {
4625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    features_->AddBooleanFeature(features::kPageHasPswdInputs);
4635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (page_feature_state_->num_radio_inputs > 0) {
4655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    features_->AddBooleanFeature(features::kPageHasRadioInputs);
4665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (page_feature_state_->num_check_inputs > 0) {
4685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    features_->AddBooleanFeature(features::kPageHasCheckInputs);
4695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Record fraction of form actions that point to a different domain.
4725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (page_feature_state_->total_actions > 0) {
4735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    double action_freq = static_cast<double>(
4745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        page_feature_state_->action_other_domain) /
4755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        page_feature_state_->total_actions;
4765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    features_->AddRealFeature(features::kPageActionOtherDomainFreq,
4775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                              action_freq);
4785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Record how many image src attributes point to a different domain.
4815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (page_feature_state_->total_imgs > 0) {
4825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    double img_freq = static_cast<double>(
4835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        page_feature_state_->img_other_domain) /
4845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        page_feature_state_->total_imgs;
4855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq);
4865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Record number of script tags (discretized for numerical stability.)
4895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (page_feature_state_->num_script_tags > 1) {
4905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
4915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (page_feature_state_->num_script_tags > 6) {
4925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
4935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
4945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace safe_browsing
498