phishing_dom_feature_extractor.cc revision 9ab5563a3196760eb381d102cbb2bc0f7abc6a50
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
6
7#include "base/bind.h"
8#include "base/compiler_specific.h"
9#include "base/containers/hash_tables.h"
10#include "base/logging.h"
11#include "base/message_loop/message_loop.h"
12#include "base/metrics/histogram.h"
13#include "base/strings/string_util.h"
14#include "base/time/time.h"
15#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
16#include "chrome/renderer/safe_browsing/features.h"
17#include "content/public/renderer/render_view.h"
18#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
19#include "third_party/WebKit/public/platform/WebString.h"
20#include "third_party/WebKit/public/web/WebElement.h"
21#include "third_party/WebKit/public/web/WebFrame.h"
22#include "third_party/WebKit/public/web/WebNodeCollection.h"
23#include "third_party/WebKit/public/web/WebView.h"
24
25namespace safe_browsing {
26
27// This time should be short enough that it doesn't noticeably disrupt the
28// user's interaction with the page.
29const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10;
30
31// Experimenting shows that we get a reasonable gain in performance by
32// increasing this up to around 10, but there's not much benefit in
33// increasing it past that.
34const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;
35
36// This should be longer than we expect feature extraction to take on any
37// actual phishing page.
38const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;
39
40// Intermediate state used for computing features.  See features.h for
41// descriptions of the DOM features that are computed.
42struct PhishingDOMFeatureExtractor::PageFeatureState {
43  // Link related features
44  int external_links;
45  base::hash_set<std::string> external_domains;
46  int secure_links;
47  int total_links;
48
49  // Form related features
50  int num_forms;
51  int num_text_inputs;
52  int num_pswd_inputs;
53  int num_radio_inputs;
54  int num_check_inputs;
55  int action_other_domain;
56  int total_actions;
57
58  // Image related features
59  int img_other_domain;
60  int total_imgs;
61
62  // How many script tags
63  int num_script_tags;
64
65  // The time at which we started feature extraction for the current page.
66  base::TimeTicks start_time;
67
68  // The number of iterations we've done for the current extraction.
69  int num_iterations;
70
71  explicit PageFeatureState(base::TimeTicks start_time_ticks)
72      : external_links(0),
73        secure_links(0),
74        total_links(0),
75        num_forms(0),
76        num_text_inputs(0),
77        num_pswd_inputs(0),
78        num_radio_inputs(0),
79        num_check_inputs(0),
80        action_other_domain(0),
81        total_actions(0),
82        img_other_domain(0),
83        total_imgs(0),
84        num_script_tags(0),
85        start_time(start_time_ticks),
86        num_iterations(0) {}
87
88  ~PageFeatureState() {}
89};
90
91// Per-frame state
92struct PhishingDOMFeatureExtractor::FrameData {
93  // This is our reference to document.all, which is an iterator over all
94  // of the elements in the document.  It keeps track of our current position.
95  WebKit::WebNodeCollection elements;
96  // The domain of the document URL, stored here so that we don't need to
97  // recompute it every time it's needed.
98  std::string domain;
99};
100
101PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
102    content::RenderView* render_view,
103    FeatureExtractorClock* clock)
104    : render_view_(render_view),
105      clock_(clock),
106      weak_factory_(this) {
107  Clear();
108}
109
110PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
111  // The RenderView should have called CancelPendingExtraction() before
112  // we are destroyed.
113  CheckNoPendingExtraction();
114}
115
116void PhishingDOMFeatureExtractor::ExtractFeatures(
117    FeatureMap* features,
118    const DoneCallback& done_callback) {
119  // The RenderView should have called CancelPendingExtraction() before
120  // starting a new extraction, so DCHECK this.
121  CheckNoPendingExtraction();
122  // However, in an opt build, we will go ahead and clean up the pending
123  // extraction so that we can start in a known state.
124  CancelPendingExtraction();
125
126  features_ = features;
127  done_callback_ = done_callback;
128
129  page_feature_state_.reset(new PageFeatureState(clock_->Now()));
130  WebKit::WebView* web_view = render_view_->GetWebView();
131  if (web_view && web_view->mainFrame()) {
132    cur_document_ = web_view->mainFrame()->document();
133  }
134
135  base::MessageLoop::current()->PostTask(
136      FROM_HERE,
137      base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
138                 weak_factory_.GetWeakPtr()));
139}
140
141void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
142  // Cancel any pending callbacks, and clear our state.
143  weak_factory_.InvalidateWeakPtrs();
144  Clear();
145}
146
147void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
148  DCHECK(page_feature_state_.get());
149  ++page_feature_state_->num_iterations;
150  base::TimeTicks current_chunk_start_time = clock_->Now();
151
152  if (cur_document_.isNull()) {
153    // This will only happen if we weren't able to get the document for the
154    // main frame.  We'll treat this as an extraction failure.
155    RunCallback(false);
156    return;
157  }
158
159  int num_elements = 0;
160  for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) {
161    WebKit::WebNode cur_node;
162    if (cur_frame_data_.get()) {
163      // We're resuming traversal of a frame, so just advance to the next node.
164      cur_node = cur_frame_data_->elements.nextItem();
165      // When we resume the traversal, the first call to nextItem() potentially
166      // has to walk through the document again from the beginning, if it was
167      // modified between our chunks of work.  Log how long this takes, so we
168      // can tell if it's too slow.
169      UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
170                          clock_->Now() - current_chunk_start_time);
171    } else {
172      // We just moved to a new frame, so update our frame state
173      // and advance to the first element.
174      ResetFrameData();
175      cur_node = cur_frame_data_->elements.firstItem();
176    }
177
178    for (; !cur_node.isNull();
179         cur_node = cur_frame_data_->elements.nextItem()) {
180      if (!cur_node.isElementNode()) {
181        continue;
182      }
183      WebKit::WebElement element = cur_node.to<WebKit::WebElement>();
184      if (element.hasTagName("a")) {
185        HandleLink(element);
186      } else if (element.hasTagName("form")) {
187        HandleForm(element);
188      } else if (element.hasTagName("img")) {
189        HandleImage(element);
190      } else if (element.hasTagName("input")) {
191        HandleInput(element);
192      } else if (element.hasTagName("script")) {
193        HandleScript(element);
194      }
195
196      if (++num_elements >= kClockCheckGranularity) {
197        num_elements = 0;
198        base::TimeTicks now = clock_->Now();
199        if (now - page_feature_state_->start_time >=
200            base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
201          DLOG(ERROR) << "Feature extraction took too long, giving up";
202          // We expect this to happen infrequently, so record when it does.
203          UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1);
204          RunCallback(false);
205          return;
206        }
207        base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
208        if (chunk_elapsed >=
209            base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
210          // The time limit for the current chunk is up, so post a task to
211          // continue extraction.
212          //
213          // Record how much time we actually spent on the chunk. If this is
214          // much higher than kMaxTimePerChunkMs, we may need to adjust the
215          // clock granularity.
216          UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime",
217                              chunk_elapsed);
218          base::MessageLoop::current()->PostTask(
219              FROM_HERE,
220              base::Bind(
221                  &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
222                  weak_factory_.GetWeakPtr()));
223          return;
224        }
225        // Otherwise, continue.
226      }
227    }
228
229    // We're done with this frame, recalculate the FrameData when we
230    // advance to the next frame.
231    cur_frame_data_.reset();
232  }
233
234  InsertFeatures();
235  RunCallback(true);
236}
237
238void PhishingDOMFeatureExtractor::HandleLink(
239    const WebKit::WebElement& element) {
240  // Count the number of times we link to a different host.
241  if (!element.hasAttribute("href")) {
242    DVLOG(1) << "Skipping anchor tag with no href";
243    return;
244  }
245
246  // Retrieve the link and resolve the link in case it's relative.
247  WebKit::WebURL full_url = element.document().completeURL(
248      element.getAttribute("href"));
249
250  std::string domain;
251  bool is_external = IsExternalDomain(full_url, &domain);
252  if (domain.empty()) {
253    DVLOG(1) << "Could not extract domain from link: " << full_url;
254    return;
255  }
256
257  if (is_external) {
258    ++page_feature_state_->external_links;
259
260    // Record each unique domain that we link to.
261    page_feature_state_->external_domains.insert(domain);
262  }
263
264  // Check how many are https links.
265  if (GURL(full_url).SchemeIs("https")) {
266    ++page_feature_state_->secure_links;
267  }
268
269  ++page_feature_state_->total_links;
270}
271
272void PhishingDOMFeatureExtractor::HandleForm(
273    const WebKit::WebElement& element) {
274  // Increment the number of forms on this page.
275  ++page_feature_state_->num_forms;
276
277  // Record whether the action points to a different domain.
278  if (!element.hasAttribute("action")) {
279    return;
280  }
281
282  WebKit::WebURL full_url = element.document().completeURL(
283      element.getAttribute("action"));
284
285  std::string domain;
286  bool is_external = IsExternalDomain(full_url, &domain);
287  if (domain.empty()) {
288    DVLOG(1) << "Could not extract domain from form action: " << full_url;
289    return;
290  }
291
292  if (is_external) {
293    ++page_feature_state_->action_other_domain;
294  }
295  ++page_feature_state_->total_actions;
296}
297
298void PhishingDOMFeatureExtractor::HandleImage(
299    const WebKit::WebElement& element) {
300  if (!element.hasAttribute("src")) {
301    DVLOG(1) << "Skipping img tag with no src";
302  }
303
304  // Record whether the image points to a different domain.
305  WebKit::WebURL full_url = element.document().completeURL(
306      element.getAttribute("src"));
307  std::string domain;
308  bool is_external = IsExternalDomain(full_url, &domain);
309  if (domain.empty()) {
310    DVLOG(1) << "Could not extract domain from image src: " << full_url;
311    return;
312  }
313
314  if (is_external) {
315    ++page_feature_state_->img_other_domain;
316  }
317  ++page_feature_state_->total_imgs;
318}
319
320void PhishingDOMFeatureExtractor::HandleInput(
321    const WebKit::WebElement& element) {
322  // The HTML spec says that if the type is unspecified, it defaults to text.
323  // In addition, any unrecognized type will be treated as a text input.
324  //
325  // Note that we use the attribute value rather than
326  // WebFormControlElement::formControlType() for consistency with the
327  // way the phishing classification model is created.
328  std::string type = element.getAttribute("type").utf8();
329  StringToLowerASCII(&type);
330  if (type == "password") {
331    ++page_feature_state_->num_pswd_inputs;
332  } else if (type == "radio") {
333    ++page_feature_state_->num_radio_inputs;
334  } else if (type == "checkbox") {
335    ++page_feature_state_->num_check_inputs;
336  } else if (type != "submit" && type != "reset" && type != "file" &&
337             type != "hidden" && type != "image" && type != "button") {
338    // Note that there are a number of new input types in HTML5 that are not
339    // handled above.  For now, we will consider these as text inputs since
340    // they could be used to capture user input.
341    ++page_feature_state_->num_text_inputs;
342  }
343}
344
345void PhishingDOMFeatureExtractor::HandleScript(
346    const WebKit::WebElement& element) {
347  ++page_feature_state_->num_script_tags;
348}
349
350void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
351  DCHECK(done_callback_.is_null());
352  DCHECK(!cur_frame_data_.get());
353  DCHECK(cur_document_.isNull());
354  if (!done_callback_.is_null() || cur_frame_data_.get() ||
355      !cur_document_.isNull()) {
356    LOG(ERROR) << "Extraction in progress, missing call to "
357               << "CancelPendingExtraction";
358  }
359}
360
361void PhishingDOMFeatureExtractor::RunCallback(bool success) {
362  // Record some timing stats that we can use to evaluate feature extraction
363  // performance.  These include both successful and failed extractions.
364  DCHECK(page_feature_state_.get());
365  UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations",
366                       page_feature_state_->num_iterations);
367  UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
368                      clock_->Now() - page_feature_state_->start_time);
369
370  DCHECK(!done_callback_.is_null());
371  done_callback_.Run(success);
372  Clear();
373}
374
375void PhishingDOMFeatureExtractor::Clear() {
376  features_ = NULL;
377  done_callback_.Reset();
378  cur_frame_data_.reset(NULL);
379  cur_document_.reset();
380}
381
382void PhishingDOMFeatureExtractor::ResetFrameData() {
383  DCHECK(!cur_document_.isNull());
384  DCHECK(!cur_frame_data_.get());
385
386  cur_frame_data_.reset(new FrameData());
387  cur_frame_data_->elements = cur_document_.all();
388  cur_frame_data_->domain =
389      net::registry_controlled_domains::GetDomainAndRegistry(
390          cur_document_.url(),
391          net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
392}
393
394WebKit::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() {
395  DCHECK(!cur_document_.isNull());
396  WebKit::WebFrame* frame = cur_document_.frame();
397  // Advance to the next frame that contains a document, with no wrapping.
398  if (frame) {
399    while ((frame = frame->traverseNext(false))) {
400      if (!frame->document().isNull()) {
401        return frame->document();
402      }
403    }
404  } else {
405    // Keep track of how often frame traversal got "stuck" due to the
406    // current subdocument getting removed from the frame tree.
407    UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1);
408  }
409  return WebKit::WebDocument();
410}
411
412bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
413                                                   std::string* domain) const {
414  DCHECK(domain);
415  DCHECK(cur_frame_data_.get());
416
417  if (cur_frame_data_->domain.empty()) {
418    return false;
419  }
420
421  // TODO(bryner): Ensure that the url encoding is consistent with the features
422  // in the model.
423  if (url.HostIsIPAddress()) {
424    domain->assign(url.host());
425  } else {
426    domain->assign(net::registry_controlled_domains::GetDomainAndRegistry(
427        url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES));
428  }
429
430  return !domain->empty() && *domain != cur_frame_data_->domain;
431}
432
433void PhishingDOMFeatureExtractor::InsertFeatures() {
434  DCHECK(page_feature_state_.get());
435
436  if (page_feature_state_->total_links > 0) {
437    // Add a feature for the fraction of times the page links to an external
438    // domain vs. an internal domain.
439    double link_freq = static_cast<double>(
440        page_feature_state_->external_links) /
441        page_feature_state_->total_links;
442    features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq);
443
444    // Add a feature for each unique domain that we're linking to
445    for (base::hash_set<std::string>::iterator it =
446             page_feature_state_->external_domains.begin();
447         it != page_feature_state_->external_domains.end(); ++it) {
448      features_->AddBooleanFeature(features::kPageLinkDomain + *it);
449    }
450
451    // Fraction of links that use https.
452    double secure_freq = static_cast<double>(
453        page_feature_state_->secure_links) / page_feature_state_->total_links;
454    features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq);
455  }
456
457  // Record whether forms appear and whether various form elements appear.
458  if (page_feature_state_->num_forms > 0) {
459    features_->AddBooleanFeature(features::kPageHasForms);
460  }
461  if (page_feature_state_->num_text_inputs > 0) {
462    features_->AddBooleanFeature(features::kPageHasTextInputs);
463  }
464  if (page_feature_state_->num_pswd_inputs > 0) {
465    features_->AddBooleanFeature(features::kPageHasPswdInputs);
466  }
467  if (page_feature_state_->num_radio_inputs > 0) {
468    features_->AddBooleanFeature(features::kPageHasRadioInputs);
469  }
470  if (page_feature_state_->num_check_inputs > 0) {
471    features_->AddBooleanFeature(features::kPageHasCheckInputs);
472  }
473
474  // Record fraction of form actions that point to a different domain.
475  if (page_feature_state_->total_actions > 0) {
476    double action_freq = static_cast<double>(
477        page_feature_state_->action_other_domain) /
478        page_feature_state_->total_actions;
479    features_->AddRealFeature(features::kPageActionOtherDomainFreq,
480                              action_freq);
481  }
482
483  // Record how many image src attributes point to a different domain.
484  if (page_feature_state_->total_imgs > 0) {
485    double img_freq = static_cast<double>(
486        page_feature_state_->img_other_domain) /
487        page_feature_state_->total_imgs;
488    features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq);
489  }
490
491  // Record number of script tags (discretized for numerical stability.)
492  if (page_feature_state_->num_script_tags > 1) {
493    features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
494    if (page_feature_state_->num_script_tags > 6) {
495      features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
496    }
497  }
498}
499
500}  // namespace safe_browsing
501