phishing_dom_feature_extractor.cc revision 9ab5563a3196760eb381d102cbb2bc0f7abc6a50
15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/bind.h" 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/compiler_specific.h" 97d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/containers/hash_tables.h" 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h" 119ab5563a3196760eb381d102cbb2bc0f7abc6a50Ben Murdoch#include "base/message_loop/message_loop.h" 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/metrics/histogram.h" 137d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/strings/string_util.h" 14eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch#include "base/time/time.h" 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/feature_extractor_clock.h" 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/features.h" 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "content/public/renderer/render_view.h" 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "net/base/registry_controlled_domains/registry_controlled_domain.h" 19eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch#include "third_party/WebKit/public/platform/WebString.h" 207d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "third_party/WebKit/public/web/WebElement.h" 217d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "third_party/WebKit/public/web/WebFrame.h" 227d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "third_party/WebKit/public/web/WebNodeCollection.h" 237d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "third_party/WebKit/public/web/WebView.h" 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing { 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This time should be short enough that it doesn't noticeably disrupt the 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// user's interaction with the page. 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10; 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Experimenting shows that we get a reasonable gain in performance by 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// increasing this up to around 10, but there's not much benefit in 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// increasing it past that. 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10; 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This should be longer than we expect feature extraction to take on any 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// actual phishing page. 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500; 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Intermediate state used for computing features. See features.h for 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// descriptions of the DOM features that are computed. 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct PhishingDOMFeatureExtractor::PageFeatureState { 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Link related features 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int external_links; 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::hash_set<std::string> external_domains; 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int secure_links; 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int total_links; 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Form related features 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_forms; 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_text_inputs; 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_pswd_inputs; 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_radio_inputs; 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_check_inputs; 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int action_other_domain; 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int total_actions; 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Image related features 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int img_other_domain; 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int total_imgs; 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // How many script tags 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_script_tags; 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The time at which we started feature extraction for the current page. 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::TimeTicks start_time; 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The number of iterations we've done for the current extraction. 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_iterations; 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) explicit PageFeatureState(base::TimeTicks start_time_ticks) 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : external_links(0), 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) secure_links(0), 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) total_links(0), 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_forms(0), 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_text_inputs(0), 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_pswd_inputs(0), 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_radio_inputs(0), 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_check_inputs(0), 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) action_other_domain(0), 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) total_actions(0), 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) img_other_domain(0), 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) total_imgs(0), 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_script_tags(0), 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start_time(start_time_ticks), 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_iterations(0) {} 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~PageFeatureState() {} 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Per-frame state 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct PhishingDOMFeatureExtractor::FrameData { 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This is our reference to document.all, which is an iterator over all 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of the elements in the document. It keeps track of our current position. 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WebKit::WebNodeCollection elements; 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The domain of the document URL, stored here so that we don't need to 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // recompute it every time it's needed. 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string domain; 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) content::RenderView* render_view, 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FeatureExtractorClock* clock) 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : render_view_(render_view), 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) clock_(clock), 106c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) weak_factory_(this) { 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Clear(); 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() { 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The RenderView should have called CancelPendingExtraction() before 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // we are destroyed. 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CheckNoPendingExtraction(); 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::ExtractFeatures( 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FeatureMap* features, 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const DoneCallback& done_callback) { 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The RenderView should have called CancelPendingExtraction() before 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // starting a new extraction, so DCHECK this. 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CheckNoPendingExtraction(); 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // However, in an opt build, we will go ahead and clean up the pending 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // extraction so that we can start in a known state. 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CancelPendingExtraction(); 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_ = features; 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) done_callback_ = done_callback; 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_.reset(new PageFeatureState(clock_->Now())); 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WebKit::WebView* web_view = render_view_->GetWebView(); 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (web_view && web_view->mainFrame()) { 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_document_ = web_view->mainFrame()->document(); 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 135a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles) base::MessageLoop::current()->PostTask( 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FROM_HERE, 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) weak_factory_.GetWeakPtr())); 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::CancelPendingExtraction() { 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Cancel any pending callbacks, and clear our state. 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) weak_factory_.InvalidateWeakPtrs(); 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Clear(); 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(page_feature_state_.get()); 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->num_iterations; 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::TimeTicks current_chunk_start_time = clock_->Now(); 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (cur_document_.isNull()) { 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This will only happen if we weren't able to get the document for the 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // main frame. We'll treat this as an extraction failure. 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RunCallback(false); 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_elements = 0; 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WebKit::WebNode cur_node; 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (cur_frame_data_.get()) { 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We're resuming traversal of a frame, so just advance to the next node. 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_node = cur_frame_data_->elements.nextItem(); 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // When we resume the traversal, the first call to nextItem() potentially 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // has to walk through the document again from the beginning, if it was 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // modified between our chunks of work. Log how long this takes, so we 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // can tell if it's too slow. 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) clock_->Now() - current_chunk_start_time); 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We just moved to a new frame, so update our frame state 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and advance to the first element. 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ResetFrameData(); 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_node = cur_frame_data_->elements.firstItem(); 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (; !cur_node.isNull(); 1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_node = cur_frame_data_->elements.nextItem()) { 1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!cur_node.isElementNode()) { 1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WebKit::WebElement element = cur_node.to<WebKit::WebElement>(); 1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (element.hasTagName("a")) { 1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) HandleLink(element); 1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (element.hasTagName("form")) { 1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) HandleForm(element); 1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (element.hasTagName("img")) { 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) HandleImage(element); 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (element.hasTagName("input")) { 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) HandleInput(element); 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (element.hasTagName("script")) { 1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) HandleScript(element); 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (++num_elements >= kClockCheckGranularity) { 1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_elements = 0; 1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::TimeTicks now = clock_->Now(); 1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (now - page_feature_state_->start_time >= 2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { 2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DLOG(ERROR) << "Feature extraction took too long, giving up"; 2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We expect this to happen infrequently, so record when it does. 2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1); 2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RunCallback(false); 2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::TimeDelta chunk_elapsed = now - current_chunk_start_time; 2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (chunk_elapsed >= 2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) { 2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The time limit for the current chunk is up, so post a task to 2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // continue extraction. 2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record how much time we actually spent on the chunk. If this is 2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // much higher than kMaxTimePerChunkMs, we may need to adjust the 2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // clock granularity. 2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime", 2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) chunk_elapsed); 218a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles) base::MessageLoop::current()->PostTask( 2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FROM_HERE, 2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::Bind( 2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, 2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) weak_factory_.GetWeakPtr())); 2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Otherwise, continue. 2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We're done with this frame, recalculate the FrameData when we 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // advance to the next frame. 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_frame_data_.reset(); 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) InsertFeatures(); 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RunCallback(true); 2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::HandleLink( 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const WebKit::WebElement& element) { 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Count the number of times we link to a different host. 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!element.hasAttribute("href")) { 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DVLOG(1) << "Skipping anchor tag with no href"; 2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Retrieve the link and resolve the link in case it's relative. 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WebKit::WebURL full_url = element.document().completeURL( 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) element.getAttribute("href")); 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string domain; 2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_external = IsExternalDomain(full_url, &domain); 2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (domain.empty()) { 2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DVLOG(1) << "Could not extract domain from link: " << full_url; 2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (is_external) { 2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->external_links; 2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record each unique domain that we link to. 2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->external_domains.insert(domain); 2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Check how many are https links. 2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (GURL(full_url).SchemeIs("https")) { 2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->secure_links; 2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->total_links; 2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::HandleForm( 2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const WebKit::WebElement& element) { 2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Increment the number of forms on this page. 2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->num_forms; 2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record whether the action points to a different domain. 2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!element.hasAttribute("action")) { 2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WebKit::WebURL full_url = element.document().completeURL( 2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) element.getAttribute("action")); 2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string domain; 2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_external = IsExternalDomain(full_url, &domain); 2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (domain.empty()) { 2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DVLOG(1) << "Could not extract domain from form action: " << full_url; 2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (is_external) { 2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->action_other_domain; 2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->total_actions; 2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::HandleImage( 2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const WebKit::WebElement& element) { 3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!element.hasAttribute("src")) { 3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DVLOG(1) << "Skipping img tag with no src"; 3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record whether the image points to a different domain. 3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WebKit::WebURL full_url = element.document().completeURL( 3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) element.getAttribute("src")); 3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string domain; 3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_external = IsExternalDomain(full_url, &domain); 3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (domain.empty()) { 3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DVLOG(1) << "Could not extract domain from image src: " << full_url; 3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (is_external) { 3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->img_other_domain; 3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->total_imgs; 3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::HandleInput( 3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const WebKit::WebElement& element) { 3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The HTML spec says that if the type is unspecified, it defaults to text. 3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // In addition, any unrecognized type will be treated as a text input. 3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Note that we use the attribute value rather than 3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // WebFormControlElement::formControlType() for consistency with the 3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // way the phishing classification model is created. 3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string type = element.getAttribute("type").utf8(); 3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringToLowerASCII(&type); 3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (type == "password") { 3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->num_pswd_inputs; 3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (type == "radio") { 3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->num_radio_inputs; 3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (type == "checkbox") { 3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->num_check_inputs; 3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (type != "submit" && type != "reset" && type != "file" && 3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) type != "hidden" && type != "image" && type != "button") { 3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Note that there are a number of new input types in HTML5 that are not 3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // handled above. For now, we will consider these as text inputs since 3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // they could be used to capture user input. 3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->num_text_inputs; 3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::HandleScript( 3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const WebKit::WebElement& element) { 3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->num_script_tags; 3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { 3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(done_callback_.is_null()); 3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(!cur_frame_data_.get()); 3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(cur_document_.isNull()); 3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!done_callback_.is_null() || cur_frame_data_.get() || 3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) !cur_document_.isNull()) { 3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(ERROR) << "Extraction in progress, missing call to " 3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) << "CancelPendingExtraction"; 3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::RunCallback(bool success) { 3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record some timing stats that we can use to evaluate feature extraction 3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // performance. These include both successful and failed extractions. 3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(page_feature_state_.get()); 3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations", 3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->num_iterations); 3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime", 3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) clock_->Now() - page_feature_state_->start_time); 3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(!done_callback_.is_null()); 3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) done_callback_.Run(success); 3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Clear(); 3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::Clear() { 3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_ = NULL; 3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) done_callback_.Reset(); 3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_frame_data_.reset(NULL); 3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_document_.reset(); 3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::ResetFrameData() { 3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(!cur_document_.isNull()); 3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(!cur_frame_data_.get()); 3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_frame_data_.reset(new FrameData()); 3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_frame_data_->elements = cur_document_.all(); 3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_frame_data_->domain = 389a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles) net::registry_controlled_domains::GetDomainAndRegistry( 390a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles) cur_document_.url(), 391a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles) net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); 3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)WebKit::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() { 3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(!cur_document_.isNull()); 3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WebKit::WebFrame* frame = cur_document_.frame(); 3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Advance to the next frame that contains a document, with no wrapping. 3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (frame) { 3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while ((frame = frame->traverseNext(false))) { 4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!frame->document().isNull()) { 4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return frame->document(); 4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Keep track of how often frame traversal got "stuck" due to the 4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // current subdocument getting removed from the frame tree. 4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1); 4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return WebKit::WebDocument(); 4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, 4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string* domain) const { 4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(domain); 4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(cur_frame_data_.get()); 4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (cur_frame_data_->domain.empty()) { 4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // TODO(bryner): Ensure that the url encoding is consistent with the features 4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in the model. 4235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (url.HostIsIPAddress()) { 4245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) domain->assign(url.host()); 4255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 426a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles) domain->assign(net::registry_controlled_domains::GetDomainAndRegistry( 427a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles) url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES)); 4285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return !domain->empty() && *domain != cur_frame_data_->domain; 4315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::InsertFeatures() { 4345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(page_feature_state_.get()); 4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->total_links > 0) { 4375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Add a feature for the fraction of times the page links to an external 4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // domain vs. an internal domain. 4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) double link_freq = static_cast<double>( 4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->external_links) / 4415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->total_links; 4425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq); 4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Add a feature for each unique domain that we're linking to 4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (base::hash_set<std::string>::iterator it = 4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->external_domains.begin(); 4475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) it != page_feature_state_->external_domains.end(); ++it) { 4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageLinkDomain + *it); 4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Fraction of links that use https. 4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) double secure_freq = static_cast<double>( 4535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->secure_links) / page_feature_state_->total_links; 4545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq); 4555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record whether forms appear and whether various form elements appear. 4585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->num_forms > 0) { 4595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageHasForms); 4605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->num_text_inputs > 0) { 4625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageHasTextInputs); 4635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->num_pswd_inputs > 0) { 4655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageHasPswdInputs); 4665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->num_radio_inputs > 0) { 4685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageHasRadioInputs); 4695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->num_check_inputs > 0) { 4715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageHasCheckInputs); 4725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record fraction of form actions that point to a different domain. 4755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->total_actions > 0) { 4765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) double action_freq = static_cast<double>( 4775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->action_other_domain) / 4785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->total_actions; 4795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddRealFeature(features::kPageActionOtherDomainFreq, 4805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) action_freq); 4815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record how many image src attributes point to a different domain. 4845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->total_imgs > 0) { 4855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) double img_freq = static_cast<double>( 4865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->img_other_domain) / 4875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->total_imgs; 4885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq); 4895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record number of script tags (discretized for numerical stability.) 4925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->num_script_tags > 1) { 4935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); 4945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->num_script_tags > 6) { 4955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); 4965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace safe_browsing 501