15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/bind.h" 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/compiler_specific.h" 97d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/containers/hash_tables.h" 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h" 119ab5563a3196760eb381d102cbb2bc0f7abc6a50Ben Murdoch#include "base/message_loop/message_loop.h" 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/metrics/histogram.h" 137d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/strings/string_util.h" 14eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch#include "base/time/time.h" 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/feature_extractor_clock.h" 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/features.h" 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "content/public/renderer/render_view.h" 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "net/base/registry_controlled_domains/registry_controlled_domain.h" 19eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch#include "third_party/WebKit/public/platform/WebString.h" 207d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "third_party/WebKit/public/web/WebElement.h" 215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "third_party/WebKit/public/web/WebElementCollection.h" 220529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch#include "third_party/WebKit/public/web/WebLocalFrame.h" 237d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "third_party/WebKit/public/web/WebView.h" 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing { 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This time should be short enough that it doesn't noticeably disrupt the 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// user's interaction with the page. 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10; 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Experimenting shows that we get a reasonable gain in performance by 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// increasing this up to around 10, but there's not much benefit in 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// increasing it past that. 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10; 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This should be longer than we expect feature extraction to take on any 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// actual phishing page. 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500; 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Intermediate state used for computing features. See features.h for 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// descriptions of the DOM features that are computed. 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct PhishingDOMFeatureExtractor::PageFeatureState { 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Link related features 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int external_links; 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::hash_set<std::string> external_domains; 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int secure_links; 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int total_links; 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Form related features 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_forms; 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_text_inputs; 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_pswd_inputs; 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_radio_inputs; 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_check_inputs; 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int action_other_domain; 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int total_actions; 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Image related features 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int img_other_domain; 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int total_imgs; 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // How many script tags 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_script_tags; 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The time at which we started feature extraction for the current page. 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::TimeTicks start_time; 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The number of iterations we've done for the current extraction. 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_iterations; 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) explicit PageFeatureState(base::TimeTicks start_time_ticks) 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : external_links(0), 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) secure_links(0), 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) total_links(0), 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_forms(0), 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_text_inputs(0), 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_pswd_inputs(0), 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_radio_inputs(0), 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_check_inputs(0), 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) action_other_domain(0), 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) total_actions(0), 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) img_other_domain(0), 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) total_imgs(0), 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_script_tags(0), 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start_time(start_time_ticks), 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_iterations(0) {} 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~PageFeatureState() {} 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Per-frame state 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct PhishingDOMFeatureExtractor::FrameData { 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This is our reference to document.all, which is an iterator over all 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of the elements in the document. It keeps track of our current position. 955d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) blink::WebElementCollection elements; 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The domain of the document URL, stored here so that we don't need to 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // recompute it every time it's needed. 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string domain; 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) content::RenderView* render_view, 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FeatureExtractorClock* clock) 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : render_view_(render_view), 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) clock_(clock), 106c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) weak_factory_(this) { 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Clear(); 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() { 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The RenderView should have called CancelPendingExtraction() before 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // we are destroyed. 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CheckNoPendingExtraction(); 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::ExtractFeatures( 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FeatureMap* features, 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const DoneCallback& done_callback) { 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The RenderView should have called CancelPendingExtraction() before 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // starting a new extraction, so DCHECK this. 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CheckNoPendingExtraction(); 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // However, in an opt build, we will go ahead and clean up the pending 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // extraction so that we can start in a known state. 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CancelPendingExtraction(); 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_ = features; 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) done_callback_ = done_callback; 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_.reset(new PageFeatureState(clock_->Now())); 130f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) blink::WebView* web_view = render_view_->GetWebView(); 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (web_view && web_view->mainFrame()) { 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_document_ = web_view->mainFrame()->document(); 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 135a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles) base::MessageLoop::current()->PostTask( 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FROM_HERE, 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) weak_factory_.GetWeakPtr())); 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::CancelPendingExtraction() { 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Cancel any pending callbacks, and clear our state. 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) weak_factory_.InvalidateWeakPtrs(); 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Clear(); 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(page_feature_state_.get()); 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->num_iterations; 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::TimeTicks current_chunk_start_time = clock_->Now(); 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (cur_document_.isNull()) { 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This will only happen if we weren't able to get the document for the 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // main frame. We'll treat this as an extraction failure. 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RunCallback(false); 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_elements = 0; 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { 1615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) blink::WebElement cur_element; 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (cur_frame_data_.get()) { 1635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // We're resuming traversal of a frame, so just advance to the next 1645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) // element. 1655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) cur_element = cur_frame_data_->elements.nextItem(); 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // When we resume the traversal, the first call to nextItem() potentially 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // has to walk through the document again from the beginning, if it was 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // modified between our chunks of work. Log how long this takes, so we 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // can tell if it's too slow. 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) clock_->Now() - current_chunk_start_time); 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We just moved to a new frame, so update our frame state 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and advance to the first element. 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ResetFrameData(); 1765d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) cur_element = cur_frame_data_->elements.firstItem(); 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1795d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for (; !cur_element.isNull(); 1805d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) cur_element = cur_frame_data_->elements.nextItem()) { 1815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (cur_element.hasTagName("a")) { 1825d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) HandleLink(cur_element); 1835d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } else if (cur_element.hasTagName("form")) { 1845d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) HandleForm(cur_element); 1855d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } else if (cur_element.hasTagName("img")) { 1865d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) HandleImage(cur_element); 1875d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } else if (cur_element.hasTagName("input")) { 1885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) HandleInput(cur_element); 1895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) } else if (cur_element.hasTagName("script")) { 1905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) HandleScript(cur_element); 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (++num_elements >= kClockCheckGranularity) { 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_elements = 0; 1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::TimeTicks now = clock_->Now(); 1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (now - page_feature_state_->start_time >= 1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { 1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DLOG(ERROR) << "Feature extraction took too long, giving up"; 1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We expect this to happen infrequently, so record when it does. 2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1); 2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RunCallback(false); 2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::TimeDelta chunk_elapsed = now - current_chunk_start_time; 2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (chunk_elapsed >= 2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) { 2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The time limit for the current chunk is up, so post a task to 2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // continue extraction. 2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record how much time we actually spent on the chunk. If this is 2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // much higher than kMaxTimePerChunkMs, we may need to adjust the 2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // clock granularity. 2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime", 2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) chunk_elapsed); 215a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles) base::MessageLoop::current()->PostTask( 2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FROM_HERE, 2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::Bind( 2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, 2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) weak_factory_.GetWeakPtr())); 2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Otherwise, continue. 2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We're done with this frame, recalculate the FrameData when we 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // advance to the next frame. 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_frame_data_.reset(); 2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) InsertFeatures(); 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RunCallback(true); 2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::HandleLink( 236f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) const blink::WebElement& element) { 2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Count the number of times we link to a different host. 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!element.hasAttribute("href")) { 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DVLOG(1) << "Skipping anchor tag with no href"; 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Retrieve the link and resolve the link in case it's relative. 244f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) blink::WebURL full_url = element.document().completeURL( 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) element.getAttribute("href")); 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string domain; 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_external = IsExternalDomain(full_url, &domain); 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (domain.empty()) { 2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DVLOG(1) << "Could not extract domain from link: " << full_url; 2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (is_external) { 2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->external_links; 2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record each unique domain that we link to. 2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->external_domains.insert(domain); 2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Check how many are https links. 2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (GURL(full_url).SchemeIs("https")) { 2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->secure_links; 2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->total_links; 2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::HandleForm( 270f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) const blink::WebElement& element) { 2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Increment the number of forms on this page. 2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->num_forms; 2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record whether the action points to a different domain. 2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!element.hasAttribute("action")) { 2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 279f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) blink::WebURL full_url = element.document().completeURL( 2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) element.getAttribute("action")); 2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string domain; 2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_external = IsExternalDomain(full_url, &domain); 2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (domain.empty()) { 2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DVLOG(1) << "Could not extract domain from form action: " << full_url; 2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (is_external) { 2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->action_other_domain; 2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->total_actions; 2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::HandleImage( 296f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) const blink::WebElement& element) { 2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!element.hasAttribute("src")) { 2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DVLOG(1) << "Skipping img tag with no src"; 2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record whether the image points to a different domain. 302f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) blink::WebURL full_url = element.document().completeURL( 3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) element.getAttribute("src")); 3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string domain; 3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_external = IsExternalDomain(full_url, &domain); 3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (domain.empty()) { 3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DVLOG(1) << "Could not extract domain from image src: " << full_url; 3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (is_external) { 3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->img_other_domain; 3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->total_imgs; 3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::HandleInput( 318f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) const blink::WebElement& element) { 3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The HTML spec says that if the type is unspecified, it defaults to text. 3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // In addition, any unrecognized type will be treated as a text input. 3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Note that we use the attribute value rather than 3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // WebFormControlElement::formControlType() for consistency with the 3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // way the phishing classification model is created. 3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string type = element.getAttribute("type").utf8(); 3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringToLowerASCII(&type); 3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (type == "password") { 3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->num_pswd_inputs; 3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (type == "radio") { 3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->num_radio_inputs; 3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (type == "checkbox") { 3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->num_check_inputs; 3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (type != "submit" && type != "reset" && type != "file" && 3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) type != "hidden" && type != "image" && type != "button") { 3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Note that there are a number of new input types in HTML5 that are not 3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // handled above. For now, we will consider these as text inputs since 3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // they could be used to capture user input. 3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->num_text_inputs; 3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::HandleScript( 343f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) const blink::WebElement& element) { 3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++page_feature_state_->num_script_tags; 3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { 3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(done_callback_.is_null()); 3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(!cur_frame_data_.get()); 3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(cur_document_.isNull()); 3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!done_callback_.is_null() || cur_frame_data_.get() || 3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) !cur_document_.isNull()) { 3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(ERROR) << "Extraction in progress, missing call to " 3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) << "CancelPendingExtraction"; 3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::RunCallback(bool success) { 3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record some timing stats that we can use to evaluate feature extraction 3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // performance. These include both successful and failed extractions. 3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(page_feature_state_.get()); 3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations", 3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->num_iterations); 3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime", 3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) clock_->Now() - page_feature_state_->start_time); 3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(!done_callback_.is_null()); 3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) done_callback_.Run(success); 3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Clear(); 3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::Clear() { 3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_ = NULL; 3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) done_callback_.Reset(); 3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_frame_data_.reset(NULL); 3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_document_.reset(); 3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::ResetFrameData() { 3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(!cur_document_.isNull()); 3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(!cur_frame_data_.get()); 3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_frame_data_.reset(new FrameData()); 3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_frame_data_->elements = cur_document_.all(); 3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_frame_data_->domain = 386a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles) net::registry_controlled_domains::GetDomainAndRegistry( 387a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles) cur_document_.url(), 388a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles) net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); 3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 391f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() { 3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(!cur_document_.isNull()); 393f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) blink::WebFrame* frame = cur_document_.frame(); 3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Advance to the next frame that contains a document, with no wrapping. 3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (frame) { 3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while ((frame = frame->traverseNext(false))) { 3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!frame->document().isNull()) { 3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return frame->document(); 3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Keep track of how often frame traversal got "stuck" due to the 4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // current subdocument getting removed from the frame tree. 4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1); 4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 406f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) return blink::WebDocument(); 4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, 4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string* domain) const { 4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(domain); 4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(cur_frame_data_.get()); 4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (cur_frame_data_->domain.empty()) { 4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // TODO(bryner): Ensure that the url encoding is consistent with the features 4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in the model. 4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (url.HostIsIPAddress()) { 4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) domain->assign(url.host()); 4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 423a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles) domain->assign(net::registry_controlled_domains::GetDomainAndRegistry( 424a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles) url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES)); 4255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return !domain->empty() && *domain != cur_frame_data_->domain; 4285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PhishingDOMFeatureExtractor::InsertFeatures() { 4315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(page_feature_state_.get()); 4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->total_links > 0) { 4345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Add a feature for the fraction of times the page links to an external 4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // domain vs. an internal domain. 4365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) double link_freq = static_cast<double>( 4375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->external_links) / 4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->total_links; 4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq); 4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Add a feature for each unique domain that we're linking to 4425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (base::hash_set<std::string>::iterator it = 4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->external_domains.begin(); 4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) it != page_feature_state_->external_domains.end(); ++it) { 4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageLinkDomain + *it); 4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Fraction of links that use https. 4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) double secure_freq = static_cast<double>( 4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->secure_links) / page_feature_state_->total_links; 4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq); 4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record whether forms appear and whether various form elements appear. 4555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->num_forms > 0) { 4565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageHasForms); 4575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->num_text_inputs > 0) { 4595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageHasTextInputs); 4605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->num_pswd_inputs > 0) { 4625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageHasPswdInputs); 4635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->num_radio_inputs > 0) { 4655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageHasRadioInputs); 4665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->num_check_inputs > 0) { 4685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageHasCheckInputs); 4695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record fraction of form actions that point to a different domain. 4725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->total_actions > 0) { 4735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) double action_freq = static_cast<double>( 4745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->action_other_domain) / 4755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->total_actions; 4765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddRealFeature(features::kPageActionOtherDomainFreq, 4775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) action_freq); 4785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record how many image src attributes point to a different domain. 4815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->total_imgs > 0) { 4825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) double img_freq = static_cast<double>( 4835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->img_other_domain) / 4845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) page_feature_state_->total_imgs; 4855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq); 4865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record number of script tags (discretized for numerical stability.) 4895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->num_script_tags > 1) { 4905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); 4915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (page_feature_state_->num_script_tags > 6) { 4925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); 4935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace safe_browsing 498