15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2011 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// PhishingDOMFeatureExtractor handles computing DOM-based features for the 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// client-side phishing detection model. These include the presence of various 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// types of elements, ratios of external and secure links, and tokens for 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// external domains linked to. 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_ 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_ 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string> 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h" 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/callback.h" 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h" 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/weak_ptr.h" 197d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "third_party/WebKit/public/web/WebDocument.h" 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class GURL; 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 23f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)namespace blink { 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class WebElement; 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace content { 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class RenderView; 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing { 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class FeatureExtractorClock; 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class FeatureMap; 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class PhishingDOMFeatureExtractor { 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Callback to be run when feature extraction finishes. The callback 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // argument is true if extraction was successful, false otherwise. 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) typedef base::Callback<void(bool)> DoneCallback; 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Creates a PhishingDOMFeatureExtractor for the specified RenderView. 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The PhishingDOMFeatureExtrator should be destroyed prior to destroying 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the RenderView. |clock| is used for timing feature extractor operations, 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and may be mocked for testing. The caller maintains ownership of the 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // clock. 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PhishingDOMFeatureExtractor(content::RenderView* render_view, 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FeatureExtractorClock* clock); 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~PhishingDOMFeatureExtractor(); 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Begins extracting features into the given FeatureMap for the page 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // currently loaded in this object's RenderView. To avoid blocking the 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // render thread for too long, the feature extractor may run in several 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // chunks of work, posting a task to the current MessageLoop to continue 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // processing. Once feature extraction is complete, |done_callback| 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is run on the current thread. PhishingDOMFeatureExtractor takes 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ownership of the callback. 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void ExtractFeatures(FeatureMap* features, const DoneCallback& done_callback); 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Cancels any pending feature extraction. The DoneCallback will not be run. 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Must be called if there is a feature extraction in progress when the page 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is unloaded or the PhishingDOMFeatureExtractor is destroyed. 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void CancelPendingExtraction(); 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) struct FrameData; 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) struct PageFeatureState; 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The maximum amount of wall time that we will spend on a single extraction 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // iteration before pausing to let other MessageLoop tasks run. 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static const int kMaxTimePerChunkMs; 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The number of elements that we will process before checking to see whether 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // kMaxTimePerChunkMs has elapsed. Since checking the current time can be 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // slow, we don't do this on every element processed. 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static const int kClockCheckGranularity; 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The maximum total amount of time that the feature extractor will run 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // before giving up on the current page. 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static const int kMaxTotalTimeMs; 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // until a predefined maximum amount of time has elapsed, then posts a task 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to the current MessageLoop to continue extraction. When extraction 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // finishes, calls RunCallback(). 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void ExtractFeaturesWithTimeout(); 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Handlers for the various HTML elements that we compute features for. 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Since some of the features (such as ratios) cannot be computed until 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // feature extraction is finished, these handlers do not add to the feature 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // map directly. Instead, they update the values in the PageFeatureState. 91f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) void HandleLink(const blink::WebElement& element); 92f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) void HandleForm(const blink::WebElement& element); 93f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) void HandleImage(const blink::WebElement& element); 94f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) void HandleInput(const blink::WebElement& element); 95f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) void HandleScript(const blink::WebElement& element); 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Helper to verify that there is no pending feature extraction. Dies in 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // debug builds if the state is not as expected. This is a no-op in release 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // builds. 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void CheckNoPendingExtraction(); 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Runs |done_callback_| and then clears all internal state. 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void RunCallback(bool success); 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Clears all internal feature extraction state. 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void Clear(); 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Called after advancing |cur_document_| to update the state in 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // |cur_frame_data_|. 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void ResetFrameData(); 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns the next document in frame-traversal order from cur_document_. 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If there are no more documents, returns a null WebDocument. 114f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) blink::WebDocument GetNextDocument(); 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Given a URL, checks whether the domain is different from the domain of 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the current frame's URL. If so, stores the domain in |domain| and returns 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // true, otherwise returns false. 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool IsExternalDomain(const GURL& url, std::string* domain) const; 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Called once all frames have been processed to compute features from the 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // PageFeatureState and add them to |features_|. See features.h for a 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // description of which features are computed. 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void InsertFeatures(); 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Non-owned pointer to the view that we will extract features from. 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) content::RenderView* render_view_; 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Non-owned pointer to our clock. 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FeatureExtractorClock* clock_; 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The output parameters from the most recent call to ExtractFeatures(). 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FeatureMap* features_; // The caller keeps ownership of this. 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DoneCallback done_callback_; 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The current (sub-)document that we are processing. May be a null document 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (isNull()) if we are not currently extracting features. 138f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) blink::WebDocument cur_document_; 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Stores extra state for |cur_document_| that will be persisted until we 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // advance to the next frame. 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) scoped_ptr<FrameData> cur_frame_data_; 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Stores the intermediate data used to create features. This data is 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // accumulated across all frames in the RenderView. 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) scoped_ptr<PageFeatureState> page_feature_state_; 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Used in scheduling ExtractFeaturesWithTimeout tasks. 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // These pointers are invalidated if extraction is cancelled. 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::WeakPtrFactory<PhishingDOMFeatureExtractor> weak_factory_; 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_COPY_AND_ASSIGN(PhishingDOMFeatureExtractor); 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace safe_browsing 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_ 158