// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// PhishingDOMFeatureExtractor handles computing DOM-based features for the
// client-side phishing detection model.  These include the presence of various
// types of elements, ratios of external and secure links, and tokens for
// external domains linked to.

105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string>
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h"
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/callback.h"
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h"
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/weak_ptr.h"
197d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "third_party/WebKit/public/web/WebDocument.h"
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Forward declarations.  These types appear in this header only as pointers
// or references, so their full definitions are not needed here.
class GURL;

namespace blink {
class WebElement;
}  // namespace blink

namespace content {
class RenderView;
}  // namespace content
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing {
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class FeatureExtractorClock;
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class FeatureMap;
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class PhishingDOMFeatureExtractor {
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Callback to be run when feature extraction finishes.  The callback
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // argument is true if extraction was successful, false otherwise.
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  typedef base::Callback<void(bool)> DoneCallback;
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Creates a PhishingDOMFeatureExtractor for the specified RenderView.
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The PhishingDOMFeatureExtrator should be destroyed prior to destroying
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the RenderView.  |clock| is used for timing feature extractor operations,
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // and may be mocked for testing.  The caller maintains ownership of the
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // clock.
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PhishingDOMFeatureExtractor(content::RenderView* render_view,
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                              FeatureExtractorClock* clock);
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~PhishingDOMFeatureExtractor();
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Begins extracting features into the given FeatureMap for the page
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // currently loaded in this object's RenderView.  To avoid blocking the
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // render thread for too long, the feature extractor may run in several
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // chunks of work, posting a task to the current MessageLoop to continue
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // processing.  Once feature extraction is complete, |done_callback|
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // is run on the current thread.  PhishingDOMFeatureExtractor takes
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // ownership of the callback.
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void ExtractFeatures(FeatureMap* features, const DoneCallback& done_callback);
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Cancels any pending feature extraction.  The DoneCallback will not be run.
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Must be called if there is a feature extraction in progress when the page
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // is unloaded or the PhishingDOMFeatureExtractor is destroyed.
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void CancelPendingExtraction();
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  struct FrameData;
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  struct PageFeatureState;
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The maximum amount of wall time that we will spend on a single extraction
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // iteration before pausing to let other MessageLoop tasks run.
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const int kMaxTimePerChunkMs;
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The number of elements that we will process before checking to see whether
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // kMaxTimePerChunkMs has elapsed.  Since checking the current time can be
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // slow, we don't do this on every element processed.
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const int kClockCheckGranularity;
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The maximum total amount of time that the feature extractor will run
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // before giving up on the current page.
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const int kMaxTotalTimeMs;
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Does the actual work of ExtractFeatures.  ExtractFeaturesWithTimeout runs
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // until a predefined maximum amount of time has elapsed, then posts a task
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // to the current MessageLoop to continue extraction.  When extraction
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // finishes, calls RunCallback().
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void ExtractFeaturesWithTimeout();
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Handlers for the various HTML elements that we compute features for.
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Since some of the features (such as ratios) cannot be computed until
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // feature extraction is finished, these handlers do not add to the feature
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // map directly.  Instead, they update the values in the PageFeatureState.
91f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  void HandleLink(const blink::WebElement& element);
92f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  void HandleForm(const blink::WebElement& element);
93f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  void HandleImage(const blink::WebElement& element);
94f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  void HandleInput(const blink::WebElement& element);
95f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  void HandleScript(const blink::WebElement& element);
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Helper to verify that there is no pending feature extraction.  Dies in
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // debug builds if the state is not as expected.  This is a no-op in release
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // builds.
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void CheckNoPendingExtraction();
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Runs |done_callback_| and then clears all internal state.
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void RunCallback(bool success);
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Clears all internal feature extraction state.
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void Clear();
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Called after advancing |cur_document_| to update the state in
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // |cur_frame_data_|.
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void ResetFrameData();
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns the next document in frame-traversal order from cur_document_.
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If there are no more documents, returns a null WebDocument.
114f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  blink::WebDocument GetNextDocument();
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Given a URL, checks whether the domain is different from the domain of
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the current frame's URL.  If so, stores the domain in |domain| and returns
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // true, otherwise returns false.
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool IsExternalDomain(const GURL& url, std::string* domain) const;
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Called once all frames have been processed to compute features from the
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // PageFeatureState and add them to |features_|.  See features.h for a
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // description of which features are computed.
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void InsertFeatures();
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Non-owned pointer to the view that we will extract features from.
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  content::RenderView* render_view_;
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Non-owned pointer to our clock.
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FeatureExtractorClock* clock_;
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The output parameters from the most recent call to ExtractFeatures().
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FeatureMap* features_;  // The caller keeps ownership of this.
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DoneCallback done_callback_;
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The current (sub-)document that we are processing.  May be a null document
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // (isNull()) if we are not currently extracting features.
138f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)  blink::WebDocument cur_document_;
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Stores extra state for |cur_document_| that will be persisted until we
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // advance to the next frame.
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scoped_ptr<FrameData> cur_frame_data_;
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Stores the intermediate data used to create features.  This data is
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // accumulated across all frames in the RenderView.
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scoped_ptr<PageFeatureState> page_feature_state_;
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Used in scheduling ExtractFeaturesWithTimeout tasks.
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // These pointers are invalidated if extraction is cancelled.
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  base::WeakPtrFactory<PhishingDOMFeatureExtractor> weak_factory_;
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(PhishingDOMFeatureExtractor);
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace safe_browsing
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
158