// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This class handles the process of extracting all of the features from a
// page and computing a phishiness score.  The basic steps are:
//  - Run each feature extractor over the page, building up a FeatureMap of
//    feature -> value.
//  - SHA-256 hash all of the feature names in the map so that they match the
//    supplied model.
//  - Hand the hashed map off to a Scorer, which computes the probability that
//    the page is phishy.
//  - Run the supplied callback with the resulting verdict.  The callback is
//    run whether or not the page is phishy, and also when classification
//    fails.
//
// For more details, see phishing_*_feature_extractor.h, scorer.h, and
// client_model.proto.

18#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
19#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
20
21#include <set>
22
23#include "base/basictypes.h"
24#include "base/callback.h"
25#include "base/memory/scoped_ptr.h"
26#include "base/memory/weak_ptr.h"
27#include "base/strings/string16.h"
28
29namespace content {
30class RenderView;
31}
32
33namespace safe_browsing {
34class ClientPhishingRequest;
35class FeatureExtractorClock;
36class FeatureMap;
37class PhishingDOMFeatureExtractor;
38class PhishingTermFeatureExtractor;
39class PhishingUrlFeatureExtractor;
40class Scorer;
41
42class PhishingClassifier {
43 public:
44  // Callback to be run when phishing classification finishes. The verdict
45  // is a ClientPhishingRequest which contains the verdict computed by the
46  // classifier as well as the extracted features.  If the verdict.is_phishing()
47  // is true, the page is considered phishy by the client-side model,
48  // and the browser should ping back to get a final verdict.  The
49  // verdict.client_score() is set to kInvalidScore if classification failed.
50  typedef base::Callback<void(const ClientPhishingRequest& /* verdict */)>
51      DoneCallback;
52
53  static const float kInvalidScore;
54
55  // Creates a new PhishingClassifier object that will operate on
56  // |render_view|.  |clock| is used to time feature extractor operations, and
57  // the PhishingClassifier takes ownership of this object.  Note that the
58  // classifier will not be 'ready' until set_phishing_scorer() is called.
59  PhishingClassifier(content::RenderView* render_view,
60                     FeatureExtractorClock* clock);
61  virtual ~PhishingClassifier();
62
63  // Sets a scorer for the classifier to use in computing the phishiness score.
64  // This must live at least as long as the PhishingClassifier.  The caller is
65  // expected to cancel any pending classification before setting a phishing
66  // scorer.
67  void set_phishing_scorer(const Scorer* scorer);
68
69  // Returns true if the classifier is ready to classify pages, i.e. it
70  // has had a scorer set via set_phishing_scorer().
71  bool is_ready() const;
72
73  // Called by the RenderView when a page has finished loading.  This begins
74  // the feature extraction and scoring process. |page_text| should contain
75  // the plain text of a web page, including any subframes, as returned by
76  // RenderView::CaptureText().  |page_text| is owned by the caller, and must
77  // not be destroyed until either |done_callback| is run or
78  // CancelPendingClassification() is called.
79  //
80  // To avoid blocking the render thread for too long, phishing classification
81  // may run in several chunks of work, posting a task to the current
82  // MessageLoop to continue processing.  Once the scoring process is complete,
83  // |done_callback| is run on the current thread.  PhishingClassifier takes
84  // ownership of the callback.
85  //
86  // It is an error to call BeginClassification if the classifier is not yet
87  // ready.
88  virtual void BeginClassification(const base::string16* page_text,
89                                   const DoneCallback& callback);
90
91  // Called by the RenderView (on the render thread) when a page is unloading
92  // or the RenderView is being destroyed.  This cancels any extraction that
93  // is in progress.  It is an error to call CancelPendingClassification if
94  // the classifier is not yet ready.
95  virtual void CancelPendingClassification();
96
97 private:
98  // Any score equal to or above this value is considered phishy.
99  static const float kPhishyThreshold;
100
101  // Begins the feature extraction process, by extracting URL features and
102  // beginning DOM feature extraction.
103  void BeginFeatureExtraction();
104
105  // Callback to be run when DOM feature extraction is complete.
106  // If it was successful, begins term feature extraction, otherwise
107  // runs the DoneCallback with a non-phishy verdict.
108  void DOMExtractionFinished(bool success);
109
110  // Callback to be run when term feature extraction is complete.
111  // If it was successful, computes a score and runs the DoneCallback.
112  // If extraction was unsuccessful, runs the DoneCallback with a
113  // non-phishy verdict.
114  void TermExtractionFinished(bool success);
115
116  // Helper to verify that there is no pending phishing classification.  Dies
117  // in debug builds if the state is not as expected.  This is a no-op in
118  // release builds.
119  void CheckNoPendingClassification();
120
121  // Helper method to run the DoneCallback and clear the state.
122  void RunCallback(const ClientPhishingRequest& verdict);
123
124  // Helper to run the DoneCallback when feature extraction has failed.
125  // This always signals a non-phishy verdict for the page, with kInvalidScore.
126  void RunFailureCallback();
127
128  // Clears the current state of the PhishingClassifier.
129  void Clear();
130
131  content::RenderView* render_view_;  // owns us
132  const Scorer* scorer_;  // owned by the caller
133  scoped_ptr<FeatureExtractorClock> clock_;
134  scoped_ptr<PhishingUrlFeatureExtractor> url_extractor_;
135  scoped_ptr<PhishingDOMFeatureExtractor> dom_extractor_;
136  scoped_ptr<PhishingTermFeatureExtractor> term_extractor_;
137
138  // State for any in-progress extraction.
139  scoped_ptr<FeatureMap> features_;
140  scoped_ptr<std::set<uint32> > shingle_hashes_;
141  const base::string16* page_text_;  // owned by the caller
142  DoneCallback done_callback_;
143
144  // Used in scheduling BeginFeatureExtraction tasks.
145  // These pointers are invalidated if classification is cancelled.
146  base::WeakPtrFactory<PhishingClassifier> weak_factory_;
147
148  DISALLOW_COPY_AND_ASSIGN(PhishingClassifier);
149};
150
151}  // namespace safe_browsing
152
153#endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
154