1// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4// 5// This class handles the process of extracting all of the features from a 6// page and computing a phishyness score. The basic steps are: 7// - Run each feature extractor over the page, building up a FeatureMap of 8// feature -> value. 9// - SHA-256 hash all of the feature names in the map so that they match the 10// supplied model. 11// - Hand the hashed map off to a Scorer, which computes the probability that 12// the page is phishy. 13// - If the page is phishy, run the supplied callback. 14// 15// For more details, see phishing_*_feature_extractor.h, scorer.h, and 16// client_model.proto. 17 18#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_ 19#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_ 20 21#include <set> 22 23#include "base/basictypes.h" 24#include "base/callback.h" 25#include "base/memory/scoped_ptr.h" 26#include "base/memory/weak_ptr.h" 27#include "base/strings/string16.h" 28 29namespace content { 30class RenderView; 31} 32 33namespace safe_browsing { 34class ClientPhishingRequest; 35class FeatureExtractorClock; 36class FeatureMap; 37class PhishingDOMFeatureExtractor; 38class PhishingTermFeatureExtractor; 39class PhishingUrlFeatureExtractor; 40class Scorer; 41 42class PhishingClassifier { 43 public: 44 // Callback to be run when phishing classification finishes. The verdict 45 // is a ClientPhishingRequest which contains the verdict computed by the 46 // classifier as well as the extracted features. If the verdict.is_phishing() 47 // is true, the page is considered phishy by the client-side model, 48 // and the browser should ping back to get a final verdict. The 49 // verdict.client_score() is set to kInvalidScore if classification failed. 50 typedef base::Callback<void(const ClientPhishingRequest& /* verdict */)> 51 DoneCallback; 52 53 static const float kInvalidScore; 54 55 // Creates a new PhishingClassifier object that will operate on 56 // |render_view|. |clock| is used to time feature extractor operations, and 57 // the PhishingClassifier takes ownership of this object. Note that the 58 // classifier will not be 'ready' until set_phishing_scorer() is called. 59 PhishingClassifier(content::RenderView* render_view, 60 FeatureExtractorClock* clock); 61 virtual ~PhishingClassifier(); 62 63 // Sets a scorer for the classifier to use in computing the phishiness score. 64 // This must live at least as long as the PhishingClassifier. The caller is 65 // expected to cancel any pending classification before setting a phishing 66 // scorer. 67 void set_phishing_scorer(const Scorer* scorer); 68 69 // Returns true if the classifier is ready to classify pages, i.e. it 70 // has had a scorer set via set_phishing_scorer(). 71 bool is_ready() const; 72 73 // Called by the RenderView when a page has finished loading. This begins 74 // the feature extraction and scoring process. |page_text| should contain 75 // the plain text of a web page, including any subframes, as returned by 76 // RenderView::CaptureText(). |page_text| is owned by the caller, and must 77 // not be destroyed until either |done_callback| is run or 78 // CancelPendingClassification() is called. 79 // 80 // To avoid blocking the render thread for too long, phishing classification 81 // may run in several chunks of work, posting a task to the current 82 // MessageLoop to continue processing. Once the scoring process is complete, 83 // |done_callback| is run on the current thread. PhishingClassifier takes 84 // ownership of the callback. 85 // 86 // It is an error to call BeginClassification if the classifier is not yet 87 // ready. 88 virtual void BeginClassification(const base::string16* page_text, 89 const DoneCallback& callback); 90 91 // Called by the RenderView (on the render thread) when a page is unloading 92 // or the RenderView is being destroyed. This cancels any extraction that 93 // is in progress. It is an error to call CancelPendingClassification if 94 // the classifier is not yet ready. 95 virtual void CancelPendingClassification(); 96 97 private: 98 // Any score equal to or above this value is considered phishy. 99 static const float kPhishyThreshold; 100 101 // Begins the feature extraction process, by extracting URL features and 102 // beginning DOM feature extraction. 103 void BeginFeatureExtraction(); 104 105 // Callback to be run when DOM feature extraction is complete. 106 // If it was successful, begins term feature extraction, otherwise 107 // runs the DoneCallback with a non-phishy verdict. 108 void DOMExtractionFinished(bool success); 109 110 // Callback to be run when term feature extraction is complete. 111 // If it was successful, computes a score and runs the DoneCallback. 112 // If extraction was unsuccessful, runs the DoneCallback with a 113 // non-phishy verdict. 114 void TermExtractionFinished(bool success); 115 116 // Helper to verify that there is no pending phishing classification. Dies 117 // in debug builds if the state is not as expected. This is a no-op in 118 // release builds. 119 void CheckNoPendingClassification(); 120 121 // Helper method to run the DoneCallback and clear the state. 122 void RunCallback(const ClientPhishingRequest& verdict); 123 124 // Helper to run the DoneCallback when feature extraction has failed. 125 // This always signals a non-phishy verdict for the page, with kInvalidScore. 126 void RunFailureCallback(); 127 128 // Clears the current state of the PhishingClassifier. 129 void Clear(); 130 131 content::RenderView* render_view_; // owns us 132 const Scorer* scorer_; // owned by the caller 133 scoped_ptr<FeatureExtractorClock> clock_; 134 scoped_ptr<PhishingUrlFeatureExtractor> url_extractor_; 135 scoped_ptr<PhishingDOMFeatureExtractor> dom_extractor_; 136 scoped_ptr<PhishingTermFeatureExtractor> term_extractor_; 137 138 // State for any in-progress extraction. 139 scoped_ptr<FeatureMap> features_; 140 scoped_ptr<std::set<uint32> > shingle_hashes_; 141 const base::string16* page_text_; // owned by the caller 142 DoneCallback done_callback_; 143 144 // Used in scheduling BeginFeatureExtraction tasks. 145 // These pointers are invalidated if classification is cancelled. 146 base::WeakPtrFactory<PhishingClassifier> weak_factory_; 147 148 DISALLOW_COPY_AND_ASSIGN(PhishingClassifier); 149}; 150 151} // namespace safe_browsing 152 153#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_ 154