1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/renderer/safe_browsing/phishing_classifier.h"
6
7#include <string>
8
9#include "base/bind.h"
10#include "base/callback.h"
11#include "base/compiler_specific.h"
12#include "base/logging.h"
13#include "base/message_loop/message_loop.h"
14#include "base/metrics/histogram.h"
15#include "base/strings/string_util.h"
16#include "chrome/common/safe_browsing/csd.pb.h"
17#include "chrome/common/url_constants.h"
18#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
19#include "chrome/renderer/safe_browsing/features.h"
20#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
21#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
22#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
23#include "chrome/renderer/safe_browsing/scorer.h"
24#include "content/public/renderer/render_view.h"
25#include "crypto/sha2.h"
26#include "third_party/WebKit/public/platform/WebURL.h"
27#include "third_party/WebKit/public/platform/WebURLRequest.h"
28#include "third_party/WebKit/public/web/WebDataSource.h"
29#include "third_party/WebKit/public/web/WebDocument.h"
30#include "third_party/WebKit/public/web/WebFrame.h"
31#include "third_party/WebKit/public/web/WebView.h"
32#include "url/gurl.h"
33
34namespace safe_browsing {
35
36const float PhishingClassifier::kInvalidScore = -1.0;
37const float PhishingClassifier::kPhishyThreshold = 0.5;
38
39PhishingClassifier::PhishingClassifier(content::RenderView* render_view,
40                                       FeatureExtractorClock* clock)
41    : render_view_(render_view),
42      scorer_(NULL),
43      clock_(clock),
44      weak_factory_(this) {
45  Clear();
46}
47
48PhishingClassifier::~PhishingClassifier() {
49  // The RenderView should have called CancelPendingClassification() before
50  // we are destroyed.
51  CheckNoPendingClassification();
52}
53
54void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
55  CheckNoPendingClassification();
56  scorer_ = scorer;
57  if (scorer_) {
58    url_extractor_.reset(new PhishingUrlFeatureExtractor);
59    dom_extractor_.reset(
60        new PhishingDOMFeatureExtractor(render_view_, clock_.get()));
61    term_extractor_.reset(new PhishingTermFeatureExtractor(
62        &scorer_->page_terms(),
63        &scorer_->page_words(),
64        scorer_->max_words_per_term(),
65        scorer_->murmurhash3_seed(),
66        scorer_->max_shingles_per_page(),
67        scorer_->shingle_size(),
68        clock_.get()));
69  } else {
70    // We're disabling client-side phishing detection, so tear down all
71    // of the relevant objects.
72    url_extractor_.reset();
73    dom_extractor_.reset();
74    term_extractor_.reset();
75  }
76}
77
78bool PhishingClassifier::is_ready() const {
79  return scorer_ != NULL;
80}
81
82void PhishingClassifier::BeginClassification(
83    const base::string16* page_text,
84    const DoneCallback& done_callback) {
85  DCHECK(is_ready());
86
87  // The RenderView should have called CancelPendingClassification() before
88  // starting a new classification, so DCHECK this.
89  CheckNoPendingClassification();
90  // However, in an opt build, we will go ahead and clean up the pending
91  // classification so that we can start in a known state.
92  CancelPendingClassification();
93
94  page_text_ = page_text;
95  done_callback_ = done_callback;
96
97  // For consistency, we always want to invoke the DoneCallback
98  // asynchronously, rather than directly from this method.  To ensure that
99  // this is the case, post a task to begin feature extraction on the next
100  // iteration of the message loop.
101  base::MessageLoop::current()->PostTask(
102      FROM_HERE,
103      base::Bind(&PhishingClassifier::BeginFeatureExtraction,
104                 weak_factory_.GetWeakPtr()));
105}
106
107void PhishingClassifier::BeginFeatureExtraction() {
108  blink::WebView* web_view = render_view_->GetWebView();
109  if (!web_view) {
110    RunFailureCallback();
111    return;
112  }
113
114  blink::WebFrame* frame = web_view->mainFrame();
115  if (!frame) {
116    RunFailureCallback();
117    return;
118  }
119
120  // Check whether the URL is one that we should classify.
121  // Currently, we only classify http: URLs that are GET requests.
122  GURL url(frame->document().url());
123  if (!url.SchemeIs(url::kHttpScheme)) {
124    RunFailureCallback();
125    return;
126  }
127
128  blink::WebDataSource* ds = frame->dataSource();
129  if (!ds || !EqualsASCII(ds->request().httpMethod(), "GET")) {
130    RunFailureCallback();
131    return;
132  }
133
134  features_.reset(new FeatureMap);
135  if (!url_extractor_->ExtractFeatures(url, features_.get())) {
136    RunFailureCallback();
137    return;
138  }
139
140  // DOM feature extraction can take awhile, so it runs asynchronously
141  // in several chunks of work and invokes the callback when finished.
142  dom_extractor_->ExtractFeatures(
143      features_.get(),
144      base::Bind(&PhishingClassifier::DOMExtractionFinished,
145                 base::Unretained(this)));
146}
147
148void PhishingClassifier::CancelPendingClassification() {
149  // Note that cancelling the feature extractors is simply a no-op if they
150  // were not running.
151  DCHECK(is_ready());
152  dom_extractor_->CancelPendingExtraction();
153  term_extractor_->CancelPendingExtraction();
154  weak_factory_.InvalidateWeakPtrs();
155  Clear();
156}
157
158void PhishingClassifier::DOMExtractionFinished(bool success) {
159  shingle_hashes_.reset(new std::set<uint32>);
160  if (success) {
161    // Term feature extraction can take awhile, so it runs asynchronously
162    // in several chunks of work and invokes the callback when finished.
163    term_extractor_->ExtractFeatures(
164        page_text_,
165        features_.get(),
166        shingle_hashes_.get(),
167        base::Bind(&PhishingClassifier::TermExtractionFinished,
168                   base::Unretained(this)));
169  } else {
170    RunFailureCallback();
171  }
172}
173
174void PhishingClassifier::TermExtractionFinished(bool success) {
175  if (success) {
176    blink::WebView* web_view = render_view_->GetWebView();
177    if (!web_view) {
178      RunFailureCallback();
179      return;
180    }
181    blink::WebFrame* main_frame = web_view->mainFrame();
182    if (!main_frame) {
183      RunFailureCallback();
184      return;
185    }
186
187    // Hash all of the features so that they match the model, then compute
188    // the score.
189    FeatureMap hashed_features;
190    ClientPhishingRequest verdict;
191    verdict.set_model_version(scorer_->model_version());
192    verdict.set_url(main_frame->document().url().spec());
193    for (base::hash_map<std::string, double>::const_iterator it =
194             features_->features().begin();
195         it != features_->features().end(); ++it) {
196      VLOG(2) << "Feature: " << it->first << " = " << it->second;
197      bool result = hashed_features.AddRealFeature(
198          crypto::SHA256HashString(it->first), it->second);
199      DCHECK(result);
200      ClientPhishingRequest::Feature* feature = verdict.add_feature_map();
201      feature->set_name(it->first);
202      feature->set_value(it->second);
203    }
204    for (std::set<uint32>::const_iterator it = shingle_hashes_->begin();
205         it != shingle_hashes_->end(); ++it) {
206      verdict.add_shingle_hashes(*it);
207    }
208    float score = static_cast<float>(scorer_->ComputeScore(hashed_features));
209    verdict.set_client_score(score);
210    verdict.set_is_phishing(score >= kPhishyThreshold);
211    RunCallback(verdict);
212  } else {
213    RunFailureCallback();
214  }
215}
216
217void PhishingClassifier::CheckNoPendingClassification() {
218  DCHECK(done_callback_.is_null());
219  DCHECK(!page_text_);
220  if (!done_callback_.is_null() || page_text_) {
221    LOG(ERROR) << "Classification in progress, missing call to "
222               << "CancelPendingClassification";
223    UMA_HISTOGRAM_COUNTS("SBClientPhishing.CheckNoPendingClassificationFailed",
224                         1);
225  }
226}
227
228void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) {
229  done_callback_.Run(verdict);
230  Clear();
231}
232
233void PhishingClassifier::RunFailureCallback() {
234  ClientPhishingRequest verdict;
235  // In this case we're not guaranteed to have a valid URL.  Just set it
236  // to the empty string to make sure we have a valid protocol buffer.
237  verdict.set_url("");
238  verdict.set_client_score(kInvalidScore);
239  verdict.set_is_phishing(false);
240  RunCallback(verdict);
241}
242
243void PhishingClassifier::Clear() {
244  page_text_ = NULL;
245  done_callback_.Reset();
246  features_.reset(NULL);
247  shingle_hashes_.reset(NULL);
248}
249
250}  // namespace safe_browsing
251