1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/renderer/safe_browsing/phishing_classifier_delegate.h"
6
7#include <set>
8
9#include "base/bind.h"
10#include "base/callback.h"
11#include "base/lazy_instance.h"
12#include "base/logging.h"
13#include "base/metrics/histogram.h"
14#include "chrome/common/safe_browsing/csd.pb.h"
15#include "chrome/common/safe_browsing/safebrowsing_messages.h"
16#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
17#include "chrome/renderer/safe_browsing/phishing_classifier.h"
18#include "chrome/renderer/safe_browsing/scorer.h"
19#include "content/public/renderer/document_state.h"
20#include "content/public/renderer/navigation_state.h"
21#include "content/public/renderer/render_thread.h"
22#include "content/public/renderer/render_view.h"
23#include "third_party/WebKit/public/platform/WebURL.h"
24#include "third_party/WebKit/public/web/WebDocument.h"
25#include "third_party/WebKit/public/web/WebLocalFrame.h"
26#include "third_party/WebKit/public/web/WebView.h"
27
28using content::DocumentState;
29using content::NavigationState;
30using content::RenderThread;
31
32namespace safe_browsing {
33
34static GURL StripRef(const GURL& url) {
35  GURL::Replacements replacements;
36  replacements.ClearRef();
37  return url.ReplaceComponents(replacements);
38}
39
// Registry of live PhishingClassifierDelegate objects.  Each delegate inserts
// itself in its constructor and erases itself in its destructor, so
// OnSetPhishingModel() can push a new scorer to every one of them.
typedef std::set<PhishingClassifierDelegate*> PhishingClassifierDelegates;
static base::LazyInstance<PhishingClassifierDelegates>
    g_delegates = LAZY_INSTANCE_INITIALIZER;

// Owns the scorer most recently installed by OnSetPhishingModel().  Holds NULL
// until a model arrives, and again after an empty model is received.  Newly
// constructed delegates pick up the current scorer from here.
static base::LazyInstance<scoped_ptr<const safe_browsing::Scorer> >
    g_phishing_scorer = LAZY_INSTANCE_INITIALIZER;
46
47// static
48PhishingClassifierFilter* PhishingClassifierFilter::Create() {
49  // Private constructor and public static Create() method to facilitate
50  // stubbing out this class for binary-size reduction purposes.
51  return new PhishingClassifierFilter();
52}
53
54PhishingClassifierFilter::PhishingClassifierFilter()
55    : RenderProcessObserver() {}
56
57PhishingClassifierFilter::~PhishingClassifierFilter() {}
58
59bool PhishingClassifierFilter::OnControlMessageReceived(
60    const IPC::Message& message) {
61  bool handled = true;
62  IPC_BEGIN_MESSAGE_MAP(PhishingClassifierFilter, message)
63    IPC_MESSAGE_HANDLER(SafeBrowsingMsg_SetPhishingModel, OnSetPhishingModel)
64    IPC_MESSAGE_UNHANDLED(handled = false)
65  IPC_END_MESSAGE_MAP()
66  return handled;
67}
68
69void PhishingClassifierFilter::OnSetPhishingModel(const std::string& model) {
70  safe_browsing::Scorer* scorer = NULL;
71  // An empty model string means we should disable client-side phishing
72  // detection.
73  if (!model.empty()) {
74    scorer = safe_browsing::Scorer::Create(model);
75    if (!scorer) {
76      DLOG(ERROR) << "Unable to create a PhishingScorer - corrupt model?";
77      return;
78    }
79  }
80  PhishingClassifierDelegates::iterator i;
81  for (i = g_delegates.Get().begin(); i != g_delegates.Get().end(); ++i) {
82    (*i)->SetPhishingScorer(scorer);
83  }
84  g_phishing_scorer.Get().reset(scorer);
85}
86
87// static
88PhishingClassifierDelegate* PhishingClassifierDelegate::Create(
89    content::RenderView* render_view, PhishingClassifier* classifier) {
90  // Private constructor and public static Create() method to facilitate
91  // stubbing out this class for binary-size reduction purposes.
92  return new PhishingClassifierDelegate(render_view, classifier);
93}
94
95PhishingClassifierDelegate::PhishingClassifierDelegate(
96    content::RenderView* render_view,
97    PhishingClassifier* classifier)
98    : content::RenderViewObserver(render_view),
99      last_main_frame_transition_(ui::PAGE_TRANSITION_LINK),
100      have_page_text_(false),
101      is_classifying_(false) {
102  g_delegates.Get().insert(this);
103  if (!classifier) {
104    classifier = new PhishingClassifier(render_view,
105                                        new FeatureExtractorClock());
106  }
107
108  classifier_.reset(classifier);
109
110  if (g_phishing_scorer.Get().get())
111    SetPhishingScorer(g_phishing_scorer.Get().get());
112}
113
114PhishingClassifierDelegate::~PhishingClassifierDelegate() {
115  CancelPendingClassification(SHUTDOWN);
116  g_delegates.Get().erase(this);
117}
118
119void PhishingClassifierDelegate::SetPhishingScorer(
120    const safe_browsing::Scorer* scorer) {
121  if (!render_view()->GetWebView())
122    return;  // RenderView is tearing down.
123  if (is_classifying_) {
124    // If there is a classification going on right now it means we're
125    // actually replacing an existing scorer with a new model.  In
126    // this case we simply cancel the current classification.
127    // TODO(noelutz): if this happens too frequently we could also
128    // replace the old scorer with the new one once classification is done
129    // but this would complicate the code somewhat.
130    CancelPendingClassification(NEW_PHISHING_SCORER);
131  }
132  classifier_->set_phishing_scorer(scorer);
133  // Start classifying the current page if all conditions are met.
134  // See MaybeStartClassification() for details.
135  MaybeStartClassification();
136}
137
138void PhishingClassifierDelegate::OnStartPhishingDetection(const GURL& url) {
139  last_url_received_from_browser_ = StripRef(url);
140  // Start classifying the current page if all conditions are met.
141  // See MaybeStartClassification() for details.
142  MaybeStartClassification();
143}
144
145void PhishingClassifierDelegate::DidCommitProvisionalLoad(
146    blink::WebLocalFrame* frame, bool is_new_navigation) {
147  // A new page is starting to load, so cancel classificaiton.
148  //
149  // TODO(bryner): We shouldn't need to cancel classification if the navigation
150  // is within the same page.  However, if we let classification continue in
151  // this case, we need to properly deal with the fact that PageCaptured will
152  // be called again for the in-page navigation.  We need to be sure not to
153  // swap out the page text while the term feature extractor is still running.
154  DocumentState* document_state = DocumentState::FromDataSource(
155      frame->dataSource());
156  NavigationState* navigation_state = document_state->navigation_state();
157  CancelPendingClassification(navigation_state->was_within_same_page() ?
158                              NAVIGATE_WITHIN_PAGE : NAVIGATE_AWAY);
159  if (frame == render_view()->GetWebView()->mainFrame()) {
160    last_main_frame_transition_ = navigation_state->transition_type();
161  }
162}
163
164void PhishingClassifierDelegate::PageCaptured(base::string16* page_text,
165                                              bool preliminary_capture) {
166  if (preliminary_capture) {
167    return;
168  }
169  // Make sure there's no classification in progress.  We don't want to swap
170  // out the page text string from underneath the term feature extractor.
171  //
172  // Note: Currently, if the url hasn't changed, we won't restart
173  // classification in this case.  We may want to adjust this.
174  CancelPendingClassification(PAGE_RECAPTURED);
175  last_finished_load_url_ = GetToplevelUrl();
176  classifier_page_text_.swap(*page_text);
177  have_page_text_ = true;
178  MaybeStartClassification();
179}
180
181void PhishingClassifierDelegate::CancelPendingClassification(
182    CancelClassificationReason reason) {
183  if (is_classifying_) {
184    UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.CancelClassificationReason",
185                              reason,
186                              CANCEL_CLASSIFICATION_MAX);
187    is_classifying_ = false;
188  }
189  if (classifier_->is_ready()) {
190    classifier_->CancelPendingClassification();
191  }
192  classifier_page_text_.clear();
193  have_page_text_ = false;
194}
195
196bool PhishingClassifierDelegate::OnMessageReceived(
197    const IPC::Message& message) {
198  bool handled = true;
199  IPC_BEGIN_MESSAGE_MAP(PhishingClassifierDelegate, message)
200    IPC_MESSAGE_HANDLER(SafeBrowsingMsg_StartPhishingDetection,
201                        OnStartPhishingDetection)
202    IPC_MESSAGE_UNHANDLED(handled = false)
203  IPC_END_MESSAGE_MAP()
204  return handled;
205}
206
207void PhishingClassifierDelegate::ClassificationDone(
208    const ClientPhishingRequest& verdict) {
209  // We no longer need the page text.
210  classifier_page_text_.clear();
211  VLOG(2) << "Phishy verdict = " << verdict.is_phishing()
212          << " score = " << verdict.client_score();
213  if (verdict.client_score() != PhishingClassifier::kInvalidScore) {
214    DCHECK_EQ(last_url_sent_to_classifier_.spec(), verdict.url());
215    RenderThread::Get()->Send(new SafeBrowsingHostMsg_PhishingDetectionDone(
216        routing_id(), verdict.SerializeAsString()));
217  }
218}
219
220GURL PhishingClassifierDelegate::GetToplevelUrl() {
221  return render_view()->GetWebView()->mainFrame()->document().url();
222}
223
224void PhishingClassifierDelegate::MaybeStartClassification() {
225  // We can begin phishing classification when the following conditions are
226  // met:
227  //  1. A Scorer has been created
228  //  2. The browser has sent a StartPhishingDetection message for the current
229  //     toplevel URL.
230  //  3. The page has finished loading and the page text has been extracted.
231  //  4. The load is a new navigation (not a session history navigation).
232  //  5. The toplevel URL has not already been classified.
233  //
234  // Note that if we determine that this particular navigation should not be
235  // classified at all (as opposed to deferring it until we get an IPC or the
236  // load completes), we discard the page text since it won't be needed.
237  if (!classifier_->is_ready()) {
238    VLOG(2) << "Not starting classification, no Scorer created.";
239    // Keep classifier_page_text_, in case a Scorer is set later.
240    return;
241  }
242
243  if (last_main_frame_transition_ & ui::PAGE_TRANSITION_FORWARD_BACK) {
244    // Skip loads from session history navigation.  However, update the
245    // last URL sent to the classifier, so that we'll properly detect
246    // in-page navigations.
247    VLOG(2) << "Not starting classification for back/forward navigation";
248    last_url_sent_to_classifier_ = last_finished_load_url_;
249    classifier_page_text_.clear();  // we won't need this.
250    have_page_text_ = false;
251    return;
252  }
253
254  GURL stripped_last_load_url(StripRef(last_finished_load_url_));
255  if (stripped_last_load_url == StripRef(last_url_sent_to_classifier_)) {
256    // We've already classified this toplevel URL, so this was likely an
257    // in-page navigation or a subframe navigation.  The browser should not
258    // send a StartPhishingDetection IPC in this case.
259    VLOG(2) << "Toplevel URL is unchanged, not starting classification.";
260    classifier_page_text_.clear();  // we won't need this.
261    have_page_text_ = false;
262    return;
263  }
264
265  if (!have_page_text_) {
266    VLOG(2) << "Not starting classification, there is no page text ready.";
267    return;
268  }
269
270  if (last_url_received_from_browser_ != stripped_last_load_url) {
271    // The browser has not yet confirmed that this URL should be classified,
272    // so defer classification for now.  Note: the ref does not affect
273    // any of the browser's preclassification checks, so we don't require it
274    // to match.
275    VLOG(2) << "Not starting classification, last url from browser is "
276            << last_url_received_from_browser_ << ", last finished load is "
277            << last_finished_load_url_;
278    // Keep classifier_page_text_, in case the browser notifies us later that
279    // we should classify the URL.
280    return;
281  }
282
283  VLOG(2) << "Starting classification for " << last_finished_load_url_;
284  last_url_sent_to_classifier_ = last_finished_load_url_;
285  is_classifying_ = true;
286  classifier_->BeginClassification(
287      &classifier_page_text_,
288      base::Bind(&PhishingClassifierDelegate::ClassificationDone,
289                 base::Unretained(this)));
290}
291
292}  // namespace safe_browsing
293