phishing_classifier_browsertest.cc revision 5821806d5e7f356e8fa4b058a389a808ea183019
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// Note that although this is not a "browser" test, it runs as part of
// browser_tests.  This is because WebKit does not work properly if it is
// shut down and re-initialized.  Since browser_tests runs each test in a
// new process, this avoids the problem.
969e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal
1069e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "chrome/renderer/safe_browsing/phishing_classifier.h"
1169e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal
1269e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include <string>
1369e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal
1469e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "base/bind.h"
1569e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "base/memory/scoped_ptr.h"
1669e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "base/string16.h"
1769e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "base/utf_string_conversions.h"
1869e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "chrome/common/safe_browsing/client_model.pb.h"
1969e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "chrome/common/safe_browsing/csd.pb.h"
2069e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "chrome/renderer/safe_browsing/features.h"
2169e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
2269e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
2369e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "chrome/renderer/safe_browsing/scorer.h"
2469e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "content/public/test/render_view_fake_resources_test.h"
2569e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "crypto/sha2.h"
2669e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "testing/gmock/include/gmock/gmock.h"
2769e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal
2869e17611504376e4d4603925f8528dfc890fd2c6Luis Sigalusing ::testing::AllOf;
2969e17611504376e4d4603925f8528dfc890fd2c6Luis Sigalusing ::testing::Contains;
3069e17611504376e4d4603925f8528dfc890fd2c6Luis Sigalusing ::testing::Not;
3169e17611504376e4d4603925f8528dfc890fd2c6Luis Sigalusing ::testing::Pair;
3269e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal
3369e17611504376e4d4603925f8528dfc890fd2c6Luis Sigalnamespace safe_browsing {
3469e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal
3569e17611504376e4d4603925f8528dfc890fd2c6Luis Sigalclass PhishingClassifierTest : public content::RenderViewFakeResourcesTest {
3669e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal protected:
37  PhishingClassifierTest()
38      : url_tld_token_net_(features::kUrlTldToken + std::string("net")),
39        page_link_domain_phishing_(features::kPageLinkDomain +
40                                   std::string("phishing.com")),
41        page_term_login_(features::kPageTerm + std::string("login")) {}
42
43  virtual void SetUp() {
44    // Set up WebKit and the RenderView.
45    content::RenderViewFakeResourcesTest::SetUp();
46
47    // Construct a model to test with.  We include one feature from each of
48    // the feature extractors, which allows us to verify that they all ran.
49    ClientSideModel model;
50
51    model.add_hashes(crypto::SHA256HashString(url_tld_token_net_));
52    model.add_hashes(crypto::SHA256HashString(page_link_domain_phishing_));
53    model.add_hashes(crypto::SHA256HashString(page_term_login_));
54    model.add_hashes(crypto::SHA256HashString("login"));
55    model.add_hashes(crypto::SHA256HashString(features::kUrlTldToken +
56                                              std::string("net")));
57    model.add_hashes(crypto::SHA256HashString(features::kPageLinkDomain +
58                                              std::string("phishing.com")));
59    model.add_hashes(crypto::SHA256HashString(features::kPageTerm +
60                                              std::string("login")));
61    model.add_hashes(crypto::SHA256HashString("login"));
62
63    // Add a default rule with a non-phishy weight.
64    ClientSideModel::Rule* rule = model.add_rule();
65    rule->set_weight(-1.0);
66
67    // To give a phishy score, the total weight needs to be >= 0
68    // (0.5 when converted to a probability).  This will only happen
69    // if all of the listed features are present.
70    rule = model.add_rule();
71    rule->add_feature(0);
72    rule->add_feature(1);
73    rule->add_feature(2);
74    rule->set_weight(1.0);
75
76    model.add_page_term(3);
77    model.set_murmur_hash_seed(2777808611U);
78    model.add_page_word(MurmurHash3String("login", model.murmur_hash_seed()));
79    model.set_max_words_per_term(1);
80
81    clock_ = new MockFeatureExtractorClock;
82    scorer_.reset(Scorer::Create(model.SerializeAsString()));
83    ASSERT_TRUE(scorer_.get());
84    classifier_.reset(new PhishingClassifier(view(), clock_));
85  }
86
87  virtual void TearDown() {
88    content::RenderViewFakeResourcesTest::TearDown();
89  }
90
91  // Helper method to start phishing classification and wait for it to
92  // complete.  Returns the true if the page is classified as phishy and
93  // false otherwise.
94  bool RunPhishingClassifier(const string16* page_text,
95                             float* phishy_score,
96                             FeatureMap* features) {
97    verdict_.Clear();
98    *phishy_score = PhishingClassifier::kInvalidScore;
99    features->Clear();
100
101    classifier_->BeginClassification(
102        page_text,
103        base::Bind(&PhishingClassifierTest::ClassificationFinished,
104                   base::Unretained(this)));
105    message_loop_.Run();
106
107    *phishy_score = verdict_.client_score();
108    for (int i = 0; i < verdict_.feature_map_size(); ++i) {
109      features->AddRealFeature(verdict_.feature_map(i).name(),
110                               verdict_.feature_map(i).value());
111    }
112    return verdict_.is_phishing();
113  }
114
115  // Completion callback for classification.
116  void ClassificationFinished(const ClientPhishingRequest& verdict) {
117    verdict_ = verdict;  // copy the verdict.
118    message_loop_.Quit();
119  }
120
121  scoped_ptr<Scorer> scorer_;
122  scoped_ptr<PhishingClassifier> classifier_;
123  MockFeatureExtractorClock* clock_;  // owned by classifier_
124
125  // Features that are in the model.
126  const std::string url_tld_token_net_;
127  const std::string page_link_domain_phishing_;
128  const std::string page_term_login_;
129
130  // This member holds the status from the most recent call to the
131  // ClassificationFinished callback.
132  ClientPhishingRequest verdict_;
133};
134
135TEST_F(PhishingClassifierTest, TestClassification) {
136  // No scorer yet, so the classifier is not ready.
137  EXPECT_FALSE(classifier_->is_ready());
138
139  // Now set the scorer.
140  classifier_->set_phishing_scorer(scorer_.get());
141  EXPECT_TRUE(classifier_->is_ready());
142
143  // This test doesn't exercise the extraction timing.
144  EXPECT_CALL(*clock_, Now())
145      .WillRepeatedly(::testing::Return(base::TimeTicks::Now()));
146
147  responses_["http://host.net/"] =
148      "<html><body><a href=\"http://phishing.com/\">login</a></body></html>";
149  LoadURL("http://host.net/");
150
151  string16 page_text = ASCIIToUTF16("login");
152  float phishy_score;
153  FeatureMap features;
154  EXPECT_TRUE(RunPhishingClassifier(&page_text, &phishy_score, &features));
155  // Note: features.features() might contain other features that simply aren't
156  // in the model.
157  EXPECT_THAT(features.features(),
158              AllOf(Contains(Pair(url_tld_token_net_, 1.0)),
159                    Contains(Pair(page_link_domain_phishing_, 1.0)),
160                    Contains(Pair(page_term_login_, 1.0))));
161  EXPECT_FLOAT_EQ(0.5, phishy_score);
162
163  // Change the link domain to something non-phishy.
164  responses_["http://host.net/"] =
165      "<html><body><a href=\"http://safe.com/\">login</a></body></html>";
166  LoadURL("http://host.net/");
167
168  EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score, &features));
169  EXPECT_THAT(features.features(),
170              AllOf(Contains(Pair(url_tld_token_net_, 1.0)),
171                    Contains(Pair(page_term_login_, 1.0))));
172  EXPECT_THAT(features.features(),
173              Not(Contains(Pair(page_link_domain_phishing_, 1.0))));
174  EXPECT_GE(phishy_score, 0.0);
175  EXPECT_LT(phishy_score, 0.5);
176
177  // Extraction should fail for this case, since there is no TLD.
178  responses_["http://localhost/"] = "<html><body>content</body></html>";
179  LoadURL("http://localhost/");
180  EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score, &features));
181  EXPECT_EQ(0U, features.features().size());
182  EXPECT_EQ(PhishingClassifier::kInvalidScore, phishy_score);
183
184  // Extraction should also fail for this case, because the URL is not http.
185  responses_["https://host.net/"] = "<html><body>secure</body></html>";
186  LoadURL("https://host.net/");
187  EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score, &features));
188  EXPECT_EQ(0U, features.features().size());
189  EXPECT_EQ(PhishingClassifier::kInvalidScore, phishy_score);
190
191  // Extraction should fail for this case because the URL is a POST request.
192  LoadURLWithPost("http://host.net/");
193  EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score, &features));
194  EXPECT_EQ(0U, features.features().size());
195  EXPECT_EQ(PhishingClassifier::kInvalidScore, phishy_score);
196}
197
198TEST_F(PhishingClassifierTest, DisableDetection) {
199  // No scorer yet, so the classifier is not ready.
200  EXPECT_FALSE(classifier_->is_ready());
201
202  // Now set the scorer.
203  classifier_->set_phishing_scorer(scorer_.get());
204  EXPECT_TRUE(classifier_->is_ready());
205
206  // Set a NULL scorer, which turns detection back off.
207  classifier_->set_phishing_scorer(NULL);
208  EXPECT_FALSE(classifier_->is_ready());
209}
210
211}  // namespace safe_browsing
212