renderer/safe_browsing/phishing_classifier_browsertest.cc

// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// Note that although this is not a "browser" test, it runs as part of
// browser_tests.  This is because WebKit does not work properly if it is
// shut down and re-initialized.  Since browser_tests runs each test in a
// new process, this avoids the problem.
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "chrome/renderer/safe_browsing/phishing_classifier.h"
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include <string>
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "base/bind.h"
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "base/memory/scoped_ptr.h"
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "base/string16.h"
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "base/utf_string_conversions.h"
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "chrome/common/safe_browsing/client_model.pb.h"
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "chrome/common/safe_browsing/csd.pb.h"
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "chrome/renderer/safe_browsing/features.h"
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "chrome/renderer/safe_browsing/scorer.h"
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "content/public/test/render_view_fake_resources_test.h"
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "crypto/sha2.h"
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal#include "testing/gmock/include/gmock/gmock.h"
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigalusing ::testing::AllOf;
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigalusing ::testing::Contains;
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigalusing ::testing::Not;
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigalusing ::testing::Pair;
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigalnamespace safe_browsing {
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigalclass PhishingClassifierTest : public content::RenderViewFakeResourcesTest {
69e17611504376e4d4603925f8528dfc890fd2c6Luis Sigal protected:
  PhishingClassifierTest()
      : url_tld_token_net_(features::kUrlTldToken + std::string("net")),
        page_link_domain_phishing_(features::kPageLinkDomain +
                                   std::string("phishing.com")),
        page_term_login_(features::kPageTerm + std::string("login")) {}

  // Builds the fixture: initializes WebKit and the RenderView through the
  // base fixture, constructs a small ClientSideModel, creates |scorer_| from
  // its serialized form, and creates the |classifier_| under test.
  virtual void SetUp() {
    // Set up WebKit and the RenderView.
    content::RenderViewFakeResourcesTest::SetUp();

    // Construct a model to test with.  We include one feature from each of
    // the feature extractors, which allows us to verify that they all ran.
    ClientSideModel model;

    // The order of these hashes matters: the rule and the page term below
    // refer to them by index.  Each hash is added exactly once; nothing in
    // the model references an index past 3.
    model.add_hashes(crypto::SHA256HashString(url_tld_token_net_));  // 0
    model.add_hashes(
        crypto::SHA256HashString(page_link_domain_phishing_));       // 1
    model.add_hashes(crypto::SHA256HashString(page_term_login_));    // 2
    model.add_hashes(crypto::SHA256HashString("login"));             // 3

    // Add a default rule with a non-phishy weight.
    ClientSideModel::Rule* rule = model.add_rule();
    rule->set_weight(-1.0);

    // To give a phishy score, the total weight needs to be >= 0
    // (0.5 when converted to a probability).  This will only happen
    // if all of the listed features are present.
    rule = model.add_rule();
    rule->add_feature(0);
    rule->add_feature(1);
    rule->add_feature(2);
    rule->set_weight(1.0);

    // Hash index 3 ("login") is the model's only page term; the matching
    // page word is hashed with the model's murmur hash seed.
    model.add_page_term(3);
    model.set_murmur_hash_seed(2777808611U);
    model.add_page_word(MurmurHash3String("login", model.murmur_hash_seed()));
    model.set_max_words_per_term(1);

    scorer_.reset(Scorer::Create(model.SerializeAsString()));
    ASSERT_TRUE(scorer_.get());

    // Allocate the mock clock only once nothing can return early any more:
    // the classifier takes ownership of it, so bailing out between the
    // allocation and the classifier's construction would leak it.
    clock_ = new MockFeatureExtractorClock;
    classifier_.reset(new PhishingClassifier(view(), clock_));
  }

  virtual void TearDown() {
    content::RenderViewFakeResourcesTest::TearDown();
  }

  // Helper that starts phishing classification and runs the message loop
  // until the completion callback quits it.  On return, |phishy_score| holds
  // the verdict's client score and |features| holds a copy of the verdict's
  // feature map.  Returns true if the page is classified as phishy and false
  // otherwise.
  bool RunPhishingClassifier(const string16* page_text,
                             float* phishy_score,
                             FeatureMap* features) {
    // Reset all outputs so that nothing from an earlier run carries over.
    features->Clear();
    *phishy_score = PhishingClassifier::kInvalidScore;
    verdict_.Clear();

    // ClassificationFinished() stores the verdict and quits the loop.
    classifier_->BeginClassification(
        page_text,
        base::Bind(&PhishingClassifierTest::ClassificationFinished,
                   base::Unretained(this)));
    message_loop_.Run();

    // Copy the results out of the stored verdict.
    const int feature_count = verdict_.feature_map_size();
    for (int index = 0; index != feature_count; ++index) {
      features->AddRealFeature(verdict_.feature_map(index).name(),
                               verdict_.feature_map(index).value());
    }
    *phishy_score = verdict_.client_score();
    return verdict_.is_phishing();
  }

  // Completion callback for classification.  Keeps a copy of |verdict| in
  // |verdict_| and then quits the message loop so that the pending
  // message_loop_.Run() call returns.
  void ClassificationFinished(const ClientPhishingRequest& verdict) {
    verdict_.CopyFrom(verdict);
    message_loop_.Quit();
  }

  // Scorer created in SetUp() from the serialized test model.  The tests
  // hand it to the classifier via set_phishing_scorer().
  scoped_ptr<Scorer> scorer_;
  // The classifier under test; created in SetUp() with view() and |clock_|.
  scoped_ptr<PhishingClassifier> classifier_;
  // Mock clock passed to the classifier's constructor in SetUp().
  MockFeatureExtractorClock* clock_;  // owned by classifier_

  // Features that are in the model.  Each one is a features:: name prefix
  // concatenated with a value, built once in the constructor; SetUp() adds
  // their SHA-256 hashes to the model.
  const std::string url_tld_token_net_;
  const std::string page_link_domain_phishing_;
  const std::string page_term_login_;

  // This member holds the status from the most recent call to the
  // ClassificationFinished callback.
  ClientPhishingRequest verdict_;
};

// Runs the classifier against a series of pages and checks the verdict, the
// extracted features and the score for each: a phishy page, a non-phishy
// page, and three cases where classification is expected to fail.
TEST_F(PhishingClassifierTest, TestClassification) {
  static const char kHostNetUrl[] = "http://host.net/";
  static const char kLocalhostUrl[] = "http://localhost/";
  static const char kSecureHostNetUrl[] = "https://host.net/";

  // Until a scorer is installed the classifier reports that it is not ready.
  EXPECT_FALSE(classifier_->is_ready());

  // Installing the scorer makes it ready.
  classifier_->set_phishing_scorer(scorer_.get());
  EXPECT_TRUE(classifier_->is_ready());

  // Extraction timing is not exercised here, so the mock clock hands back
  // the same time on every call.
  EXPECT_CALL(*clock_, Now())
      .WillRepeatedly(::testing::Return(base::TimeTicks::Now()));

  // Case 1: a page that has all three model features is phishy.
  responses_[kHostNetUrl] =
      "<html><body><a href=\"http://phishing.com/\">login</a></body></html>";
  LoadURL(kHostNetUrl);

  const string16 login_text = ASCIIToUTF16("login");
  float score;
  FeatureMap extracted;
  EXPECT_TRUE(RunPhishingClassifier(&login_text, &score, &extracted));
  // Note: extracted.features() might contain other features that simply
  // aren't in the model.
  EXPECT_THAT(extracted.features(),
              AllOf(Contains(Pair(url_tld_token_net_, 1.0)),
                    Contains(Pair(page_link_domain_phishing_, 1.0)),
                    Contains(Pair(page_term_login_, 1.0))));
  EXPECT_FLOAT_EQ(0.5, score);

  // Case 2: pointing the link at a non-phishy domain drops one feature, so
  // the page is no longer phishy.
  responses_[kHostNetUrl] =
      "<html><body><a href=\"http://safe.com/\">login</a></body></html>";
  LoadURL(kHostNetUrl);

  EXPECT_FALSE(RunPhishingClassifier(&login_text, &score, &extracted));
  EXPECT_THAT(extracted.features(),
              AllOf(Contains(Pair(url_tld_token_net_, 1.0)),
                    Contains(Pair(page_term_login_, 1.0))));
  EXPECT_THAT(extracted.features(),
              Not(Contains(Pair(page_link_domain_phishing_, 1.0))));
  EXPECT_GE(score, 0.0);
  EXPECT_LT(score, 0.5);

  // Case 3: extraction should fail here, since the URL has no TLD.
  responses_[kLocalhostUrl] = "<html><body>content</body></html>";
  LoadURL(kLocalhostUrl);
  EXPECT_FALSE(RunPhishingClassifier(&login_text, &score, &extracted));
  EXPECT_EQ(0U, extracted.features().size());
  EXPECT_EQ(PhishingClassifier::kInvalidScore, score);

  // Case 4: extraction should also fail here, because the URL is not http.
  responses_[kSecureHostNetUrl] = "<html><body>secure</body></html>";
  LoadURL(kSecureHostNetUrl);
  EXPECT_FALSE(RunPhishingClassifier(&login_text, &score, &extracted));
  EXPECT_EQ(0U, extracted.features().size());
  EXPECT_EQ(PhishingClassifier::kInvalidScore, score);

  // Case 5: extraction should fail here because the page was loaded with a
  // POST request.
  LoadURLWithPost(kHostNetUrl);
  EXPECT_FALSE(RunPhishingClassifier(&login_text, &score, &extracted));
  EXPECT_EQ(0U, extracted.features().size());
  EXPECT_EQ(PhishingClassifier::kInvalidScore, score);
}

// Checks that is_ready() follows the scorer: false with no scorer, true once
// one is set, and false again after the scorer is replaced with NULL.
TEST_F(PhishingClassifierTest, DisableDetection) {
  PhishingClassifier* const classifier = classifier_.get();

  // Nothing has installed a scorer yet, so the classifier is not ready.
  EXPECT_FALSE(classifier->is_ready());

  // Installing the scorer makes it ready.
  classifier->set_phishing_scorer(scorer_.get());
  EXPECT_TRUE(classifier->is_ready());

  // Replacing the scorer with NULL turns detection back off.
  classifier->set_phishing_scorer(NULL);
  EXPECT_FALSE(classifier->is_ready());
}

}  // namespace safe_browsing