// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/scorer.h"
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
77d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/containers/hash_tables.h"
82a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "base/files/file_path.h"
92a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "base/files/scoped_temp_dir.h"
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/format_macros.h"
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h"
129ab5563a3196760eb381d102cbb2bc0f7abc6a50Ben Murdoch#include "base/message_loop/message_loop.h"
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/threading/thread.h"
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/common/safe_browsing/client_model.pb.h"
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/features.h"
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "testing/gmock/include/gmock/gmock.h"
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "testing/gtest/include/gtest/gtest.h"
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing {
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class PhishingScorerTest : public ::testing::Test {
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) protected:
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void SetUp() {
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Setup a simple model.  Note that the scorer does not care about
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // how features are encoded so we use readable strings here to make
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // the test simpler to follow.
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    model_.Clear();
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    model_.add_hashes("feature1");
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    model_.add_hashes("feature2");
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    model_.add_hashes("feature3");
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    model_.add_hashes("token one");
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    model_.add_hashes("token two");
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ClientSideModel::Rule* rule;
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    rule = model_.add_rule();
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    rule->set_weight(0.5);
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    rule = model_.add_rule();
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    rule->add_feature(0);  // feature1
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    rule->set_weight(2.0);
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    rule = model_.add_rule();
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    rule->add_feature(0);  // feature1
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    rule->add_feature(1);  // feature2
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    rule->set_weight(3.0);
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    model_.add_page_term(3);  // token one
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    model_.add_page_term(4);  // token two
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // These will be murmur3 hashes, but for this test it's not necessary
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // that the hashes correspond to actual words.
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    model_.add_page_word(1000U);
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    model_.add_page_word(2000U);
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    model_.add_page_word(3000U);
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    model_.set_max_words_per_term(2);
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    model_.set_murmur_hash_seed(12345U);
58cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)    model_.set_max_shingles_per_page(10);
59cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)    model_.set_shingle_size(3);
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ClientSideModel model_;
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST_F(PhishingScorerTest, HasValidModel) {
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scoped_ptr<Scorer> scorer;
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scorer.reset(Scorer::Create(model_.SerializeAsString()));
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_TRUE(scorer.get() != NULL);
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Invalid model string.
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scorer.reset(Scorer::Create("bogus string"));
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_FALSE(scorer.get());
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Mode is missing a required field.
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  model_.clear_max_words_per_term();
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scorer.reset(Scorer::Create(model_.SerializePartialAsString()));
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_FALSE(scorer.get());
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST_F(PhishingScorerTest, PageTerms) {
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scoped_ptr<Scorer> scorer(Scorer::Create(model_.SerializeAsString()));
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ASSERT_TRUE(scorer.get());
83116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch
84116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  // Use std::vector instead of base::hash_set for comparison.
85116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  // On Android, EXPECT_THAT(..., ContainerEq(...)) doesn't support
86116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  // std::hash_set, but std::vector works fine.
87116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  std::vector<std::string> expected_page_terms;
88116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  expected_page_terms.push_back("token one");
89116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  expected_page_terms.push_back("token two");
90116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  std::sort(expected_page_terms.begin(), expected_page_terms.end());
91116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch
92116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  base::hash_set<std::string> page_terms = scorer->page_terms();
93116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  std::vector<std::string> page_terms_v(page_terms.begin(), page_terms.end());
94116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  std::sort(page_terms_v.begin(), page_terms_v.end());
95116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch
96116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  EXPECT_THAT(page_terms_v, ::testing::ContainerEq(expected_page_terms));
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST_F(PhishingScorerTest, PageWords) {
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scoped_ptr<Scorer> scorer(Scorer::Create(model_.SerializeAsString()));
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ASSERT_TRUE(scorer.get());
102116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  std::vector<uint32> expected_page_words;
103116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  expected_page_words.push_back(1000U);
104116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  expected_page_words.push_back(2000U);
105116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  expected_page_words.push_back(3000U);
106116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  std::sort(expected_page_words.begin(), expected_page_words.end());
107116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch
108116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  base::hash_set<uint32> page_words = scorer->page_words();
109116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  std::vector<uint32> page_words_v(page_words.begin(), page_words.end());
110116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  std::sort(page_words_v.begin(), page_words_v.end());
111116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch
112116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  EXPECT_THAT(page_words_v, ::testing::ContainerEq(expected_page_words));
113116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_EQ(2U, scorer->max_words_per_term());
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_EQ(12345U, scorer->murmurhash3_seed());
116cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  EXPECT_EQ(10U, scorer->max_shingles_per_page());
117cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  EXPECT_EQ(3U, scorer->shingle_size());
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST_F(PhishingScorerTest, ComputeScore) {
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scoped_ptr<Scorer> scorer(Scorer::Create(model_.SerializeAsString()));
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ASSERT_TRUE(scorer.get());
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // An empty feature map should match the empty rule.
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FeatureMap features;
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The expected logodds is 0.5 (empty rule) => p = exp(0.5) / (exp(0.5) + 1)
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // => 0.62245933120185459
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_DOUBLE_EQ(0.62245933120185459, scorer->ComputeScore(features));
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Same if the feature does not match any rule.
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_TRUE(features.AddBooleanFeature("not existing feature"));
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_DOUBLE_EQ(0.62245933120185459, scorer->ComputeScore(features));
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Feature 1 matches which means that the logodds will be:
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   0.5 (empty rule) + 2.0 (rule weight) * 0.15 (feature weight) = 0.8
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   => p = 0.6899744811276125
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_TRUE(features.AddRealFeature("feature1", 0.15));
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_DOUBLE_EQ(0.6899744811276125, scorer->ComputeScore(features));
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Now, both feature 1 and feature 2 match.  Expected logodds:
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   0.5 (empty rule) + 2.0 (rule weight) * 0.15 (feature weight) +
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   3.0 (rule weight) * 0.15 (feature1 weight) * 1.0 (feature2) weight = 9.8
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   => p = 0.99999627336071584
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_TRUE(features.AddBooleanFeature("feature2"));
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_DOUBLE_EQ(0.77729986117469119, scorer->ComputeScore(features));
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace safe_browsing
147