// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/renderer/safe_browsing/scorer.h"

#include <algorithm>
#include <string>
#include <vector>

#include "base/containers/hash_tables.h"
#include "base/files/file_path.h"
#include "base/files/scoped_temp_dir.h"
#include "base/format_macros.h"
#include "base/memory/scoped_ptr.h"
#include "base/message_loop/message_loop.h"
#include "base/threading/thread.h"
#include "chrome/common/safe_browsing/client_model.pb.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"

namespace safe_browsing {

215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class PhishingScorerTest : public ::testing::Test { 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) protected: 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual void SetUp() { 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Setup a simple model. Note that the scorer does not care about 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // how features are encoded so we use readable strings here to make 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the test simpler to follow. 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) model_.Clear(); 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) model_.add_hashes("feature1"); 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) model_.add_hashes("feature2"); 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) model_.add_hashes("feature3"); 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) model_.add_hashes("token one"); 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) model_.add_hashes("token two"); 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientSideModel::Rule* rule; 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rule = model_.add_rule(); 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rule->set_weight(0.5); 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rule = model_.add_rule(); 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rule->add_feature(0); // feature1 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rule->set_weight(2.0); 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rule = 
model_.add_rule(); 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rule->add_feature(0); // feature1 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rule->add_feature(1); // feature2 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rule->set_weight(3.0); 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) model_.add_page_term(3); // token one 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) model_.add_page_term(4); // token two 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // These will be murmur3 hashes, but for this test it's not necessary 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // that the hashes correspond to actual words. 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) model_.add_page_word(1000U); 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) model_.add_page_word(2000U); 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) model_.add_page_word(3000U); 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) model_.set_max_words_per_term(2); 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) model_.set_murmur_hash_seed(12345U); 58cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) model_.set_max_shingles_per_page(10); 59cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) model_.set_shingle_size(3); 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientSideModel model_; 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST_F(PhishingScorerTest, HasValidModel) { 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) scoped_ptr<Scorer> scorer; 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) scorer.reset(Scorer::Create(model_.SerializeAsString())); 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_TRUE(scorer.get() != NULL); 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Invalid model string. 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) scorer.reset(Scorer::Create("bogus string")); 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_FALSE(scorer.get()); 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Mode is missing a required field. 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) model_.clear_max_words_per_term(); 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) scorer.reset(Scorer::Create(model_.SerializePartialAsString())); 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_FALSE(scorer.get()); 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST_F(PhishingScorerTest, PageTerms) { 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) scoped_ptr<Scorer> scorer(Scorer::Create(model_.SerializeAsString())); 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ASSERT_TRUE(scorer.get()); 83116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch 84116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch // Use std::vector instead of base::hash_set for comparison. 
85116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch // On Android, EXPECT_THAT(..., ContainerEq(...)) doesn't support 86116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch // std::hash_set, but std::vector works fine. 87116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch std::vector<std::string> expected_page_terms; 88116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch expected_page_terms.push_back("token one"); 89116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch expected_page_terms.push_back("token two"); 90116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch std::sort(expected_page_terms.begin(), expected_page_terms.end()); 91116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch 92116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch base::hash_set<std::string> page_terms = scorer->page_terms(); 93116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch std::vector<std::string> page_terms_v(page_terms.begin(), page_terms.end()); 94116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch std::sort(page_terms_v.begin(), page_terms_v.end()); 95116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch 96116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch EXPECT_THAT(page_terms_v, ::testing::ContainerEq(expected_page_terms)); 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST_F(PhishingScorerTest, PageWords) { 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) scoped_ptr<Scorer> scorer(Scorer::Create(model_.SerializeAsString())); 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ASSERT_TRUE(scorer.get()); 102116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch std::vector<uint32> expected_page_words; 103116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch expected_page_words.push_back(1000U); 104116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch expected_page_words.push_back(2000U); 
105116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch expected_page_words.push_back(3000U); 106116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch std::sort(expected_page_words.begin(), expected_page_words.end()); 107116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch 108116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch base::hash_set<uint32> page_words = scorer->page_words(); 109116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch std::vector<uint32> page_words_v(page_words.begin(), page_words.end()); 110116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch std::sort(page_words_v.begin(), page_words_v.end()); 111116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch 112116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch EXPECT_THAT(page_words_v, ::testing::ContainerEq(expected_page_words)); 113116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(2U, scorer->max_words_per_term()); 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(12345U, scorer->murmurhash3_seed()); 116cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) EXPECT_EQ(10U, scorer->max_shingles_per_page()); 117cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) EXPECT_EQ(3U, scorer->shingle_size()); 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST_F(PhishingScorerTest, ComputeScore) { 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) scoped_ptr<Scorer> scorer(Scorer::Create(model_.SerializeAsString())); 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ASSERT_TRUE(scorer.get()); 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // An empty feature map should match the empty rule. 
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FeatureMap features; 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The expected logodds is 0.5 (empty rule) => p = exp(0.5) / (exp(0.5) + 1) 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // => 0.62245933120185459 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_DOUBLE_EQ(0.62245933120185459, scorer->ComputeScore(features)); 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Same if the feature does not match any rule. 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_TRUE(features.AddBooleanFeature("not existing feature")); 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_DOUBLE_EQ(0.62245933120185459, scorer->ComputeScore(features)); 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Feature 1 matches which means that the logodds will be: 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 0.5 (empty rule) + 2.0 (rule weight) * 0.15 (feature weight) = 0.8 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // => p = 0.6899744811276125 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_TRUE(features.AddRealFeature("feature1", 0.15)); 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_DOUBLE_EQ(0.6899744811276125, scorer->ComputeScore(features)); 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Now, both feature 1 and feature 2 match. 
Expected logodds: 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 0.5 (empty rule) + 2.0 (rule weight) * 0.15 (feature weight) + 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 3.0 (rule weight) * 0.15 (feature1 weight) * 1.0 (feature2) weight = 9.8 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // => p = 0.99999627336071584 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_TRUE(features.AddBooleanFeature("feature2")); 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_DOUBLE_EQ(0.77729986117469119, scorer->ComputeScore(features)); 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace safe_browsing 147