15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h" 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string> 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector> 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/features.h" 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/test_utils.h" 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "testing/gmock/include/gmock/gmock.h" 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "testing/gtest/include/gtest/gtest.h" 137dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "url/gurl.h" 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)using ::testing::ElementsAre; 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing { 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class PhishingUrlFeatureExtractorTest : public ::testing::Test { 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) protected: 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PhishingUrlFeatureExtractor extractor_; 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void SplitStringIntoLongAlphanumTokens(const std::string& full, 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<std::string>* tokens) { 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(full, 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) tokens); 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST_F(PhishingUrlFeatureExtractorTest, ExtractFeatures) { 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string url = "http://123.0.0.1/mydocuments/a.file.html"; 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FeatureMap expected_features; 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlHostIsIpAddress); 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlPathToken + 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("mydocuments")); 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlPathToken + 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("file")); 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlPathToken + 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("html")); 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FeatureMap features; 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features)); 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ExpectFeatureMapsAreEqual(features, expected_features); 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url = "http://www.www.cnn.co.uk/sports/sports/index.html?shouldnotappear"; 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.Clear(); 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlTldToken + 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("co.uk")); 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlDomainToken + 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("cnn")); 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlOtherHostToken + 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("www")); 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne); 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlPathToken + 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("sports")); 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlPathToken + 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("index")); 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlPathToken + 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("html")); 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features.Clear(); 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features)); 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ExpectFeatureMapsAreEqual(features, expected_features); 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url = "http://justadomain.com/"; 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.Clear(); 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlTldToken + 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("com")); 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlDomainToken + 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("justadomain")); 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features.Clear(); 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features)); 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ExpectFeatureMapsAreEqual(features, expected_features); 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url = "http://witharef.com/#abc"; 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.Clear(); 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlTldToken + 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("com")); 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlDomainToken + 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("witharef")); 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features.Clear(); 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features)); 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ExpectFeatureMapsAreEqual(features, expected_features); 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url = "http://...www..lotsodots....com./"; 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.Clear(); 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlTldToken + 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("com")); 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlDomainToken + 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("lotsodots")); 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_features.AddBooleanFeature(features::kUrlOtherHostToken + 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string("www")); 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) features.Clear(); 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features)); 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ExpectFeatureMapsAreEqual(features, expected_features); 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url = "http://unrecognized.tld/"; 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features)); 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url = "http://com/123"; 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features)); 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url = "http://.co.uk/"; 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features)); 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url = "file:///nohost.txt"; 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features)); 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url = "not:valid:at:all"; 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features)); 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST_F(PhishingUrlFeatureExtractorTest, SplitStringIntoLongAlphanumTokens) { 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string full = "This.is/a_pretty\\unusual-!path,indeed"; 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<std::string> long_tokens; 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SplitStringIntoLongAlphanumTokens(full, &long_tokens); 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_THAT(long_tokens, 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ElementsAre("This", "pretty", "unusual", "path", "indeed")); 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) long_tokens.clear(); 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) full = "...i-am_re/al&ly\\b,r,o|k=e:n///up%20"; 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SplitStringIntoLongAlphanumTokens(full, &long_tokens); 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_THAT(long_tokens, ElementsAre()); 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace safe_browsing 130