1// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h" 6 7#include <string> 8#include <vector> 9#include "chrome/renderer/safe_browsing/features.h" 10#include "chrome/renderer/safe_browsing/test_utils.h" 11#include "testing/gmock/include/gmock/gmock.h" 12#include "testing/gtest/include/gtest/gtest.h" 13#include "url/gurl.h" 14 15using ::testing::ElementsAre; 16 17namespace safe_browsing { 18 19class PhishingUrlFeatureExtractorTest : public ::testing::Test { 20 protected: 21 PhishingUrlFeatureExtractor extractor_; 22 23 void SplitStringIntoLongAlphanumTokens(const std::string& full, 24 std::vector<std::string>* tokens) { 25 PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(full, 26 tokens); 27 } 28}; 29 30TEST_F(PhishingUrlFeatureExtractorTest, ExtractFeatures) { 31 std::string url = "http://123.0.0.1/mydocuments/a.file.html"; 32 FeatureMap expected_features; 33 expected_features.AddBooleanFeature(features::kUrlHostIsIpAddress); 34 expected_features.AddBooleanFeature(features::kUrlPathToken + 35 std::string("mydocuments")); 36 expected_features.AddBooleanFeature(features::kUrlPathToken + 37 std::string("file")); 38 expected_features.AddBooleanFeature(features::kUrlPathToken + 39 std::string("html")); 40 41 FeatureMap features; 42 ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features)); 43 ExpectFeatureMapsAreEqual(features, expected_features); 44 45 url = "http://www.www.cnn.co.uk/sports/sports/index.html?shouldnotappear"; 46 expected_features.Clear(); 47 expected_features.AddBooleanFeature(features::kUrlTldToken + 48 std::string("co.uk")); 49 expected_features.AddBooleanFeature(features::kUrlDomainToken + 50 std::string("cnn")); 51 expected_features.AddBooleanFeature(features::kUrlOtherHostToken + 52 std::string("www")); 53 expected_features.AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne); 54 expected_features.AddBooleanFeature(features::kUrlPathToken + 55 std::string("sports")); 56 expected_features.AddBooleanFeature(features::kUrlPathToken + 57 std::string("index")); 58 expected_features.AddBooleanFeature(features::kUrlPathToken + 59 std::string("html")); 60 61 features.Clear(); 62 ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features)); 63 ExpectFeatureMapsAreEqual(features, expected_features); 64 65 url = "http://justadomain.com/"; 66 expected_features.Clear(); 67 expected_features.AddBooleanFeature(features::kUrlTldToken + 68 std::string("com")); 69 expected_features.AddBooleanFeature(features::kUrlDomainToken + 70 std::string("justadomain")); 71 72 features.Clear(); 73 ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features)); 74 ExpectFeatureMapsAreEqual(features, expected_features); 75 76 url = "http://witharef.com/#abc"; 77 expected_features.Clear(); 78 expected_features.AddBooleanFeature(features::kUrlTldToken + 79 std::string("com")); 80 expected_features.AddBooleanFeature(features::kUrlDomainToken + 81 std::string("witharef")); 82 83 features.Clear(); 84 ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features)); 85 ExpectFeatureMapsAreEqual(features, expected_features); 86 87 url = "http://...www..lotsodots....com./"; 88 expected_features.Clear(); 89 expected_features.AddBooleanFeature(features::kUrlTldToken + 90 std::string("com")); 91 expected_features.AddBooleanFeature(features::kUrlDomainToken + 92 std::string("lotsodots")); 93 expected_features.AddBooleanFeature(features::kUrlOtherHostToken + 94 std::string("www")); 95 96 features.Clear(); 97 ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features)); 98 ExpectFeatureMapsAreEqual(features, expected_features); 99 100 url = "http://unrecognized.tld/"; 101 EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features)); 102 103 url = "http://com/123"; 104 EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features)); 105 106 url = "http://.co.uk/"; 107 EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features)); 108 109 url = "file:///nohost.txt"; 110 EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features)); 111 112 url = "not:valid:at:all"; 113 EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features)); 114} 115 116TEST_F(PhishingUrlFeatureExtractorTest, SplitStringIntoLongAlphanumTokens) { 117 std::string full = "This.is/a_pretty\\unusual-!path,indeed"; 118 std::vector<std::string> long_tokens; 119 SplitStringIntoLongAlphanumTokens(full, &long_tokens); 120 EXPECT_THAT(long_tokens, 121 ElementsAre("This", "pretty", "unusual", "path", "indeed")); 122 123 long_tokens.clear(); 124 full = "...i-am_re/al&ly\\b,r,o|k=e:n///up%20"; 125 SplitStringIntoLongAlphanumTokens(full, &long_tokens); 126 EXPECT_THAT(long_tokens, ElementsAre()); 127} 128 129} // namespace safe_browsing 130