1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
6
7#include <string>
8#include <vector>
9#include "chrome/renderer/safe_browsing/features.h"
10#include "chrome/renderer/safe_browsing/test_utils.h"
11#include "testing/gmock/include/gmock/gmock.h"
12#include "testing/gtest/include/gtest/gtest.h"
13#include "url/gurl.h"
14
15using ::testing::ElementsAre;
16
17namespace safe_browsing {
18
19class PhishingUrlFeatureExtractorTest : public ::testing::Test {
20 protected:
21  PhishingUrlFeatureExtractor extractor_;
22
23  void SplitStringIntoLongAlphanumTokens(const std::string& full,
24                                         std::vector<std::string>* tokens) {
25    PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(full,
26                                                                   tokens);
27  }
28};
29
30TEST_F(PhishingUrlFeatureExtractorTest, ExtractFeatures) {
31  std::string url = "http://123.0.0.1/mydocuments/a.file.html";
32  FeatureMap expected_features;
33  expected_features.AddBooleanFeature(features::kUrlHostIsIpAddress);
34  expected_features.AddBooleanFeature(features::kUrlPathToken +
35                                      std::string("mydocuments"));
36  expected_features.AddBooleanFeature(features::kUrlPathToken +
37                                      std::string("file"));
38  expected_features.AddBooleanFeature(features::kUrlPathToken +
39                                      std::string("html"));
40
41  FeatureMap features;
42  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
43  ExpectFeatureMapsAreEqual(features, expected_features);
44
45  url = "http://www.www.cnn.co.uk/sports/sports/index.html?shouldnotappear";
46  expected_features.Clear();
47  expected_features.AddBooleanFeature(features::kUrlTldToken +
48                                      std::string("co.uk"));
49  expected_features.AddBooleanFeature(features::kUrlDomainToken +
50                                      std::string("cnn"));
51  expected_features.AddBooleanFeature(features::kUrlOtherHostToken +
52                                      std::string("www"));
53  expected_features.AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne);
54  expected_features.AddBooleanFeature(features::kUrlPathToken +
55                                      std::string("sports"));
56  expected_features.AddBooleanFeature(features::kUrlPathToken +
57                                      std::string("index"));
58  expected_features.AddBooleanFeature(features::kUrlPathToken +
59                                      std::string("html"));
60
61  features.Clear();
62  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
63  ExpectFeatureMapsAreEqual(features, expected_features);
64
65  url = "http://justadomain.com/";
66  expected_features.Clear();
67  expected_features.AddBooleanFeature(features::kUrlTldToken +
68                                      std::string("com"));
69  expected_features.AddBooleanFeature(features::kUrlDomainToken +
70                                      std::string("justadomain"));
71
72  features.Clear();
73  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
74  ExpectFeatureMapsAreEqual(features, expected_features);
75
76  url = "http://witharef.com/#abc";
77  expected_features.Clear();
78  expected_features.AddBooleanFeature(features::kUrlTldToken +
79                                      std::string("com"));
80  expected_features.AddBooleanFeature(features::kUrlDomainToken +
81                                      std::string("witharef"));
82
83  features.Clear();
84  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
85  ExpectFeatureMapsAreEqual(features, expected_features);
86
87  url = "http://...www..lotsodots....com./";
88  expected_features.Clear();
89  expected_features.AddBooleanFeature(features::kUrlTldToken +
90                                      std::string("com"));
91  expected_features.AddBooleanFeature(features::kUrlDomainToken +
92                                      std::string("lotsodots"));
93  expected_features.AddBooleanFeature(features::kUrlOtherHostToken +
94                                      std::string("www"));
95
96  features.Clear();
97  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
98  ExpectFeatureMapsAreEqual(features, expected_features);
99
100  url = "http://unrecognized.tld/";
101  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
102
103  url = "http://com/123";
104  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
105
106  url = "http://.co.uk/";
107  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
108
109  url = "file:///nohost.txt";
110  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
111
112  url = "not:valid:at:all";
113  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
114}
115
116TEST_F(PhishingUrlFeatureExtractorTest, SplitStringIntoLongAlphanumTokens) {
117  std::string full = "This.is/a_pretty\\unusual-!path,indeed";
118  std::vector<std::string> long_tokens;
119  SplitStringIntoLongAlphanumTokens(full, &long_tokens);
120  EXPECT_THAT(long_tokens,
121              ElementsAre("This", "pretty", "unusual", "path", "indeed"));
122
123  long_tokens.clear();
124  full = "...i-am_re/al&ly\\b,r,o|k=e:n///up%20";
125  SplitStringIntoLongAlphanumTokens(full, &long_tokens);
126  EXPECT_THAT(long_tokens, ElementsAre());
127}
128
129}  // namespace safe_browsing
130