15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string>
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector>
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/features.h"
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/safe_browsing/test_utils.h"
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "testing/gmock/include/gmock/gmock.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "testing/gtest/include/gtest/gtest.h"
137dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "url/gurl.h"
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)using ::testing::ElementsAre;
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing {
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class PhishingUrlFeatureExtractorTest : public ::testing::Test {
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) protected:
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PhishingUrlFeatureExtractor extractor_;
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void SplitStringIntoLongAlphanumTokens(const std::string& full,
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                         std::vector<std::string>* tokens) {
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(full,
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                                                   tokens);
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST_F(PhishingUrlFeatureExtractorTest, ExtractFeatures) {
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::string url = "http://123.0.0.1/mydocuments/a.file.html";
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FeatureMap expected_features;
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlHostIsIpAddress);
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlPathToken +
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("mydocuments"));
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlPathToken +
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("file"));
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlPathToken +
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("html"));
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FeatureMap features;
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ExpectFeatureMapsAreEqual(features, expected_features);
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  url = "http://www.www.cnn.co.uk/sports/sports/index.html?shouldnotappear";
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.Clear();
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlTldToken +
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("co.uk"));
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlDomainToken +
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("cnn"));
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlOtherHostToken +
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("www"));
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne);
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlPathToken +
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("sports"));
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlPathToken +
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("index"));
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlPathToken +
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("html"));
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  features.Clear();
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ExpectFeatureMapsAreEqual(features, expected_features);
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  url = "http://justadomain.com/";
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.Clear();
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlTldToken +
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("com"));
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlDomainToken +
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("justadomain"));
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  features.Clear();
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ExpectFeatureMapsAreEqual(features, expected_features);
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  url = "http://witharef.com/#abc";
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.Clear();
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlTldToken +
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("com"));
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlDomainToken +
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("witharef"));
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  features.Clear();
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ExpectFeatureMapsAreEqual(features, expected_features);
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  url = "http://...www..lotsodots....com./";
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.Clear();
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlTldToken +
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("com"));
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlDomainToken +
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("lotsodots"));
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  expected_features.AddBooleanFeature(features::kUrlOtherHostToken +
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      std::string("www"));
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  features.Clear();
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ExpectFeatureMapsAreEqual(features, expected_features);
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  url = "http://unrecognized.tld/";
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  url = "http://com/123";
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  url = "http://.co.uk/";
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  url = "file:///nohost.txt";
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  url = "not:valid:at:all";
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST_F(PhishingUrlFeatureExtractorTest, SplitStringIntoLongAlphanumTokens) {
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::string full = "This.is/a_pretty\\unusual-!path,indeed";
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<std::string> long_tokens;
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  SplitStringIntoLongAlphanumTokens(full, &long_tokens);
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_THAT(long_tokens,
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              ElementsAre("This", "pretty", "unusual", "path", "indeed"));
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  long_tokens.clear();
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  full = "...i-am_re/al&ly\\b,r,o|k=e:n///up%20";
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  SplitStringIntoLongAlphanumTokens(full, &long_tokens);
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  EXPECT_THAT(long_tokens, ElementsAre());
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace safe_browsing
130