// scored_history_match_unittest.cc revision 5821806d5e7f356e8fa4b058a389a808ea183019
1// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include <algorithm> 6 7#include "base/string16.h" 8#include "base/utf_string_conversions.h" 9#include "chrome/browser/api/bookmarks/bookmark_service.h" 10#include "chrome/browser/history/scored_history_match.h" 11#include "testing/gtest/include/gtest/gtest.h" 12 13namespace history { 14 15class ScoredHistoryMatchTest : public testing::Test { 16 protected: 17 // Convenience function to create a URLRow with basic data for |url|, |title|, 18 // |visit_count|, and |typed_count|. |days_since_last_visit| gives the number 19 // of days ago to which to set the URL's last_visit. 20 URLRow MakeURLRow(const char* url, 21 const char* title, 22 int visit_count, 23 int days_since_last_visit, 24 int typed_count); 25 26 // Convenience functions for easily creating vectors of search terms. 27 String16Vector Make1Term(const char* term) const; 28 String16Vector Make2Terms(const char* term_1, const char* term_2) const; 29 30 // Convenience function for GetTopicalityScore() that builds the 31 // term match and word break information automatically that are needed 32 // to call GetTopicalityScore(). It only works for scoring a single term, 33 // not multiple terms. 
34 float GetTopicalityScoreOfTermAgainstURLAndTitle(const string16& term, 35 const string16& url, 36 const string16& title); 37}; 38 39URLRow ScoredHistoryMatchTest::MakeURLRow(const char* url, 40 const char* title, 41 int visit_count, 42 int days_since_last_visit, 43 int typed_count) { 44 URLRow row(GURL(url), 0); 45 row.set_title(ASCIIToUTF16(title)); 46 row.set_visit_count(visit_count); 47 row.set_typed_count(typed_count); 48 row.set_last_visit(base::Time::NowFromSystemTime() - 49 base::TimeDelta::FromDays(days_since_last_visit)); 50 return row; 51} 52 53String16Vector ScoredHistoryMatchTest::Make1Term(const char* term) const { 54 String16Vector original_terms; 55 original_terms.push_back(ASCIIToUTF16(term)); 56 return original_terms; 57} 58 59String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1, 60 const char* term_2) const { 61 String16Vector original_terms; 62 original_terms.push_back(ASCIIToUTF16(term_1)); 63 original_terms.push_back(ASCIIToUTF16(term_2)); 64 return original_terms; 65} 66 67float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle( 68 const string16& term, 69 const string16& url, 70 const string16& title) { 71 TermMatches url_matches = MatchTermInString(term, url, 0); 72 TermMatches title_matches = MatchTermInString(term, title, 0); 73 RowWordStarts word_starts; 74 String16SetFromString16(url, &word_starts.url_word_starts_); 75 String16SetFromString16(title, &word_starts.title_word_starts_); 76 return ScoredHistoryMatch::GetTopicalityScore( 77 1, url, url_matches, title_matches, word_starts); 78} 79 80TEST_F(ScoredHistoryMatchTest, Scoring) { 81 URLRow row_a(MakeURLRow("http://abcdef", "fedcba", 3, 30, 1)); 82 // We use NowFromSystemTime() because MakeURLRow uses the same function 83 // to calculate last visit time when building a row. 84 base::Time now = base::Time::NowFromSystemTime(); 85 RowWordStarts word_starts; 86 87 // Test scores based on position. 
88 // TODO(mpearson): Test new_scoring if we're actually going to turn it 89 // on by default. This requires setting word_starts, which isn't done 90 // right now. 91 ScoredHistoryMatch scored_a(row_a, ASCIIToUTF16("abc"), Make1Term("abc"), 92 word_starts, now, NULL); 93 ScoredHistoryMatch scored_b(row_a, ASCIIToUTF16("bcd"), Make1Term("bcd"), 94 word_starts, now, NULL); 95 EXPECT_GT(scored_a.raw_score, scored_b.raw_score); 96 97 // Test scores based on length. 98 ScoredHistoryMatch scored_c(row_a, ASCIIToUTF16("abcd"), Make1Term("abcd"), 99 word_starts, now, NULL); 100 EXPECT_LT(scored_a.raw_score, scored_c.raw_score); 101 102 // Test scores based on order. 103 ScoredHistoryMatch scored_d(row_a, ASCIIToUTF16("abcdef"), 104 Make2Terms("abc", "def"), word_starts, now, NULL); 105 ScoredHistoryMatch scored_e(row_a, ASCIIToUTF16("def abc"), 106 Make2Terms("def", "abc"), word_starts, now, NULL); 107 EXPECT_GT(scored_d.raw_score, scored_e.raw_score); 108 109 // Test scores based on visit_count. 110 URLRow row_b(MakeURLRow("http://abcdef", "fedcba", 10, 30, 1)); 111 ScoredHistoryMatch scored_f(row_b, ASCIIToUTF16("abc"), Make1Term("abc"), 112 word_starts, now, NULL); 113 EXPECT_GT(scored_f.raw_score, scored_a.raw_score); 114 115 // Test scores based on last_visit. 116 URLRow row_c(MakeURLRow("http://abcdef", "fedcba", 3, 10, 1)); 117 ScoredHistoryMatch scored_g(row_c, ASCIIToUTF16("abc"), Make1Term("abc"), 118 word_starts, now, NULL); 119 EXPECT_GT(scored_g.raw_score, scored_a.raw_score); 120 121 // Test scores based on typed_count. 122 URLRow row_d(MakeURLRow("http://abcdef", "fedcba", 3, 30, 10)); 123 ScoredHistoryMatch scored_h(row_d, ASCIIToUTF16("abc"), Make1Term("abc"), 124 word_starts, now, NULL); 125 EXPECT_GT(scored_h.raw_score, scored_a.raw_score); 126 127 // Test scores based on a terms appearing multiple times. 
128 URLRow row_i(MakeURLRow("http://csi.csi.csi/csi_csi", 129 "CSI Guide to CSI Las Vegas, CSI New York, CSI Provo", 3, 30, 10)); 130 ScoredHistoryMatch scored_i(row_i, ASCIIToUTF16("csi"), Make1Term("csi"), 131 word_starts, now, NULL); 132 EXPECT_LT(scored_i.raw_score, 1400); 133} 134 135TEST_F(ScoredHistoryMatchTest, Inlining) { 136 // We use NowFromSystemTime() because MakeURLRow uses the same function 137 // to calculate last visit time when building a row. 138 base::Time now = base::Time::NowFromSystemTime(); 139 RowWordStarts word_starts; 140 141 { 142 URLRow row(MakeURLRow("http://www.google.com", "abcdef", 3, 30, 1)); 143 ScoredHistoryMatch scored_a(row, ASCIIToUTF16("g"), Make1Term("g"), 144 word_starts, now, NULL); 145 EXPECT_TRUE(scored_a.can_inline); 146 EXPECT_FALSE(scored_a.match_in_scheme); 147 ScoredHistoryMatch scored_b(row, ASCIIToUTF16("w"), Make1Term("w"), 148 word_starts, now, NULL); 149 EXPECT_TRUE(scored_b.can_inline); 150 EXPECT_FALSE(scored_b.match_in_scheme); 151 ScoredHistoryMatch scored_c(row, ASCIIToUTF16("h"), Make1Term("h"), 152 word_starts, now, NULL); 153 EXPECT_TRUE(scored_c.can_inline); 154 EXPECT_TRUE(scored_c.match_in_scheme); 155 ScoredHistoryMatch scored_d(row, ASCIIToUTF16("o"), Make1Term("o"), 156 word_starts, now, NULL); 157 EXPECT_FALSE(scored_d.can_inline); 158 EXPECT_FALSE(scored_d.match_in_scheme); 159 } 160 161 { 162 URLRow row(MakeURLRow("http://teams.foo.com", "abcdef", 3, 30, 1)); 163 ScoredHistoryMatch scored_a(row, ASCIIToUTF16("t"), Make1Term("t"), 164 word_starts, now, NULL); 165 EXPECT_TRUE(scored_a.can_inline); 166 EXPECT_FALSE(scored_a.match_in_scheme); 167 ScoredHistoryMatch scored_b(row, ASCIIToUTF16("f"), Make1Term("f"), 168 word_starts, now, NULL); 169 EXPECT_FALSE(scored_b.can_inline); 170 EXPECT_FALSE(scored_b.match_in_scheme); 171 ScoredHistoryMatch scored_c(row, ASCIIToUTF16("o"), Make1Term("o"), 172 word_starts, now, NULL); 173 EXPECT_FALSE(scored_c.can_inline); 174 
EXPECT_FALSE(scored_c.match_in_scheme); 175 } 176 177 { 178 URLRow row(MakeURLRow("https://www.testing.com", "abcdef", 3, 30, 1)); 179 ScoredHistoryMatch scored_a(row, ASCIIToUTF16("t"), Make1Term("t"), 180 word_starts, now, NULL); 181 EXPECT_TRUE(scored_a.can_inline); 182 EXPECT_FALSE(scored_a.match_in_scheme); 183 ScoredHistoryMatch scored_b(row, ASCIIToUTF16("h"), Make1Term("h"), 184 word_starts, now, NULL); 185 EXPECT_TRUE(scored_b.can_inline); 186 EXPECT_TRUE(scored_b.match_in_scheme); 187 ScoredHistoryMatch scored_c(row, ASCIIToUTF16("w"), Make1Term("w"), 188 word_starts, now, NULL); 189 EXPECT_TRUE(scored_c.can_inline); 190 EXPECT_FALSE(scored_c.match_in_scheme); 191 } 192} 193 194class BookmarkServiceMock : public BookmarkService { 195 public: 196 explicit BookmarkServiceMock(const GURL& url); 197 virtual ~BookmarkServiceMock() {} 198 199 // Returns true if the given |url| is the same as |url_|. 200 bool IsBookmarked(const GURL& url) OVERRIDE; 201 202 // Required but unused. 203 virtual void GetBookmarks(std::vector<URLAndTitle>* bookmarks) OVERRIDE {} 204 virtual void BlockTillLoaded() OVERRIDE {} 205 206 private: 207 const GURL url_; 208 209 DISALLOW_COPY_AND_ASSIGN(BookmarkServiceMock); 210}; 211 212BookmarkServiceMock::BookmarkServiceMock(const GURL& url) 213 : BookmarkService(), 214 url_(url) { 215} 216 217bool BookmarkServiceMock::IsBookmarked(const GURL& url) { 218 return url == url_; 219} 220 221TEST_F(ScoredHistoryMatchTest, ScoringWithBookmarks) { 222 const GURL url("http://www.nanny.org"); 223 BookmarkServiceMock bookmark_model_mock(url); 224 URLRow row_a(MakeURLRow("http://www.nanny.org", "Nanny", 3, 30, 1)); 225 // We use NowFromSystemTime() because MakeURLRow uses the same function 226 // to calculate last visit time when building a row. 227 base::Time now = base::Time::NowFromSystemTime(); 228 RowWordStarts word_starts; 229 230 // Identical queries but the first should be boosted by having a bookmark. 
231 ScoredHistoryMatch scored_a(row_a, ASCIIToUTF16("nanny"), Make1Term("nanny"), 232 word_starts, now, &bookmark_model_mock); 233 ScoredHistoryMatch scored_b(row_a, ASCIIToUTF16("nanny"), Make1Term("nanny"), 234 word_starts, now, NULL); 235 EXPECT_GT(scored_a.raw_score, scored_b.raw_score); 236 237 // Identical queries, neither should be boosted by having a bookmark. 238 ScoredHistoryMatch scored_c(row_a, ASCIIToUTF16("stick"), Make1Term("stick"), 239 word_starts, now, &bookmark_model_mock); 240 ScoredHistoryMatch scored_d(row_a, ASCIIToUTF16("stick"), Make1Term("stick"), 241 word_starts, now, NULL); 242 EXPECT_EQ(scored_c.raw_score, scored_d.raw_score); 243} 244 245TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) { 246 const float hostname = GetTopicalityScoreOfTermAgainstURLAndTitle( 247 ASCIIToUTF16("def"), 248 ASCIIToUTF16("http://abc.def.com/"), 249 ASCIIToUTF16("Non-Matching Title")); 250 const float hostname_no_slash = GetTopicalityScoreOfTermAgainstURLAndTitle( 251 ASCIIToUTF16("def"), 252 ASCIIToUTF16("http://abc.def.com"), 253 ASCIIToUTF16("Non-Matching Title")); 254 EXPECT_EQ(hostname_no_slash, hostname); 255} 256 257// This function only tests scoring of single terms that match exactly 258// once somewhere in the URL or title. 259TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) { 260 string16 url = ASCIIToUTF16("http://abc.def.com/path1/path2?" 
261 "arg1=val1&arg2=val2#hash_component"); 262 string16 title = ASCIIToUTF16("here is a title"); 263 const float hostname_score = 264 GetTopicalityScoreOfTermAgainstURLAndTitle( 265 ASCIIToUTF16("abc"), url, title); 266 const float hostname_mid_word_score = 267 GetTopicalityScoreOfTermAgainstURLAndTitle( 268 ASCIIToUTF16("bc"), url, title); 269 const float domain_name_score = 270 GetTopicalityScoreOfTermAgainstURLAndTitle( 271 ASCIIToUTF16("def"), url, title); 272 const float domain_name_mid_word_score = 273 GetTopicalityScoreOfTermAgainstURLAndTitle( 274 ASCIIToUTF16("ef"), url, title); 275 const float tld_score = 276 GetTopicalityScoreOfTermAgainstURLAndTitle( 277 ASCIIToUTF16("com"), url, title); 278 const float tld_mid_word_score = 279 GetTopicalityScoreOfTermAgainstURLAndTitle( 280 ASCIIToUTF16("om"), url, title); 281 const float path_score = 282 GetTopicalityScoreOfTermAgainstURLAndTitle( 283 ASCIIToUTF16("path1"), url, title); 284 const float path_mid_word_score = 285 GetTopicalityScoreOfTermAgainstURLAndTitle( 286 ASCIIToUTF16("ath1"), url, title); 287 const float arg_score = 288 GetTopicalityScoreOfTermAgainstURLAndTitle( 289 ASCIIToUTF16("arg2"), url, title); 290 const float arg_mid_word_score = 291 GetTopicalityScoreOfTermAgainstURLAndTitle( 292 ASCIIToUTF16("rg2"), url, title); 293 const float protocol_score = 294 GetTopicalityScoreOfTermAgainstURLAndTitle( 295 ASCIIToUTF16("htt"), url, title); 296 const float protocol_mid_word_score = 297 GetTopicalityScoreOfTermAgainstURLAndTitle( 298 ASCIIToUTF16("tt"), url, title); 299 const float title_score = 300 GetTopicalityScoreOfTermAgainstURLAndTitle( 301 ASCIIToUTF16("her"), url, title); 302 const float title_mid_word_score = 303 GetTopicalityScoreOfTermAgainstURLAndTitle( 304 ASCIIToUTF16("er"), url, title); 305 // Verify hostname and domain name > path > arg, and the same for the 306 // matches at non-word-boundaries. 
307 EXPECT_GT(hostname_score, path_score); 308 EXPECT_GT(domain_name_score, path_score); 309 EXPECT_GT(path_score, arg_score); 310 EXPECT_GT(hostname_mid_word_score, path_mid_word_score); 311 EXPECT_GT(domain_name_mid_word_score, path_mid_word_score); 312 EXPECT_GT(path_mid_word_score, arg_mid_word_score); 313 // Also verify that the matches at non-word-boundaries all score 314 // worse than the matches at word boundaries. These three sets suffice. 315 EXPECT_GT(arg_score, hostname_mid_word_score); 316 EXPECT_GT(arg_score, domain_name_mid_word_score); 317 EXPECT_GT(title_score, title_mid_word_score); 318 // Check that title matches fit somewhere reasonable compared to the 319 // various types of URL matches. 320 EXPECT_GT(title_score, arg_score); 321 EXPECT_GT(arg_score, title_mid_word_score); 322 EXPECT_GT(title_mid_word_score, arg_mid_word_score); 323 // Finally, verify that protocol matches and top level domain name 324 // matches (.com, .net, etc.) score worse than everything (except 325 // possibly mid-word matches in the ?arg section of the URL--I can 326 // imagine scoring those pretty harshly as well). 327 EXPECT_GT(path_mid_word_score, protocol_score); 328 EXPECT_GT(path_mid_word_score, protocol_mid_word_score); 329 EXPECT_GT(title_mid_word_score, protocol_score); 330 EXPECT_GT(title_mid_word_score, protocol_mid_word_score); 331 EXPECT_GT(path_mid_word_score, tld_score); 332 EXPECT_GT(path_mid_word_score, tld_mid_word_score); 333 EXPECT_GT(title_mid_word_score, tld_score); 334 EXPECT_GT(title_mid_word_score, tld_mid_word_score); 335} 336 337} // namespace history 338