// scored_history_match_unittest.cc revision e5d81f57cb97b3b6b7fccc9c5610d21eb81db09d
1// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include <algorithm> 6 7#include "base/auto_reset.h" 8#include "base/strings/string16.h" 9#include "base/strings/utf_string_conversions.h" 10#include "chrome/browser/bookmarks/bookmark_service.h" 11#include "chrome/browser/history/scored_history_match.h" 12#include "testing/gtest/include/gtest/gtest.h" 13 14using base::ASCIIToUTF16; 15 16namespace history { 17 18// Returns a VisitInfoVector that includes |num_visits| spread over the 19// last |frequency|*|num_visits| days (relative to |now|). A frequency of 20// one means one visit each day, two means every other day, etc. 21VisitInfoVector CreateVisitInfoVector(int num_visits, 22 int frequency, 23 base::Time now) { 24 VisitInfoVector visits; 25 for (int i = 0; i < num_visits; ++i) { 26 visits.push_back( 27 std::make_pair(now - base::TimeDelta::FromDays(i * frequency), 28 content::PAGE_TRANSITION_LINK)); 29 } 30 return visits; 31} 32 33class ScoredHistoryMatchTest : public testing::Test { 34 protected: 35 // Convenience function to create a URLRow with basic data for |url|, |title|, 36 // |visit_count|, and |typed_count|. |days_since_last_visit| gives the number 37 // of days ago to which to set the URL's last_visit. 38 URLRow MakeURLRow(const char* url, 39 const char* title, 40 int visit_count, 41 int days_since_last_visit, 42 int typed_count); 43 44 // Convenience function to set the word starts information from a URLRow's 45 // URL and title. 46 void PopulateWordStarts(const URLRow& url_row, RowWordStarts* word_starts); 47 48 // Convenience functions for easily creating vectors of search terms. 
49 String16Vector Make1Term(const char* term) const; 50 String16Vector Make2Terms(const char* term_1, const char* term_2) const; 51 52 // Convenience function for GetTopicalityScore() that builds the 53 // term match and word break information automatically that are needed 54 // to call GetTopicalityScore(). It only works for scoring a single term, 55 // not multiple terms. 56 float GetTopicalityScoreOfTermAgainstURLAndTitle(const base::string16& term, 57 const base::string16& url, 58 const base::string16& title); 59}; 60 61URLRow ScoredHistoryMatchTest::MakeURLRow(const char* url, 62 const char* title, 63 int visit_count, 64 int days_since_last_visit, 65 int typed_count) { 66 URLRow row(GURL(url), 0); 67 row.set_title(ASCIIToUTF16(title)); 68 row.set_visit_count(visit_count); 69 row.set_typed_count(typed_count); 70 row.set_last_visit(base::Time::NowFromSystemTime() - 71 base::TimeDelta::FromDays(days_since_last_visit)); 72 return row; 73} 74 75void ScoredHistoryMatchTest::PopulateWordStarts( 76 const URLRow& url_row, RowWordStarts* word_starts) { 77 String16SetFromString16(ASCIIToUTF16(url_row.url().spec()), 78 &word_starts->url_word_starts_); 79 String16SetFromString16(url_row.title(), &word_starts->title_word_starts_); 80} 81 82 83String16Vector ScoredHistoryMatchTest::Make1Term(const char* term) const { 84 String16Vector original_terms; 85 original_terms.push_back(ASCIIToUTF16(term)); 86 return original_terms; 87} 88 89String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1, 90 const char* term_2) const { 91 String16Vector original_terms; 92 original_terms.push_back(ASCIIToUTF16(term_1)); 93 original_terms.push_back(ASCIIToUTF16(term_2)); 94 return original_terms; 95} 96 97float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle( 98 const base::string16& term, 99 const base::string16& url, 100 const base::string16& title) { 101 // Make an empty match and simply populate the fields we need in order 102 // to call 
GetTopicalityScore(). 103 ScoredHistoryMatch scored_match; 104 scored_match.url_matches_ = MatchTermInString(term, url, 0); 105 scored_match.title_matches_ = MatchTermInString(term, title, 0); 106 RowWordStarts word_starts; 107 String16SetFromString16(url, &word_starts.url_word_starts_); 108 String16SetFromString16(title, &word_starts.title_word_starts_); 109 WordStarts one_word_no_offset(1, 0u); 110 return scored_match.GetTopicalityScore(1, url, one_word_no_offset, 111 word_starts); 112} 113 114TEST_F(ScoredHistoryMatchTest, Scoring) { 115 // We use NowFromSystemTime() because MakeURLRow uses the same function 116 // to calculate last visit time when building a row. 117 base::Time now = base::Time::NowFromSystemTime(); 118 119 URLRow row_a(MakeURLRow("http://fedcba", "abcd bcd", 3, 30, 1)); 120 RowWordStarts word_starts_a; 121 PopulateWordStarts(row_a, &word_starts_a); 122 WordStarts one_word_no_offset(1, 0u); 123 VisitInfoVector visits_a = CreateVisitInfoVector(3, 30, now); 124 // Mark one visit as typed. 125 visits_a[0].second = content::PAGE_TRANSITION_TYPED; 126 ScoredHistoryMatch scored_a(row_a, visits_a, std::string(), 127 ASCIIToUTF16("abc"), Make1Term("abc"), 128 one_word_no_offset, word_starts_a, now, NULL); 129 130 // Test scores based on visit_count. 131 URLRow row_b(MakeURLRow("http://abcdef", "abcd bcd", 10, 30, 1)); 132 RowWordStarts word_starts_b; 133 PopulateWordStarts(row_b, &word_starts_b); 134 VisitInfoVector visits_b = CreateVisitInfoVector(10, 30, now); 135 visits_b[0].second = content::PAGE_TRANSITION_TYPED; 136 ScoredHistoryMatch scored_b(row_b, visits_b, std::string(), 137 ASCIIToUTF16("abc"), Make1Term("abc"), 138 one_word_no_offset, word_starts_b, now, NULL); 139 EXPECT_GT(scored_b.raw_score(), scored_a.raw_score()); 140 141 // Test scores based on last_visit. 
142 URLRow row_c(MakeURLRow("http://abcdef", "abcd bcd", 3, 10, 1)); 143 RowWordStarts word_starts_c; 144 PopulateWordStarts(row_c, &word_starts_c); 145 VisitInfoVector visits_c = CreateVisitInfoVector(3, 10, now); 146 visits_c[0].second = content::PAGE_TRANSITION_TYPED; 147 ScoredHistoryMatch scored_c(row_c, visits_c, std::string(), 148 ASCIIToUTF16("abc"), Make1Term("abc"), 149 one_word_no_offset, word_starts_c, now, NULL); 150 EXPECT_GT(scored_c.raw_score(), scored_a.raw_score()); 151 152 // Test scores based on typed_count. 153 URLRow row_d(MakeURLRow("http://abcdef", "abcd bcd", 3, 30, 3)); 154 RowWordStarts word_starts_d; 155 PopulateWordStarts(row_d, &word_starts_d); 156 VisitInfoVector visits_d = CreateVisitInfoVector(3, 30, now); 157 visits_d[0].second = content::PAGE_TRANSITION_TYPED; 158 visits_d[1].second = content::PAGE_TRANSITION_TYPED; 159 visits_d[2].second = content::PAGE_TRANSITION_TYPED; 160 ScoredHistoryMatch scored_d(row_d, visits_d, std::string(), 161 ASCIIToUTF16("abc"), Make1Term("abc"), 162 one_word_no_offset, word_starts_d, now, NULL); 163 EXPECT_GT(scored_d.raw_score(), scored_a.raw_score()); 164 165 // Test scores based on a terms appearing multiple times. 166 URLRow row_e(MakeURLRow("http://csi.csi.csi/csi_csi", 167 "CSI Guide to CSI Las Vegas, CSI New York, CSI Provo", 3, 30, 3)); 168 RowWordStarts word_starts_e; 169 PopulateWordStarts(row_e, &word_starts_e); 170 const VisitInfoVector visits_e = visits_d; 171 ScoredHistoryMatch scored_e(row_e, visits_e, std::string(), 172 ASCIIToUTF16("csi"), Make1Term("csi"), 173 one_word_no_offset, word_starts_e, now, NULL); 174 EXPECT_LT(scored_e.raw_score(), 1400); 175 176 // Test that a result with only a mid-term match (i.e., not at a word 177 // boundary) scores 0. 
178 ScoredHistoryMatch scored_f(row_a, visits_a, std::string(), 179 ASCIIToUTF16("cd"), Make1Term("cd"), 180 one_word_no_offset, word_starts_a, now, NULL); 181 EXPECT_EQ(scored_f.raw_score(), 0); 182} 183 184class BookmarkServiceMock : public BookmarkService { 185 public: 186 explicit BookmarkServiceMock(const GURL& url); 187 virtual ~BookmarkServiceMock() {} 188 189 // Returns true if the given |url| is the same as |url_|. 190 virtual bool IsBookmarked(const GURL& url) OVERRIDE; 191 192 // Required but unused. 193 virtual void GetBookmarks(std::vector<URLAndTitle>* bookmarks) OVERRIDE {} 194 virtual void BlockTillLoaded() OVERRIDE {} 195 196 private: 197 const GURL url_; 198 199 DISALLOW_COPY_AND_ASSIGN(BookmarkServiceMock); 200}; 201 202BookmarkServiceMock::BookmarkServiceMock(const GURL& url) 203 : BookmarkService(), 204 url_(url) { 205} 206 207bool BookmarkServiceMock::IsBookmarked(const GURL& url) { 208 return url == url_; 209} 210 211TEST_F(ScoredHistoryMatchTest, ScoringBookmarks) { 212 // We use NowFromSystemTime() because MakeURLRow uses the same function 213 // to calculate last visit time when building a row. 214 base::Time now = base::Time::NowFromSystemTime(); 215 216 std::string url_string("http://fedcba"); 217 const GURL url(url_string); 218 URLRow row(MakeURLRow(url_string.c_str(), "abcd bcd", 8, 3, 1)); 219 RowWordStarts word_starts; 220 PopulateWordStarts(row, &word_starts); 221 WordStarts one_word_no_offset(1, 0u); 222 VisitInfoVector visits = CreateVisitInfoVector(8, 3, now); 223 ScoredHistoryMatch scored(row, visits, std::string(), 224 ASCIIToUTF16("abc"), Make1Term("abc"), 225 one_word_no_offset, word_starts, now, NULL); 226 // Now bookmark that URL and make sure its score increases. 
227 base::AutoReset<int> reset(&ScoredHistoryMatch::bookmark_value_, 5); 228 BookmarkServiceMock bookmark_model_mock(url); 229 ScoredHistoryMatch scored_with_bookmark( 230 row, visits, std::string(), ASCIIToUTF16("abc"), Make1Term("abc"), 231 one_word_no_offset, word_starts, now, &bookmark_model_mock); 232 EXPECT_GT(scored_with_bookmark.raw_score(), scored.raw_score()); 233} 234 235TEST_F(ScoredHistoryMatchTest, ScoringDiscountFrecency) { 236 // We use NowFromSystemTime() because MakeURLRow uses the same function 237 // to calculate last visit time when building a row. 238 base::Time now = base::Time::NowFromSystemTime(); 239 240 std::string url_string("http://fedcba.com/"); 241 const GURL url(url_string); 242 URLRow row(MakeURLRow(url_string.c_str(), "", 1, 1, 1)); 243 RowWordStarts word_starts; 244 PopulateWordStarts(row, &word_starts); 245 WordStarts one_word_no_offset(1, 0u); 246 VisitInfoVector visits = CreateVisitInfoVector(1, 1, now); 247 ScoredHistoryMatch scored(row, visits, std::string(), ASCIIToUTF16("fed"), 248 Make1Term("fed"), one_word_no_offset, word_starts, 249 now, NULL); 250 251 // With properly discounted scores, the final raw_score should be lower. 252 base::AutoReset<bool> reset( 253 &ScoredHistoryMatch::discount_frecency_when_few_visits_, true); 254 ScoredHistoryMatch scored_with_discount_frecency( 255 row, visits, std::string(), ASCIIToUTF16("fed"), 256 Make1Term("fed"), one_word_no_offset, word_starts, now, NULL); 257 EXPECT_LT(scored_with_discount_frecency.raw_score(), scored.raw_score()); 258} 259 260TEST_F(ScoredHistoryMatchTest, ScoringTLD) { 261 // We use NowFromSystemTime() because MakeURLRow uses the same function 262 // to calculate last visit time when building a row. 263 base::Time now = base::Time::NowFromSystemTime(); 264 265 // By default the URL should not be returned for a query that includes "com". 
266 std::string url_string("http://fedcba.com/"); 267 const GURL url(url_string); 268 URLRow row(MakeURLRow(url_string.c_str(), "", 8, 3, 1)); 269 RowWordStarts word_starts; 270 PopulateWordStarts(row, &word_starts); 271 WordStarts two_words_no_offsets(2, 0u); 272 VisitInfoVector visits = CreateVisitInfoVector(8, 3, now); 273 ScoredHistoryMatch scored(row, visits, std::string(), 274 ASCIIToUTF16("fed com"), Make2Terms("fed", "com"), 275 two_words_no_offsets, word_starts, now, NULL); 276 EXPECT_EQ(0, scored.raw_score()); 277 278 // Now allow credit for the match in the TLD. 279 base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_tld_matches_, true); 280 ScoredHistoryMatch scored_with_tld( 281 row, visits, std::string(), ASCIIToUTF16("fed com"), 282 Make2Terms("fed", "com"), two_words_no_offsets, word_starts, now, NULL); 283 EXPECT_GT(scored_with_tld.raw_score(), 0); 284} 285 286TEST_F(ScoredHistoryMatchTest, ScoringScheme) { 287 // We use NowFromSystemTime() because MakeURLRow uses the same function 288 // to calculate last visit time when building a row. 289 base::Time now = base::Time::NowFromSystemTime(); 290 291 // By default the URL should not be returned for a query that includes "http". 292 std::string url_string("http://fedcba/"); 293 const GURL url(url_string); 294 URLRow row(MakeURLRow(url_string.c_str(), "", 8, 3, 1)); 295 RowWordStarts word_starts; 296 PopulateWordStarts(row, &word_starts); 297 WordStarts two_words_no_offsets(2, 0u); 298 VisitInfoVector visits = CreateVisitInfoVector(8, 3, now); 299 ScoredHistoryMatch scored(row, visits, std::string(), 300 ASCIIToUTF16("fed http"), Make2Terms("fed", "http"), 301 two_words_no_offsets, word_starts, now, NULL); 302 EXPECT_EQ(0, scored.raw_score()); 303 304 // Now allow credit for the match in the scheme. 
305 base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_scheme_matches_, true); 306 ScoredHistoryMatch scored_with_scheme( 307 row, visits, std::string(), ASCIIToUTF16("fed http"), 308 Make2Terms("fed", "http"), two_words_no_offsets, word_starts, now, NULL); 309 EXPECT_GT(scored_with_scheme.raw_score(), 0); 310} 311 312TEST_F(ScoredHistoryMatchTest, Inlining) { 313 // We use NowFromSystemTime() because MakeURLRow uses the same function 314 // to calculate last visit time when building a row. 315 base::Time now = base::Time::NowFromSystemTime(); 316 RowWordStarts word_starts; 317 WordStarts one_word_no_offset(1, 0u); 318 VisitInfoVector visits; 319 320 { 321 URLRow row(MakeURLRow("http://www.google.com", "abcdef", 3, 30, 1)); 322 PopulateWordStarts(row, &word_starts); 323 ScoredHistoryMatch scored_a(row, visits, std::string(), 324 ASCIIToUTF16("g"), Make1Term("g"), 325 one_word_no_offset, word_starts, now, NULL); 326 EXPECT_TRUE(scored_a.can_inline()); 327 EXPECT_FALSE(scored_a.match_in_scheme); 328 ScoredHistoryMatch scored_b(row, visits, std::string(), 329 ASCIIToUTF16("w"), Make1Term("w"), 330 one_word_no_offset, word_starts, now, NULL); 331 EXPECT_TRUE(scored_b.can_inline()); 332 EXPECT_FALSE(scored_b.match_in_scheme); 333 ScoredHistoryMatch scored_c(row, visits, std::string(), 334 ASCIIToUTF16("h"), Make1Term("h"), 335 one_word_no_offset, word_starts, now, NULL); 336 EXPECT_TRUE(scored_c.can_inline()); 337 EXPECT_TRUE(scored_c.match_in_scheme); 338 ScoredHistoryMatch scored_d(row, visits, std::string(), 339 ASCIIToUTF16("o"), Make1Term("o"), 340 one_word_no_offset, word_starts, now, NULL); 341 EXPECT_FALSE(scored_d.can_inline()); 342 EXPECT_FALSE(scored_d.match_in_scheme); 343 } 344 345 { 346 URLRow row(MakeURLRow("http://teams.foo.com", "abcdef", 3, 30, 1)); 347 PopulateWordStarts(row, &word_starts); 348 ScoredHistoryMatch scored_a(row, visits, std::string(), 349 ASCIIToUTF16("t"), Make1Term("t"), 350 one_word_no_offset, word_starts, now, NULL); 351 
EXPECT_TRUE(scored_a.can_inline()); 352 EXPECT_FALSE(scored_a.match_in_scheme); 353 ScoredHistoryMatch scored_b(row, visits, std::string(), 354 ASCIIToUTF16("f"), Make1Term("f"), 355 one_word_no_offset, word_starts, now, NULL); 356 EXPECT_FALSE(scored_b.can_inline()); 357 EXPECT_FALSE(scored_b.match_in_scheme); 358 ScoredHistoryMatch scored_c(row, visits, std::string(), 359 ASCIIToUTF16("o"), Make1Term("o"), 360 one_word_no_offset, word_starts, now, NULL); 361 EXPECT_FALSE(scored_c.can_inline()); 362 EXPECT_FALSE(scored_c.match_in_scheme); 363 } 364 365 { 366 URLRow row(MakeURLRow("https://www.testing.com", "abcdef", 3, 30, 1)); 367 PopulateWordStarts(row, &word_starts); 368 ScoredHistoryMatch scored_a(row, visits, std::string(), 369 ASCIIToUTF16("t"), Make1Term("t"), 370 one_word_no_offset, word_starts, now, NULL); 371 EXPECT_TRUE(scored_a.can_inline()); 372 EXPECT_FALSE(scored_a.match_in_scheme); 373 ScoredHistoryMatch scored_b(row, visits, std::string(), 374 ASCIIToUTF16("h"), Make1Term("h"), 375 one_word_no_offset, word_starts, now, NULL); 376 EXPECT_TRUE(scored_b.can_inline()); 377 EXPECT_TRUE(scored_b.match_in_scheme); 378 ScoredHistoryMatch scored_c(row, visits, std::string(), 379 ASCIIToUTF16("w"), Make1Term("w"), 380 one_word_no_offset, word_starts, now, NULL); 381 EXPECT_TRUE(scored_c.can_inline()); 382 EXPECT_FALSE(scored_c.match_in_scheme); 383 } 384} 385 386TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) { 387 const float hostname = GetTopicalityScoreOfTermAgainstURLAndTitle( 388 ASCIIToUTF16("def"), 389 ASCIIToUTF16("http://abc.def.com/"), 390 ASCIIToUTF16("Non-Matching Title")); 391 const float hostname_no_slash = GetTopicalityScoreOfTermAgainstURLAndTitle( 392 ASCIIToUTF16("def"), 393 ASCIIToUTF16("http://abc.def.com"), 394 ASCIIToUTF16("Non-Matching Title")); 395 EXPECT_EQ(hostname_no_slash, hostname); 396} 397 398// This function only tests scoring of single terms that match exactly 399// once somewhere in the URL or title. 
400TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) { 401 base::string16 url = ASCIIToUTF16("http://abc.def.com/path1/path2?" 402 "arg1=val1&arg2=val2#hash_component"); 403 base::string16 title = ASCIIToUTF16("here is a title"); 404 const float hostname_score = 405 GetTopicalityScoreOfTermAgainstURLAndTitle( 406 ASCIIToUTF16("abc"), url, title); 407 const float hostname_mid_word_score = 408 GetTopicalityScoreOfTermAgainstURLAndTitle( 409 ASCIIToUTF16("bc"), url, title); 410 const float domain_name_score = 411 GetTopicalityScoreOfTermAgainstURLAndTitle( 412 ASCIIToUTF16("def"), url, title); 413 const float domain_name_mid_word_score = 414 GetTopicalityScoreOfTermAgainstURLAndTitle( 415 ASCIIToUTF16("ef"), url, title); 416 const float tld_score = 417 GetTopicalityScoreOfTermAgainstURLAndTitle( 418 ASCIIToUTF16("com"), url, title); 419 const float tld_mid_word_score = 420 GetTopicalityScoreOfTermAgainstURLAndTitle( 421 ASCIIToUTF16("om"), url, title); 422 const float path_score = 423 GetTopicalityScoreOfTermAgainstURLAndTitle( 424 ASCIIToUTF16("path1"), url, title); 425 const float path_mid_word_score = 426 GetTopicalityScoreOfTermAgainstURLAndTitle( 427 ASCIIToUTF16("ath1"), url, title); 428 const float arg_score = 429 GetTopicalityScoreOfTermAgainstURLAndTitle( 430 ASCIIToUTF16("arg2"), url, title); 431 const float arg_mid_word_score = 432 GetTopicalityScoreOfTermAgainstURLAndTitle( 433 ASCIIToUTF16("rg2"), url, title); 434 const float protocol_score = 435 GetTopicalityScoreOfTermAgainstURLAndTitle( 436 ASCIIToUTF16("htt"), url, title); 437 const float protocol_mid_word_score = 438 GetTopicalityScoreOfTermAgainstURLAndTitle( 439 ASCIIToUTF16("tt"), url, title); 440 const float title_score = 441 GetTopicalityScoreOfTermAgainstURLAndTitle( 442 ASCIIToUTF16("her"), url, title); 443 const float title_mid_word_score = 444 GetTopicalityScoreOfTermAgainstURLAndTitle( 445 ASCIIToUTF16("er"), url, title); 446 // Verify hostname and domain name > path > arg. 
447 EXPECT_GT(hostname_score, path_score); 448 EXPECT_GT(domain_name_score, path_score); 449 EXPECT_GT(path_score, arg_score); 450 // Verify that domain name > path and domain name > arg for non-word 451 // boundaries. 452 EXPECT_GT(hostname_mid_word_score, path_mid_word_score); 453 EXPECT_GT(domain_name_mid_word_score, path_mid_word_score); 454 EXPECT_GT(domain_name_mid_word_score, arg_mid_word_score); 455 EXPECT_GT(hostname_mid_word_score, arg_mid_word_score); 456 // Also verify that the matches at non-word-boundaries all score 457 // worse than the matches at word boundaries. These three sets suffice. 458 EXPECT_GT(arg_score, hostname_mid_word_score); 459 EXPECT_GT(arg_score, domain_name_mid_word_score); 460 EXPECT_GT(title_score, title_mid_word_score); 461 // Check that title matches fit somewhere reasonable compared to the 462 // various types of URL matches. 463 EXPECT_GT(title_score, arg_score); 464 EXPECT_GT(arg_score, title_mid_word_score); 465 // Finally, verify that protocol matches and top level domain name 466 // matches (.com, .net, etc.) score worse than some of the mid-word 467 // matches that actually count. 468 EXPECT_GT(hostname_mid_word_score, protocol_score); 469 EXPECT_GT(hostname_mid_word_score, protocol_mid_word_score); 470 EXPECT_GT(hostname_mid_word_score, tld_score); 471 EXPECT_GT(hostname_mid_word_score, tld_mid_word_score); 472} 473 474} // namespace history 475