scored_history_match_unittest.cc revision 0529e5d033099cbfc42635f6f6183833b09dff6e
1// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include <algorithm> 6 7#include "base/auto_reset.h" 8#include "base/strings/string16.h" 9#include "base/strings/utf_string_conversions.h" 10#include "chrome/browser/history/scored_history_match.h" 11#include "components/bookmarks/core/browser/bookmark_service.h" 12#include "testing/gtest/include/gtest/gtest.h" 13 14using base::ASCIIToUTF16; 15 16namespace history { 17 18// Returns a VisitInfoVector that includes |num_visits| spread over the 19// last |frequency|*|num_visits| days (relative to |now|). A frequency of 20// one means one visit each day, two means every other day, etc. 21VisitInfoVector CreateVisitInfoVector(int num_visits, 22 int frequency, 23 base::Time now) { 24 VisitInfoVector visits; 25 for (int i = 0; i < num_visits; ++i) { 26 visits.push_back( 27 std::make_pair(now - base::TimeDelta::FromDays(i * frequency), 28 content::PAGE_TRANSITION_LINK)); 29 } 30 return visits; 31} 32 33class ScoredHistoryMatchTest : public testing::Test { 34 protected: 35 // Convenience function to create a URLRow with basic data for |url|, |title|, 36 // |visit_count|, and |typed_count|. |days_since_last_visit| gives the number 37 // of days ago to which to set the URL's last_visit. 38 URLRow MakeURLRow(const char* url, 39 const char* title, 40 int visit_count, 41 int days_since_last_visit, 42 int typed_count); 43 44 // Convenience function to set the word starts information from a URLRow's 45 // URL and title. 46 void PopulateWordStarts(const URLRow& url_row, RowWordStarts* word_starts); 47 48 // Convenience functions for easily creating vectors of search terms. 49 String16Vector Make1Term(const char* term) const; 50 String16Vector Make2Terms(const char* term_1, const char* term_2) const; 51 52 // Convenience function for GetTopicalityScore() that builds the 53 // term match and word break information automatically that are needed 54 // to call GetTopicalityScore(). It only works for scoring a single term, 55 // not multiple terms. 56 float GetTopicalityScoreOfTermAgainstURLAndTitle(const base::string16& term, 57 const base::string16& url, 58 const base::string16& title); 59}; 60 61URLRow ScoredHistoryMatchTest::MakeURLRow(const char* url, 62 const char* title, 63 int visit_count, 64 int days_since_last_visit, 65 int typed_count) { 66 URLRow row(GURL(url), 0); 67 row.set_title(ASCIIToUTF16(title)); 68 row.set_visit_count(visit_count); 69 row.set_typed_count(typed_count); 70 row.set_last_visit(base::Time::NowFromSystemTime() - 71 base::TimeDelta::FromDays(days_since_last_visit)); 72 return row; 73} 74 75void ScoredHistoryMatchTest::PopulateWordStarts( 76 const URLRow& url_row, RowWordStarts* word_starts) { 77 String16SetFromString16(ASCIIToUTF16(url_row.url().spec()), 78 &word_starts->url_word_starts_); 79 String16SetFromString16(url_row.title(), &word_starts->title_word_starts_); 80} 81 82 83String16Vector ScoredHistoryMatchTest::Make1Term(const char* term) const { 84 String16Vector original_terms; 85 original_terms.push_back(ASCIIToUTF16(term)); 86 return original_terms; 87} 88 89String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1, 90 const char* term_2) const { 91 String16Vector original_terms; 92 original_terms.push_back(ASCIIToUTF16(term_1)); 93 original_terms.push_back(ASCIIToUTF16(term_2)); 94 return original_terms; 95} 96 97float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle( 98 const base::string16& term, 99 const base::string16& url, 100 const base::string16& title) { 101 // Make an empty match and simply populate the fields we need in order 102 // to call GetTopicalityScore(). 103 ScoredHistoryMatch scored_match; 104 scored_match.url_matches_ = MatchTermInString(term, url, 0); 105 scored_match.title_matches_ = MatchTermInString(term, title, 0); 106 RowWordStarts word_starts; 107 String16SetFromString16(url, &word_starts.url_word_starts_); 108 String16SetFromString16(title, &word_starts.title_word_starts_); 109 WordStarts one_word_no_offset(1, 0u); 110 return scored_match.GetTopicalityScore(1, url, one_word_no_offset, 111 word_starts); 112} 113 114TEST_F(ScoredHistoryMatchTest, Scoring) { 115 // We use NowFromSystemTime() because MakeURLRow uses the same function 116 // to calculate last visit time when building a row. 117 base::Time now = base::Time::NowFromSystemTime(); 118 119 URLRow row_a(MakeURLRow("http://fedcba", "abcd bcd", 3, 30, 1)); 120 RowWordStarts word_starts_a; 121 PopulateWordStarts(row_a, &word_starts_a); 122 WordStarts one_word_no_offset(1, 0u); 123 VisitInfoVector visits_a = CreateVisitInfoVector(3, 30, now); 124 // Mark one visit as typed. 125 visits_a[0].second = content::PAGE_TRANSITION_TYPED; 126 ScoredHistoryMatch scored_a(row_a, visits_a, std::string(), 127 ASCIIToUTF16("abc"), Make1Term("abc"), 128 one_word_no_offset, word_starts_a, now, NULL); 129 130 // Test scores based on visit_count. 131 URLRow row_b(MakeURLRow("http://abcdef", "abcd bcd", 10, 30, 1)); 132 RowWordStarts word_starts_b; 133 PopulateWordStarts(row_b, &word_starts_b); 134 VisitInfoVector visits_b = CreateVisitInfoVector(10, 30, now); 135 visits_b[0].second = content::PAGE_TRANSITION_TYPED; 136 ScoredHistoryMatch scored_b(row_b, visits_b, std::string(), 137 ASCIIToUTF16("abc"), Make1Term("abc"), 138 one_word_no_offset, word_starts_b, now, NULL); 139 EXPECT_GT(scored_b.raw_score(), scored_a.raw_score()); 140 141 // Test scores based on last_visit. 142 URLRow row_c(MakeURLRow("http://abcdef", "abcd bcd", 3, 10, 1)); 143 RowWordStarts word_starts_c; 144 PopulateWordStarts(row_c, &word_starts_c); 145 VisitInfoVector visits_c = CreateVisitInfoVector(3, 10, now); 146 visits_c[0].second = content::PAGE_TRANSITION_TYPED; 147 ScoredHistoryMatch scored_c(row_c, visits_c, std::string(), 148 ASCIIToUTF16("abc"), Make1Term("abc"), 149 one_word_no_offset, word_starts_c, now, NULL); 150 EXPECT_GT(scored_c.raw_score(), scored_a.raw_score()); 151 152 // Test scores based on typed_count. 153 URLRow row_d(MakeURLRow("http://abcdef", "abcd bcd", 3, 30, 3)); 154 RowWordStarts word_starts_d; 155 PopulateWordStarts(row_d, &word_starts_d); 156 VisitInfoVector visits_d = CreateVisitInfoVector(3, 30, now); 157 visits_d[0].second = content::PAGE_TRANSITION_TYPED; 158 visits_d[1].second = content::PAGE_TRANSITION_TYPED; 159 visits_d[2].second = content::PAGE_TRANSITION_TYPED; 160 ScoredHistoryMatch scored_d(row_d, visits_d, std::string(), 161 ASCIIToUTF16("abc"), Make1Term("abc"), 162 one_word_no_offset, word_starts_d, now, NULL); 163 EXPECT_GT(scored_d.raw_score(), scored_a.raw_score()); 164 165 // Test scores based on a terms appearing multiple times. 166 URLRow row_e(MakeURLRow("http://csi.csi.csi/csi_csi", 167 "CSI Guide to CSI Las Vegas, CSI New York, CSI Provo", 3, 30, 3)); 168 RowWordStarts word_starts_e; 169 PopulateWordStarts(row_e, &word_starts_e); 170 const VisitInfoVector visits_e = visits_d; 171 ScoredHistoryMatch scored_e(row_e, visits_e, std::string(), 172 ASCIIToUTF16("csi"), Make1Term("csi"), 173 one_word_no_offset, word_starts_e, now, NULL); 174 EXPECT_LT(scored_e.raw_score(), 1400); 175 176 // Test that a result with only a mid-term match (i.e., not at a word 177 // boundary) scores 0. 178 ScoredHistoryMatch scored_f(row_a, visits_a, std::string(), 179 ASCIIToUTF16("cd"), Make1Term("cd"), 180 one_word_no_offset, word_starts_a, now, NULL); 181 EXPECT_EQ(scored_f.raw_score(), 0); 182} 183 184class BookmarkServiceMock : public BookmarkService { 185 public: 186 explicit BookmarkServiceMock(const GURL& url); 187 virtual ~BookmarkServiceMock() {} 188 189 // Returns true if the given |url| is the same as |url_|. 190 virtual bool IsBookmarked(const GURL& url) OVERRIDE; 191 192 // Required but unused. 193 virtual void GetBookmarks(std::vector<URLAndTitle>* bookmarks) OVERRIDE {} 194 virtual void BlockTillLoaded() OVERRIDE {} 195 196 private: 197 const GURL url_; 198 199 DISALLOW_COPY_AND_ASSIGN(BookmarkServiceMock); 200}; 201 202BookmarkServiceMock::BookmarkServiceMock(const GURL& url) 203 : BookmarkService(), 204 url_(url) { 205} 206 207bool BookmarkServiceMock::IsBookmarked(const GURL& url) { 208 return url == url_; 209} 210 211TEST_F(ScoredHistoryMatchTest, ScoringBookmarks) { 212 // We use NowFromSystemTime() because MakeURLRow uses the same function 213 // to calculate last visit time when building a row. 214 base::Time now = base::Time::NowFromSystemTime(); 215 216 std::string url_string("http://fedcba"); 217 const GURL url(url_string); 218 URLRow row(MakeURLRow(url_string.c_str(), "abcd bcd", 8, 3, 1)); 219 RowWordStarts word_starts; 220 PopulateWordStarts(row, &word_starts); 221 WordStarts one_word_no_offset(1, 0u); 222 VisitInfoVector visits = CreateVisitInfoVector(8, 3, now); 223 ScoredHistoryMatch scored(row, visits, std::string(), 224 ASCIIToUTF16("abc"), Make1Term("abc"), 225 one_word_no_offset, word_starts, now, NULL); 226 // Now bookmark that URL and make sure its score increases. 227 base::AutoReset<int> reset(&ScoredHistoryMatch::bookmark_value_, 5); 228 BookmarkServiceMock bookmark_service_mock(url); 229 ScoredHistoryMatch scored_with_bookmark( 230 row, visits, std::string(), ASCIIToUTF16("abc"), Make1Term("abc"), 231 one_word_no_offset, word_starts, now, &bookmark_service_mock); 232 EXPECT_GT(scored_with_bookmark.raw_score(), scored.raw_score()); 233} 234 235TEST_F(ScoredHistoryMatchTest, ScoringTLD) { 236 // We use NowFromSystemTime() because MakeURLRow uses the same function 237 // to calculate last visit time when building a row. 238 base::Time now = base::Time::NowFromSystemTime(); 239 240 // By default the URL should not be returned for a query that includes "com". 241 std::string url_string("http://fedcba.com/"); 242 const GURL url(url_string); 243 URLRow row(MakeURLRow(url_string.c_str(), "", 8, 3, 1)); 244 RowWordStarts word_starts; 245 PopulateWordStarts(row, &word_starts); 246 WordStarts two_words_no_offsets(2, 0u); 247 VisitInfoVector visits = CreateVisitInfoVector(8, 3, now); 248 ScoredHistoryMatch scored(row, visits, std::string(), 249 ASCIIToUTF16("fed com"), Make2Terms("fed", "com"), 250 two_words_no_offsets, word_starts, now, NULL); 251 EXPECT_EQ(0, scored.raw_score()); 252 253 // Now allow credit for the match in the TLD. 254 base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_tld_matches_, true); 255 ScoredHistoryMatch scored_with_tld( 256 row, visits, std::string(), ASCIIToUTF16("fed com"), 257 Make2Terms("fed", "com"), two_words_no_offsets, word_starts, now, NULL); 258 EXPECT_GT(scored_with_tld.raw_score(), 0); 259} 260 261TEST_F(ScoredHistoryMatchTest, ScoringScheme) { 262 // We use NowFromSystemTime() because MakeURLRow uses the same function 263 // to calculate last visit time when building a row. 264 base::Time now = base::Time::NowFromSystemTime(); 265 266 // By default the URL should not be returned for a query that includes "http". 267 std::string url_string("http://fedcba/"); 268 const GURL url(url_string); 269 URLRow row(MakeURLRow(url_string.c_str(), "", 8, 3, 1)); 270 RowWordStarts word_starts; 271 PopulateWordStarts(row, &word_starts); 272 WordStarts two_words_no_offsets(2, 0u); 273 VisitInfoVector visits = CreateVisitInfoVector(8, 3, now); 274 ScoredHistoryMatch scored(row, visits, std::string(), 275 ASCIIToUTF16("fed http"), Make2Terms("fed", "http"), 276 two_words_no_offsets, word_starts, now, NULL); 277 EXPECT_EQ(0, scored.raw_score()); 278 279 // Now allow credit for the match in the scheme. 280 base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_scheme_matches_, true); 281 ScoredHistoryMatch scored_with_scheme( 282 row, visits, std::string(), ASCIIToUTF16("fed http"), 283 Make2Terms("fed", "http"), two_words_no_offsets, word_starts, now, NULL); 284 EXPECT_GT(scored_with_scheme.raw_score(), 0); 285} 286 287TEST_F(ScoredHistoryMatchTest, Inlining) { 288 // We use NowFromSystemTime() because MakeURLRow uses the same function 289 // to calculate last visit time when building a row. 290 base::Time now = base::Time::NowFromSystemTime(); 291 RowWordStarts word_starts; 292 WordStarts one_word_no_offset(1, 0u); 293 VisitInfoVector visits; 294 295 { 296 URLRow row(MakeURLRow("http://www.google.com", "abcdef", 3, 30, 1)); 297 PopulateWordStarts(row, &word_starts); 298 ScoredHistoryMatch scored_a(row, visits, std::string(), 299 ASCIIToUTF16("g"), Make1Term("g"), 300 one_word_no_offset, word_starts, now, NULL); 301 EXPECT_TRUE(scored_a.can_inline()); 302 EXPECT_FALSE(scored_a.match_in_scheme); 303 ScoredHistoryMatch scored_b(row, visits, std::string(), 304 ASCIIToUTF16("w"), Make1Term("w"), 305 one_word_no_offset, word_starts, now, NULL); 306 EXPECT_TRUE(scored_b.can_inline()); 307 EXPECT_FALSE(scored_b.match_in_scheme); 308 ScoredHistoryMatch scored_c(row, visits, std::string(), 309 ASCIIToUTF16("h"), Make1Term("h"), 310 one_word_no_offset, word_starts, now, NULL); 311 EXPECT_TRUE(scored_c.can_inline()); 312 EXPECT_TRUE(scored_c.match_in_scheme); 313 ScoredHistoryMatch scored_d(row, visits, std::string(), 314 ASCIIToUTF16("o"), Make1Term("o"), 315 one_word_no_offset, word_starts, now, NULL); 316 EXPECT_FALSE(scored_d.can_inline()); 317 EXPECT_FALSE(scored_d.match_in_scheme); 318 } 319 320 { 321 URLRow row(MakeURLRow("http://teams.foo.com", "abcdef", 3, 30, 1)); 322 PopulateWordStarts(row, &word_starts); 323 ScoredHistoryMatch scored_a(row, visits, std::string(), 324 ASCIIToUTF16("t"), Make1Term("t"), 325 one_word_no_offset, word_starts, now, NULL); 326 EXPECT_TRUE(scored_a.can_inline()); 327 EXPECT_FALSE(scored_a.match_in_scheme); 328 ScoredHistoryMatch scored_b(row, visits, std::string(), 329 ASCIIToUTF16("f"), Make1Term("f"), 330 one_word_no_offset, word_starts, now, NULL); 331 EXPECT_FALSE(scored_b.can_inline()); 332 EXPECT_FALSE(scored_b.match_in_scheme); 333 ScoredHistoryMatch scored_c(row, visits, std::string(), 334 ASCIIToUTF16("o"), Make1Term("o"), 335 one_word_no_offset, word_starts, now, NULL); 336 EXPECT_FALSE(scored_c.can_inline()); 337 EXPECT_FALSE(scored_c.match_in_scheme); 338 } 339 340 { 341 URLRow row(MakeURLRow("https://www.testing.com", "abcdef", 3, 30, 1)); 342 PopulateWordStarts(row, &word_starts); 343 ScoredHistoryMatch scored_a(row, visits, std::string(), 344 ASCIIToUTF16("t"), Make1Term("t"), 345 one_word_no_offset, word_starts, now, NULL); 346 EXPECT_TRUE(scored_a.can_inline()); 347 EXPECT_FALSE(scored_a.match_in_scheme); 348 ScoredHistoryMatch scored_b(row, visits, std::string(), 349 ASCIIToUTF16("h"), Make1Term("h"), 350 one_word_no_offset, word_starts, now, NULL); 351 EXPECT_TRUE(scored_b.can_inline()); 352 EXPECT_TRUE(scored_b.match_in_scheme); 353 ScoredHistoryMatch scored_c(row, visits, std::string(), 354 ASCIIToUTF16("w"), Make1Term("w"), 355 one_word_no_offset, word_starts, now, NULL); 356 EXPECT_TRUE(scored_c.can_inline()); 357 EXPECT_FALSE(scored_c.match_in_scheme); 358 } 359} 360 361TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) { 362 const float hostname = GetTopicalityScoreOfTermAgainstURLAndTitle( 363 ASCIIToUTF16("def"), 364 ASCIIToUTF16("http://abc.def.com/"), 365 ASCIIToUTF16("Non-Matching Title")); 366 const float hostname_no_slash = GetTopicalityScoreOfTermAgainstURLAndTitle( 367 ASCIIToUTF16("def"), 368 ASCIIToUTF16("http://abc.def.com"), 369 ASCIIToUTF16("Non-Matching Title")); 370 EXPECT_EQ(hostname_no_slash, hostname); 371} 372 373// This function only tests scoring of single terms that match exactly 374// once somewhere in the URL or title. 375TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) { 376 base::string16 url = ASCIIToUTF16("http://abc.def.com/path1/path2?" 377 "arg1=val1&arg2=val2#hash_component"); 378 base::string16 title = ASCIIToUTF16("here is a title"); 379 const float hostname_score = 380 GetTopicalityScoreOfTermAgainstURLAndTitle( 381 ASCIIToUTF16("abc"), url, title); 382 const float hostname_mid_word_score = 383 GetTopicalityScoreOfTermAgainstURLAndTitle( 384 ASCIIToUTF16("bc"), url, title); 385 const float domain_name_score = 386 GetTopicalityScoreOfTermAgainstURLAndTitle( 387 ASCIIToUTF16("def"), url, title); 388 const float domain_name_mid_word_score = 389 GetTopicalityScoreOfTermAgainstURLAndTitle( 390 ASCIIToUTF16("ef"), url, title); 391 const float tld_score = 392 GetTopicalityScoreOfTermAgainstURLAndTitle( 393 ASCIIToUTF16("com"), url, title); 394 const float tld_mid_word_score = 395 GetTopicalityScoreOfTermAgainstURLAndTitle( 396 ASCIIToUTF16("om"), url, title); 397 const float path_score = 398 GetTopicalityScoreOfTermAgainstURLAndTitle( 399 ASCIIToUTF16("path1"), url, title); 400 const float path_mid_word_score = 401 GetTopicalityScoreOfTermAgainstURLAndTitle( 402 ASCIIToUTF16("ath1"), url, title); 403 const float arg_score = 404 GetTopicalityScoreOfTermAgainstURLAndTitle( 405 ASCIIToUTF16("arg2"), url, title); 406 const float arg_mid_word_score = 407 GetTopicalityScoreOfTermAgainstURLAndTitle( 408 ASCIIToUTF16("rg2"), url, title); 409 const float protocol_score = 410 GetTopicalityScoreOfTermAgainstURLAndTitle( 411 ASCIIToUTF16("htt"), url, title); 412 const float protocol_mid_word_score = 413 GetTopicalityScoreOfTermAgainstURLAndTitle( 414 ASCIIToUTF16("tt"), url, title); 415 const float title_score = 416 GetTopicalityScoreOfTermAgainstURLAndTitle( 417 ASCIIToUTF16("her"), url, title); 418 const float title_mid_word_score = 419 GetTopicalityScoreOfTermAgainstURLAndTitle( 420 ASCIIToUTF16("er"), url, title); 421 // Verify hostname and domain name > path > arg. 422 EXPECT_GT(hostname_score, path_score); 423 EXPECT_GT(domain_name_score, path_score); 424 EXPECT_GT(path_score, arg_score); 425 // Verify that domain name > path and domain name > arg for non-word 426 // boundaries. 427 EXPECT_GT(hostname_mid_word_score, path_mid_word_score); 428 EXPECT_GT(domain_name_mid_word_score, path_mid_word_score); 429 EXPECT_GT(domain_name_mid_word_score, arg_mid_word_score); 430 EXPECT_GT(hostname_mid_word_score, arg_mid_word_score); 431 // Also verify that the matches at non-word-boundaries all score 432 // worse than the matches at word boundaries. These three sets suffice. 433 EXPECT_GT(arg_score, hostname_mid_word_score); 434 EXPECT_GT(arg_score, domain_name_mid_word_score); 435 EXPECT_GT(title_score, title_mid_word_score); 436 // Check that title matches fit somewhere reasonable compared to the 437 // various types of URL matches. 438 EXPECT_GT(title_score, arg_score); 439 EXPECT_GT(arg_score, title_mid_word_score); 440 // Finally, verify that protocol matches and top level domain name 441 // matches (.com, .net, etc.) score worse than some of the mid-word 442 // matches that actually count. 443 EXPECT_GT(hostname_mid_word_score, protocol_score); 444 EXPECT_GT(hostname_mid_word_score, protocol_mid_word_score); 445 EXPECT_GT(hostname_mid_word_score, tld_score); 446 EXPECT_GT(hostname_mid_word_score, tld_mid_word_score); 447} 448 449} // namespace history 450