scored_history_match_unittest.cc revision e5d81f57cb97b3b6b7fccc9c5610d21eb81db09d
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include <algorithm>
6
7#include "base/auto_reset.h"
8#include "base/strings/string16.h"
9#include "base/strings/utf_string_conversions.h"
10#include "chrome/browser/bookmarks/bookmark_service.h"
11#include "chrome/browser/history/scored_history_match.h"
12#include "testing/gtest/include/gtest/gtest.h"
13
14using base::ASCIIToUTF16;
15
16namespace history {
17
18// Returns a VisitInfoVector that includes |num_visits| spread over the
19// last |frequency|*|num_visits| days (relative to |now|).  A frequency of
20// one means one visit each day, two means every other day, etc.
21VisitInfoVector CreateVisitInfoVector(int num_visits,
22                                      int frequency,
23                                      base::Time now) {
24  VisitInfoVector visits;
25  for (int i = 0; i < num_visits; ++i) {
26    visits.push_back(
27        std::make_pair(now - base::TimeDelta::FromDays(i * frequency),
28                       content::PAGE_TRANSITION_LINK));
29  }
30  return visits;
31}
32
33class ScoredHistoryMatchTest : public testing::Test {
34 protected:
35  // Convenience function to create a URLRow with basic data for |url|, |title|,
36  // |visit_count|, and |typed_count|. |days_since_last_visit| gives the number
37  // of days ago to which to set the URL's last_visit.
38  URLRow MakeURLRow(const char* url,
39                    const char* title,
40                    int visit_count,
41                    int days_since_last_visit,
42                    int typed_count);
43
44  // Convenience function to set the word starts information from a URLRow's
45  // URL and title.
46  void PopulateWordStarts(const URLRow& url_row, RowWordStarts* word_starts);
47
48  // Convenience functions for easily creating vectors of search terms.
49  String16Vector Make1Term(const char* term) const;
50  String16Vector Make2Terms(const char* term_1, const char* term_2) const;
51
52  // Convenience function for GetTopicalityScore() that builds the
53  // term match and word break information automatically that are needed
54  // to call GetTopicalityScore().  It only works for scoring a single term,
55  // not multiple terms.
56  float GetTopicalityScoreOfTermAgainstURLAndTitle(const base::string16& term,
57                                                   const base::string16& url,
58                                                   const base::string16& title);
59};
60
61URLRow ScoredHistoryMatchTest::MakeURLRow(const char* url,
62                                          const char* title,
63                                          int visit_count,
64                                          int days_since_last_visit,
65                                          int typed_count) {
66  URLRow row(GURL(url), 0);
67  row.set_title(ASCIIToUTF16(title));
68  row.set_visit_count(visit_count);
69  row.set_typed_count(typed_count);
70  row.set_last_visit(base::Time::NowFromSystemTime() -
71                     base::TimeDelta::FromDays(days_since_last_visit));
72  return row;
73}
74
75void ScoredHistoryMatchTest::PopulateWordStarts(
76    const URLRow& url_row, RowWordStarts* word_starts) {
77  String16SetFromString16(ASCIIToUTF16(url_row.url().spec()),
78                          &word_starts->url_word_starts_);
79  String16SetFromString16(url_row.title(), &word_starts->title_word_starts_);
80}
81
82
83String16Vector ScoredHistoryMatchTest::Make1Term(const char* term) const {
84  String16Vector original_terms;
85  original_terms.push_back(ASCIIToUTF16(term));
86  return original_terms;
87}
88
89String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1,
90                                                  const char* term_2) const {
91  String16Vector original_terms;
92  original_terms.push_back(ASCIIToUTF16(term_1));
93  original_terms.push_back(ASCIIToUTF16(term_2));
94  return original_terms;
95}
96
97float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle(
98    const base::string16& term,
99    const base::string16& url,
100    const base::string16& title) {
101  // Make an empty match and simply populate the fields we need in order
102  // to call GetTopicalityScore().
103  ScoredHistoryMatch scored_match;
104  scored_match.url_matches_ = MatchTermInString(term, url, 0);
105  scored_match.title_matches_ = MatchTermInString(term, title, 0);
106  RowWordStarts word_starts;
107  String16SetFromString16(url, &word_starts.url_word_starts_);
108  String16SetFromString16(title, &word_starts.title_word_starts_);
109  WordStarts one_word_no_offset(1, 0u);
110  return scored_match.GetTopicalityScore(1, url, one_word_no_offset,
111                                         word_starts);
112}
113
114TEST_F(ScoredHistoryMatchTest, Scoring) {
115  // We use NowFromSystemTime() because MakeURLRow uses the same function
116  // to calculate last visit time when building a row.
117  base::Time now = base::Time::NowFromSystemTime();
118
119  URLRow row_a(MakeURLRow("http://fedcba", "abcd bcd", 3, 30, 1));
120  RowWordStarts word_starts_a;
121  PopulateWordStarts(row_a, &word_starts_a);
122  WordStarts one_word_no_offset(1, 0u);
123  VisitInfoVector visits_a = CreateVisitInfoVector(3, 30, now);
124  // Mark one visit as typed.
125  visits_a[0].second = content::PAGE_TRANSITION_TYPED;
126  ScoredHistoryMatch scored_a(row_a, visits_a, std::string(),
127                              ASCIIToUTF16("abc"), Make1Term("abc"),
128                              one_word_no_offset, word_starts_a, now, NULL);
129
130  // Test scores based on visit_count.
131  URLRow row_b(MakeURLRow("http://abcdef", "abcd bcd", 10, 30, 1));
132  RowWordStarts word_starts_b;
133  PopulateWordStarts(row_b, &word_starts_b);
134  VisitInfoVector visits_b = CreateVisitInfoVector(10, 30, now);
135  visits_b[0].second = content::PAGE_TRANSITION_TYPED;
136  ScoredHistoryMatch scored_b(row_b, visits_b, std::string(),
137                              ASCIIToUTF16("abc"), Make1Term("abc"),
138                              one_word_no_offset, word_starts_b, now, NULL);
139  EXPECT_GT(scored_b.raw_score(), scored_a.raw_score());
140
141  // Test scores based on last_visit.
142  URLRow row_c(MakeURLRow("http://abcdef", "abcd bcd", 3, 10, 1));
143  RowWordStarts word_starts_c;
144  PopulateWordStarts(row_c, &word_starts_c);
145  VisitInfoVector visits_c = CreateVisitInfoVector(3, 10, now);
146  visits_c[0].second = content::PAGE_TRANSITION_TYPED;
147  ScoredHistoryMatch scored_c(row_c, visits_c, std::string(),
148                              ASCIIToUTF16("abc"), Make1Term("abc"),
149                              one_word_no_offset, word_starts_c, now, NULL);
150  EXPECT_GT(scored_c.raw_score(), scored_a.raw_score());
151
152  // Test scores based on typed_count.
153  URLRow row_d(MakeURLRow("http://abcdef", "abcd bcd", 3, 30, 3));
154  RowWordStarts word_starts_d;
155  PopulateWordStarts(row_d, &word_starts_d);
156  VisitInfoVector visits_d = CreateVisitInfoVector(3, 30, now);
157  visits_d[0].second = content::PAGE_TRANSITION_TYPED;
158  visits_d[1].second = content::PAGE_TRANSITION_TYPED;
159  visits_d[2].second = content::PAGE_TRANSITION_TYPED;
160  ScoredHistoryMatch scored_d(row_d, visits_d, std::string(),
161                              ASCIIToUTF16("abc"), Make1Term("abc"),
162                              one_word_no_offset, word_starts_d, now, NULL);
163  EXPECT_GT(scored_d.raw_score(), scored_a.raw_score());
164
165  // Test scores based on a terms appearing multiple times.
166  URLRow row_e(MakeURLRow("http://csi.csi.csi/csi_csi",
167      "CSI Guide to CSI Las Vegas, CSI New York, CSI Provo", 3, 30, 3));
168  RowWordStarts word_starts_e;
169  PopulateWordStarts(row_e, &word_starts_e);
170  const VisitInfoVector visits_e = visits_d;
171  ScoredHistoryMatch scored_e(row_e, visits_e, std::string(),
172                              ASCIIToUTF16("csi"), Make1Term("csi"),
173                              one_word_no_offset, word_starts_e, now, NULL);
174  EXPECT_LT(scored_e.raw_score(), 1400);
175
176  // Test that a result with only a mid-term match (i.e., not at a word
177  // boundary) scores 0.
178  ScoredHistoryMatch scored_f(row_a, visits_a, std::string(),
179                              ASCIIToUTF16("cd"), Make1Term("cd"),
180                              one_word_no_offset, word_starts_a, now, NULL);
181  EXPECT_EQ(scored_f.raw_score(), 0);
182}
183
184class BookmarkServiceMock : public BookmarkService {
185 public:
186  explicit BookmarkServiceMock(const GURL& url);
187  virtual ~BookmarkServiceMock() {}
188
189  // Returns true if the given |url| is the same as |url_|.
190  virtual bool IsBookmarked(const GURL& url) OVERRIDE;
191
192  // Required but unused.
193  virtual void GetBookmarks(std::vector<URLAndTitle>* bookmarks) OVERRIDE {}
194  virtual void BlockTillLoaded() OVERRIDE {}
195
196 private:
197  const GURL url_;
198
199  DISALLOW_COPY_AND_ASSIGN(BookmarkServiceMock);
200};
201
202BookmarkServiceMock::BookmarkServiceMock(const GURL& url)
203    : BookmarkService(),
204      url_(url) {
205}
206
207bool BookmarkServiceMock::IsBookmarked(const GURL& url) {
208  return url == url_;
209}
210
211TEST_F(ScoredHistoryMatchTest, ScoringBookmarks) {
212  // We use NowFromSystemTime() because MakeURLRow uses the same function
213  // to calculate last visit time when building a row.
214  base::Time now = base::Time::NowFromSystemTime();
215
216  std::string url_string("http://fedcba");
217  const GURL url(url_string);
218  URLRow row(MakeURLRow(url_string.c_str(), "abcd bcd", 8, 3, 1));
219  RowWordStarts word_starts;
220  PopulateWordStarts(row, &word_starts);
221  WordStarts one_word_no_offset(1, 0u);
222  VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);
223  ScoredHistoryMatch scored(row, visits, std::string(),
224                            ASCIIToUTF16("abc"), Make1Term("abc"),
225                            one_word_no_offset, word_starts, now, NULL);
226  // Now bookmark that URL and make sure its score increases.
227  base::AutoReset<int> reset(&ScoredHistoryMatch::bookmark_value_, 5);
228  BookmarkServiceMock bookmark_model_mock(url);
229  ScoredHistoryMatch scored_with_bookmark(
230      row, visits, std::string(), ASCIIToUTF16("abc"), Make1Term("abc"),
231      one_word_no_offset, word_starts, now, &bookmark_model_mock);
232  EXPECT_GT(scored_with_bookmark.raw_score(), scored.raw_score());
233}
234
235TEST_F(ScoredHistoryMatchTest, ScoringDiscountFrecency) {
236  // We use NowFromSystemTime() because MakeURLRow uses the same function
237  // to calculate last visit time when building a row.
238  base::Time now = base::Time::NowFromSystemTime();
239
240  std::string url_string("http://fedcba.com/");
241  const GURL url(url_string);
242  URLRow row(MakeURLRow(url_string.c_str(), "", 1, 1, 1));
243  RowWordStarts word_starts;
244  PopulateWordStarts(row, &word_starts);
245  WordStarts one_word_no_offset(1, 0u);
246  VisitInfoVector visits = CreateVisitInfoVector(1, 1, now);
247  ScoredHistoryMatch scored(row, visits, std::string(), ASCIIToUTF16("fed"),
248                            Make1Term("fed"), one_word_no_offset, word_starts,
249                            now, NULL);
250
251  // With properly discounted scores, the final raw_score should be lower.
252  base::AutoReset<bool> reset(
253      &ScoredHistoryMatch::discount_frecency_when_few_visits_, true);
254  ScoredHistoryMatch scored_with_discount_frecency(
255      row, visits, std::string(), ASCIIToUTF16("fed"),
256      Make1Term("fed"), one_word_no_offset, word_starts, now, NULL);
257  EXPECT_LT(scored_with_discount_frecency.raw_score(), scored.raw_score());
258}
259
260TEST_F(ScoredHistoryMatchTest, ScoringTLD) {
261  // We use NowFromSystemTime() because MakeURLRow uses the same function
262  // to calculate last visit time when building a row.
263  base::Time now = base::Time::NowFromSystemTime();
264
265  // By default the URL should not be returned for a query that includes "com".
266  std::string url_string("http://fedcba.com/");
267  const GURL url(url_string);
268  URLRow row(MakeURLRow(url_string.c_str(), "", 8, 3, 1));
269  RowWordStarts word_starts;
270  PopulateWordStarts(row, &word_starts);
271  WordStarts two_words_no_offsets(2, 0u);
272  VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);
273  ScoredHistoryMatch scored(row, visits, std::string(),
274                            ASCIIToUTF16("fed com"), Make2Terms("fed", "com"),
275                            two_words_no_offsets, word_starts, now, NULL);
276  EXPECT_EQ(0, scored.raw_score());
277
278  // Now allow credit for the match in the TLD.
279  base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_tld_matches_, true);
280  ScoredHistoryMatch scored_with_tld(
281      row, visits, std::string(), ASCIIToUTF16("fed com"),
282      Make2Terms("fed", "com"), two_words_no_offsets, word_starts, now, NULL);
283  EXPECT_GT(scored_with_tld.raw_score(), 0);
284}
285
286TEST_F(ScoredHistoryMatchTest, ScoringScheme) {
287  // We use NowFromSystemTime() because MakeURLRow uses the same function
288  // to calculate last visit time when building a row.
289  base::Time now = base::Time::NowFromSystemTime();
290
291  // By default the URL should not be returned for a query that includes "http".
292  std::string url_string("http://fedcba/");
293  const GURL url(url_string);
294  URLRow row(MakeURLRow(url_string.c_str(), "", 8, 3, 1));
295  RowWordStarts word_starts;
296  PopulateWordStarts(row, &word_starts);
297  WordStarts two_words_no_offsets(2, 0u);
298  VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);
299  ScoredHistoryMatch scored(row, visits, std::string(),
300                            ASCIIToUTF16("fed http"), Make2Terms("fed", "http"),
301                            two_words_no_offsets, word_starts, now, NULL);
302  EXPECT_EQ(0, scored.raw_score());
303
304  // Now allow credit for the match in the scheme.
305  base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_scheme_matches_, true);
306  ScoredHistoryMatch scored_with_scheme(
307      row, visits, std::string(), ASCIIToUTF16("fed http"),
308      Make2Terms("fed", "http"), two_words_no_offsets, word_starts, now, NULL);
309  EXPECT_GT(scored_with_scheme.raw_score(), 0);
310}
311
312TEST_F(ScoredHistoryMatchTest, Inlining) {
313  // We use NowFromSystemTime() because MakeURLRow uses the same function
314  // to calculate last visit time when building a row.
315  base::Time now = base::Time::NowFromSystemTime();
316  RowWordStarts word_starts;
317  WordStarts one_word_no_offset(1, 0u);
318  VisitInfoVector visits;
319
320  {
321    URLRow row(MakeURLRow("http://www.google.com", "abcdef", 3, 30, 1));
322    PopulateWordStarts(row, &word_starts);
323    ScoredHistoryMatch scored_a(row, visits, std::string(),
324                                ASCIIToUTF16("g"), Make1Term("g"),
325                                one_word_no_offset, word_starts, now, NULL);
326    EXPECT_TRUE(scored_a.can_inline());
327    EXPECT_FALSE(scored_a.match_in_scheme);
328    ScoredHistoryMatch scored_b(row, visits, std::string(),
329                                ASCIIToUTF16("w"), Make1Term("w"),
330                                one_word_no_offset, word_starts, now, NULL);
331    EXPECT_TRUE(scored_b.can_inline());
332    EXPECT_FALSE(scored_b.match_in_scheme);
333    ScoredHistoryMatch scored_c(row, visits, std::string(),
334                                ASCIIToUTF16("h"), Make1Term("h"),
335                                one_word_no_offset, word_starts, now, NULL);
336    EXPECT_TRUE(scored_c.can_inline());
337    EXPECT_TRUE(scored_c.match_in_scheme);
338    ScoredHistoryMatch scored_d(row, visits, std::string(),
339                                ASCIIToUTF16("o"), Make1Term("o"),
340                                one_word_no_offset, word_starts, now, NULL);
341    EXPECT_FALSE(scored_d.can_inline());
342    EXPECT_FALSE(scored_d.match_in_scheme);
343  }
344
345  {
346    URLRow row(MakeURLRow("http://teams.foo.com", "abcdef", 3, 30, 1));
347    PopulateWordStarts(row, &word_starts);
348    ScoredHistoryMatch scored_a(row, visits, std::string(),
349                                ASCIIToUTF16("t"), Make1Term("t"),
350                                one_word_no_offset, word_starts, now, NULL);
351    EXPECT_TRUE(scored_a.can_inline());
352    EXPECT_FALSE(scored_a.match_in_scheme);
353    ScoredHistoryMatch scored_b(row, visits, std::string(),
354                                ASCIIToUTF16("f"), Make1Term("f"),
355                                one_word_no_offset, word_starts, now, NULL);
356    EXPECT_FALSE(scored_b.can_inline());
357    EXPECT_FALSE(scored_b.match_in_scheme);
358    ScoredHistoryMatch scored_c(row, visits, std::string(),
359                                ASCIIToUTF16("o"), Make1Term("o"),
360                                one_word_no_offset, word_starts, now, NULL);
361    EXPECT_FALSE(scored_c.can_inline());
362    EXPECT_FALSE(scored_c.match_in_scheme);
363  }
364
365  {
366    URLRow row(MakeURLRow("https://www.testing.com", "abcdef", 3, 30, 1));
367    PopulateWordStarts(row, &word_starts);
368    ScoredHistoryMatch scored_a(row, visits, std::string(),
369                                ASCIIToUTF16("t"), Make1Term("t"),
370                                one_word_no_offset, word_starts, now, NULL);
371    EXPECT_TRUE(scored_a.can_inline());
372    EXPECT_FALSE(scored_a.match_in_scheme);
373    ScoredHistoryMatch scored_b(row, visits, std::string(),
374                                ASCIIToUTF16("h"), Make1Term("h"),
375                                one_word_no_offset, word_starts, now, NULL);
376    EXPECT_TRUE(scored_b.can_inline());
377    EXPECT_TRUE(scored_b.match_in_scheme);
378    ScoredHistoryMatch scored_c(row, visits, std::string(),
379                                ASCIIToUTF16("w"), Make1Term("w"),
380                                one_word_no_offset, word_starts, now, NULL);
381    EXPECT_TRUE(scored_c.can_inline());
382    EXPECT_FALSE(scored_c.match_in_scheme);
383  }
384}
385
386TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) {
387  const float hostname = GetTopicalityScoreOfTermAgainstURLAndTitle(
388      ASCIIToUTF16("def"),
389      ASCIIToUTF16("http://abc.def.com/"),
390      ASCIIToUTF16("Non-Matching Title"));
391  const float hostname_no_slash = GetTopicalityScoreOfTermAgainstURLAndTitle(
392      ASCIIToUTF16("def"),
393      ASCIIToUTF16("http://abc.def.com"),
394      ASCIIToUTF16("Non-Matching Title"));
395  EXPECT_EQ(hostname_no_slash, hostname);
396}
397
398// This function only tests scoring of single terms that match exactly
399// once somewhere in the URL or title.
400TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) {
401  base::string16 url = ASCIIToUTF16("http://abc.def.com/path1/path2?"
402      "arg1=val1&arg2=val2#hash_component");
403  base::string16 title = ASCIIToUTF16("here is a title");
404  const float hostname_score =
405      GetTopicalityScoreOfTermAgainstURLAndTitle(
406          ASCIIToUTF16("abc"), url, title);
407  const float hostname_mid_word_score =
408      GetTopicalityScoreOfTermAgainstURLAndTitle(
409          ASCIIToUTF16("bc"), url, title);
410  const float domain_name_score =
411      GetTopicalityScoreOfTermAgainstURLAndTitle(
412          ASCIIToUTF16("def"), url, title);
413  const float domain_name_mid_word_score =
414      GetTopicalityScoreOfTermAgainstURLAndTitle(
415          ASCIIToUTF16("ef"), url, title);
416  const float tld_score =
417      GetTopicalityScoreOfTermAgainstURLAndTitle(
418          ASCIIToUTF16("com"), url, title);
419  const float tld_mid_word_score =
420      GetTopicalityScoreOfTermAgainstURLAndTitle(
421          ASCIIToUTF16("om"), url, title);
422  const float path_score =
423      GetTopicalityScoreOfTermAgainstURLAndTitle(
424          ASCIIToUTF16("path1"), url, title);
425  const float path_mid_word_score =
426      GetTopicalityScoreOfTermAgainstURLAndTitle(
427          ASCIIToUTF16("ath1"), url, title);
428  const float arg_score =
429      GetTopicalityScoreOfTermAgainstURLAndTitle(
430          ASCIIToUTF16("arg2"), url, title);
431  const float arg_mid_word_score =
432      GetTopicalityScoreOfTermAgainstURLAndTitle(
433          ASCIIToUTF16("rg2"), url, title);
434  const float protocol_score =
435      GetTopicalityScoreOfTermAgainstURLAndTitle(
436          ASCIIToUTF16("htt"), url, title);
437  const float protocol_mid_word_score =
438      GetTopicalityScoreOfTermAgainstURLAndTitle(
439          ASCIIToUTF16("tt"), url, title);
440  const float title_score =
441      GetTopicalityScoreOfTermAgainstURLAndTitle(
442          ASCIIToUTF16("her"), url, title);
443  const float title_mid_word_score =
444      GetTopicalityScoreOfTermAgainstURLAndTitle(
445          ASCIIToUTF16("er"), url, title);
446  // Verify hostname and domain name > path > arg.
447  EXPECT_GT(hostname_score, path_score);
448  EXPECT_GT(domain_name_score, path_score);
449  EXPECT_GT(path_score, arg_score);
450  // Verify that domain name > path and domain name > arg for non-word
451  // boundaries.
452  EXPECT_GT(hostname_mid_word_score, path_mid_word_score);
453  EXPECT_GT(domain_name_mid_word_score, path_mid_word_score);
454  EXPECT_GT(domain_name_mid_word_score, arg_mid_word_score);
455  EXPECT_GT(hostname_mid_word_score, arg_mid_word_score);
456  // Also verify that the matches at non-word-boundaries all score
457  // worse than the matches at word boundaries.  These three sets suffice.
458  EXPECT_GT(arg_score, hostname_mid_word_score);
459  EXPECT_GT(arg_score, domain_name_mid_word_score);
460  EXPECT_GT(title_score, title_mid_word_score);
461  // Check that title matches fit somewhere reasonable compared to the
462  // various types of URL matches.
463  EXPECT_GT(title_score, arg_score);
464  EXPECT_GT(arg_score, title_mid_word_score);
465  // Finally, verify that protocol matches and top level domain name
466  // matches (.com, .net, etc.) score worse than some of the mid-word
467  // matches that actually count.
468  EXPECT_GT(hostname_mid_word_score, protocol_score);
469  EXPECT_GT(hostname_mid_word_score, protocol_mid_word_score);
470  EXPECT_GT(hostname_mid_word_score, tld_score);
471  EXPECT_GT(hostname_mid_word_score, tld_mid_word_score);
472}
473
474}  // namespace history
475