scored_history_match_unittest.cc revision 5821806d5e7f356e8fa4b058a389a808ea183019
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include <algorithm>
6
7#include "base/string16.h"
8#include "base/utf_string_conversions.h"
9#include "chrome/browser/api/bookmarks/bookmark_service.h"
10#include "chrome/browser/history/scored_history_match.h"
11#include "testing/gtest/include/gtest/gtest.h"
12
13namespace history {
14
15class ScoredHistoryMatchTest : public testing::Test {
16 protected:
17  // Convenience function to create a URLRow with basic data for |url|, |title|,
18  // |visit_count|, and |typed_count|. |days_since_last_visit| gives the number
19  // of days ago to which to set the URL's last_visit.
20  URLRow MakeURLRow(const char* url,
21                    const char* title,
22                    int visit_count,
23                    int days_since_last_visit,
24                    int typed_count);
25
26  // Convenience functions for easily creating vectors of search terms.
27  String16Vector Make1Term(const char* term) const;
28  String16Vector Make2Terms(const char* term_1, const char* term_2) const;
29
30  // Convenience function for GetTopicalityScore() that builds the
31  // term match and word break information automatically that are needed
32  // to call GetTopicalityScore().  It only works for scoring a single term,
33  // not multiple terms.
34  float GetTopicalityScoreOfTermAgainstURLAndTitle(const string16& term,
35                                                   const string16& url,
36                                                   const string16& title);
37};
38
39URLRow ScoredHistoryMatchTest::MakeURLRow(const char* url,
40                                          const char* title,
41                                          int visit_count,
42                                          int days_since_last_visit,
43                                          int typed_count) {
44  URLRow row(GURL(url), 0);
45  row.set_title(ASCIIToUTF16(title));
46  row.set_visit_count(visit_count);
47  row.set_typed_count(typed_count);
48  row.set_last_visit(base::Time::NowFromSystemTime() -
49                     base::TimeDelta::FromDays(days_since_last_visit));
50  return row;
51}
52
53String16Vector ScoredHistoryMatchTest::Make1Term(const char* term) const {
54  String16Vector original_terms;
55  original_terms.push_back(ASCIIToUTF16(term));
56  return original_terms;
57}
58
59String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1,
60                                                  const char* term_2) const {
61  String16Vector original_terms;
62  original_terms.push_back(ASCIIToUTF16(term_1));
63  original_terms.push_back(ASCIIToUTF16(term_2));
64  return original_terms;
65}
66
67float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle(
68    const string16& term,
69    const string16& url,
70    const string16& title) {
71  TermMatches url_matches = MatchTermInString(term, url, 0);
72  TermMatches title_matches = MatchTermInString(term, title, 0);
73  RowWordStarts word_starts;
74  String16SetFromString16(url, &word_starts.url_word_starts_);
75  String16SetFromString16(title, &word_starts.title_word_starts_);
76  return ScoredHistoryMatch::GetTopicalityScore(
77      1, url, url_matches, title_matches, word_starts);
78}
79
80TEST_F(ScoredHistoryMatchTest, Scoring) {
81  URLRow row_a(MakeURLRow("http://abcdef", "fedcba", 3, 30, 1));
82  // We use NowFromSystemTime() because MakeURLRow uses the same function
83  // to calculate last visit time when building a row.
84  base::Time now = base::Time::NowFromSystemTime();
85  RowWordStarts word_starts;
86
87  // Test scores based on position.
88  // TODO(mpearson): Test new_scoring if we're actually going to turn it
89  // on by default.  This requires setting word_starts, which isn't done
90  // right now.
91  ScoredHistoryMatch scored_a(row_a, ASCIIToUTF16("abc"), Make1Term("abc"),
92                              word_starts, now, NULL);
93  ScoredHistoryMatch scored_b(row_a, ASCIIToUTF16("bcd"), Make1Term("bcd"),
94                              word_starts, now, NULL);
95  EXPECT_GT(scored_a.raw_score, scored_b.raw_score);
96
97  // Test scores based on length.
98  ScoredHistoryMatch scored_c(row_a, ASCIIToUTF16("abcd"), Make1Term("abcd"),
99                              word_starts, now, NULL);
100  EXPECT_LT(scored_a.raw_score, scored_c.raw_score);
101
102  // Test scores based on order.
103  ScoredHistoryMatch scored_d(row_a, ASCIIToUTF16("abcdef"),
104                              Make2Terms("abc", "def"), word_starts, now, NULL);
105  ScoredHistoryMatch scored_e(row_a, ASCIIToUTF16("def abc"),
106                              Make2Terms("def", "abc"), word_starts, now, NULL);
107  EXPECT_GT(scored_d.raw_score, scored_e.raw_score);
108
109  // Test scores based on visit_count.
110  URLRow row_b(MakeURLRow("http://abcdef", "fedcba", 10, 30, 1));
111  ScoredHistoryMatch scored_f(row_b, ASCIIToUTF16("abc"), Make1Term("abc"),
112                              word_starts, now, NULL);
113  EXPECT_GT(scored_f.raw_score, scored_a.raw_score);
114
115  // Test scores based on last_visit.
116  URLRow row_c(MakeURLRow("http://abcdef", "fedcba", 3, 10, 1));
117  ScoredHistoryMatch scored_g(row_c, ASCIIToUTF16("abc"), Make1Term("abc"),
118                              word_starts, now, NULL);
119  EXPECT_GT(scored_g.raw_score, scored_a.raw_score);
120
121  // Test scores based on typed_count.
122  URLRow row_d(MakeURLRow("http://abcdef", "fedcba", 3, 30, 10));
123  ScoredHistoryMatch scored_h(row_d, ASCIIToUTF16("abc"), Make1Term("abc"),
124                              word_starts, now, NULL);
125  EXPECT_GT(scored_h.raw_score, scored_a.raw_score);
126
127  // Test scores based on a terms appearing multiple times.
128  URLRow row_i(MakeURLRow("http://csi.csi.csi/csi_csi",
129      "CSI Guide to CSI Las Vegas, CSI New York, CSI Provo", 3, 30, 10));
130  ScoredHistoryMatch scored_i(row_i, ASCIIToUTF16("csi"), Make1Term("csi"),
131                              word_starts, now, NULL);
132  EXPECT_LT(scored_i.raw_score, 1400);
133}
134
135TEST_F(ScoredHistoryMatchTest, Inlining) {
136  // We use NowFromSystemTime() because MakeURLRow uses the same function
137  // to calculate last visit time when building a row.
138  base::Time now = base::Time::NowFromSystemTime();
139  RowWordStarts word_starts;
140
141  {
142    URLRow row(MakeURLRow("http://www.google.com", "abcdef", 3, 30, 1));
143    ScoredHistoryMatch scored_a(row, ASCIIToUTF16("g"), Make1Term("g"),
144                                word_starts, now, NULL);
145    EXPECT_TRUE(scored_a.can_inline);
146    EXPECT_FALSE(scored_a.match_in_scheme);
147    ScoredHistoryMatch scored_b(row, ASCIIToUTF16("w"), Make1Term("w"),
148                                word_starts, now, NULL);
149    EXPECT_TRUE(scored_b.can_inline);
150    EXPECT_FALSE(scored_b.match_in_scheme);
151    ScoredHistoryMatch scored_c(row, ASCIIToUTF16("h"), Make1Term("h"),
152                                word_starts, now, NULL);
153    EXPECT_TRUE(scored_c.can_inline);
154    EXPECT_TRUE(scored_c.match_in_scheme);
155    ScoredHistoryMatch scored_d(row, ASCIIToUTF16("o"), Make1Term("o"),
156                                word_starts, now, NULL);
157    EXPECT_FALSE(scored_d.can_inline);
158    EXPECT_FALSE(scored_d.match_in_scheme);
159  }
160
161  {
162    URLRow row(MakeURLRow("http://teams.foo.com", "abcdef", 3, 30, 1));
163    ScoredHistoryMatch scored_a(row, ASCIIToUTF16("t"), Make1Term("t"),
164                                word_starts, now, NULL);
165    EXPECT_TRUE(scored_a.can_inline);
166    EXPECT_FALSE(scored_a.match_in_scheme);
167    ScoredHistoryMatch scored_b(row, ASCIIToUTF16("f"), Make1Term("f"),
168                                word_starts, now, NULL);
169    EXPECT_FALSE(scored_b.can_inline);
170    EXPECT_FALSE(scored_b.match_in_scheme);
171    ScoredHistoryMatch scored_c(row, ASCIIToUTF16("o"), Make1Term("o"),
172                                word_starts, now, NULL);
173    EXPECT_FALSE(scored_c.can_inline);
174    EXPECT_FALSE(scored_c.match_in_scheme);
175  }
176
177  {
178    URLRow row(MakeURLRow("https://www.testing.com", "abcdef", 3, 30, 1));
179    ScoredHistoryMatch scored_a(row, ASCIIToUTF16("t"), Make1Term("t"),
180                                word_starts, now, NULL);
181    EXPECT_TRUE(scored_a.can_inline);
182    EXPECT_FALSE(scored_a.match_in_scheme);
183    ScoredHistoryMatch scored_b(row, ASCIIToUTF16("h"), Make1Term("h"),
184                                word_starts, now, NULL);
185    EXPECT_TRUE(scored_b.can_inline);
186    EXPECT_TRUE(scored_b.match_in_scheme);
187    ScoredHistoryMatch scored_c(row, ASCIIToUTF16("w"), Make1Term("w"),
188                                word_starts, now, NULL);
189    EXPECT_TRUE(scored_c.can_inline);
190    EXPECT_FALSE(scored_c.match_in_scheme);
191  }
192}
193
194class BookmarkServiceMock : public BookmarkService {
195 public:
196  explicit BookmarkServiceMock(const GURL& url);
197  virtual ~BookmarkServiceMock() {}
198
199  // Returns true if the given |url| is the same as |url_|.
200  bool IsBookmarked(const GURL& url) OVERRIDE;
201
202  // Required but unused.
203  virtual void GetBookmarks(std::vector<URLAndTitle>* bookmarks) OVERRIDE {}
204  virtual void BlockTillLoaded() OVERRIDE {}
205
206 private:
207  const GURL url_;
208
209  DISALLOW_COPY_AND_ASSIGN(BookmarkServiceMock);
210};
211
212BookmarkServiceMock::BookmarkServiceMock(const GURL& url)
213    : BookmarkService(),
214      url_(url) {
215}
216
217bool BookmarkServiceMock::IsBookmarked(const GURL& url) {
218  return url == url_;
219}
220
221TEST_F(ScoredHistoryMatchTest, ScoringWithBookmarks) {
222  const GURL url("http://www.nanny.org");
223  BookmarkServiceMock bookmark_model_mock(url);
224  URLRow row_a(MakeURLRow("http://www.nanny.org", "Nanny", 3, 30, 1));
225  // We use NowFromSystemTime() because MakeURLRow uses the same function
226  // to calculate last visit time when building a row.
227  base::Time now = base::Time::NowFromSystemTime();
228  RowWordStarts word_starts;
229
230  // Identical queries but the first should be boosted by having a bookmark.
231  ScoredHistoryMatch scored_a(row_a, ASCIIToUTF16("nanny"), Make1Term("nanny"),
232                              word_starts, now, &bookmark_model_mock);
233  ScoredHistoryMatch scored_b(row_a, ASCIIToUTF16("nanny"), Make1Term("nanny"),
234                              word_starts, now, NULL);
235  EXPECT_GT(scored_a.raw_score, scored_b.raw_score);
236
237  // Identical queries, neither should be boosted by having a bookmark.
238  ScoredHistoryMatch scored_c(row_a, ASCIIToUTF16("stick"), Make1Term("stick"),
239                              word_starts, now, &bookmark_model_mock);
240  ScoredHistoryMatch scored_d(row_a, ASCIIToUTF16("stick"), Make1Term("stick"),
241                              word_starts, now, NULL);
242  EXPECT_EQ(scored_c.raw_score, scored_d.raw_score);
243}
244
245TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) {
246  const float hostname = GetTopicalityScoreOfTermAgainstURLAndTitle(
247      ASCIIToUTF16("def"),
248      ASCIIToUTF16("http://abc.def.com/"),
249      ASCIIToUTF16("Non-Matching Title"));
250  const float hostname_no_slash = GetTopicalityScoreOfTermAgainstURLAndTitle(
251      ASCIIToUTF16("def"),
252      ASCIIToUTF16("http://abc.def.com"),
253      ASCIIToUTF16("Non-Matching Title"));
254  EXPECT_EQ(hostname_no_slash, hostname);
255}
256
257// This function only tests scoring of single terms that match exactly
258// once somewhere in the URL or title.
259TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) {
260  string16 url = ASCIIToUTF16("http://abc.def.com/path1/path2?"
261      "arg1=val1&arg2=val2#hash_component");
262  string16 title = ASCIIToUTF16("here is a title");
263  const float hostname_score =
264      GetTopicalityScoreOfTermAgainstURLAndTitle(
265          ASCIIToUTF16("abc"), url, title);
266  const float hostname_mid_word_score =
267      GetTopicalityScoreOfTermAgainstURLAndTitle(
268          ASCIIToUTF16("bc"), url, title);
269  const float domain_name_score =
270      GetTopicalityScoreOfTermAgainstURLAndTitle(
271          ASCIIToUTF16("def"), url, title);
272  const float domain_name_mid_word_score =
273      GetTopicalityScoreOfTermAgainstURLAndTitle(
274          ASCIIToUTF16("ef"), url, title);
275  const float tld_score =
276      GetTopicalityScoreOfTermAgainstURLAndTitle(
277          ASCIIToUTF16("com"), url, title);
278  const float tld_mid_word_score =
279      GetTopicalityScoreOfTermAgainstURLAndTitle(
280          ASCIIToUTF16("om"), url, title);
281  const float path_score =
282      GetTopicalityScoreOfTermAgainstURLAndTitle(
283          ASCIIToUTF16("path1"), url, title);
284  const float path_mid_word_score =
285      GetTopicalityScoreOfTermAgainstURLAndTitle(
286          ASCIIToUTF16("ath1"), url, title);
287  const float arg_score =
288      GetTopicalityScoreOfTermAgainstURLAndTitle(
289          ASCIIToUTF16("arg2"), url, title);
290  const float arg_mid_word_score =
291      GetTopicalityScoreOfTermAgainstURLAndTitle(
292          ASCIIToUTF16("rg2"), url, title);
293  const float protocol_score =
294      GetTopicalityScoreOfTermAgainstURLAndTitle(
295          ASCIIToUTF16("htt"), url, title);
296  const float protocol_mid_word_score =
297      GetTopicalityScoreOfTermAgainstURLAndTitle(
298          ASCIIToUTF16("tt"), url, title);
299  const float title_score =
300      GetTopicalityScoreOfTermAgainstURLAndTitle(
301          ASCIIToUTF16("her"), url, title);
302  const float title_mid_word_score =
303      GetTopicalityScoreOfTermAgainstURLAndTitle(
304          ASCIIToUTF16("er"), url, title);
305  // Verify hostname and domain name > path > arg, and the same for the
306  // matches at non-word-boundaries.
307  EXPECT_GT(hostname_score, path_score);
308  EXPECT_GT(domain_name_score, path_score);
309  EXPECT_GT(path_score, arg_score);
310  EXPECT_GT(hostname_mid_word_score, path_mid_word_score);
311  EXPECT_GT(domain_name_mid_word_score, path_mid_word_score);
312  EXPECT_GT(path_mid_word_score, arg_mid_word_score);
313  // Also verify that the matches at non-word-boundaries all score
314  // worse than the matches at word boundaries.  These three sets suffice.
315  EXPECT_GT(arg_score, hostname_mid_word_score);
316  EXPECT_GT(arg_score, domain_name_mid_word_score);
317  EXPECT_GT(title_score, title_mid_word_score);
318  // Check that title matches fit somewhere reasonable compared to the
319  // various types of URL matches.
320  EXPECT_GT(title_score, arg_score);
321  EXPECT_GT(arg_score, title_mid_word_score);
322  EXPECT_GT(title_mid_word_score, arg_mid_word_score);
323  // Finally, verify that protocol matches and top level domain name
324  // matches (.com, .net, etc.) score worse than everything (except
325  // possibly mid-word matches in the ?arg section of the URL--I can
326  // imagine scoring those pretty harshly as well).
327  EXPECT_GT(path_mid_word_score, protocol_score);
328  EXPECT_GT(path_mid_word_score, protocol_mid_word_score);
329  EXPECT_GT(title_mid_word_score, protocol_score);
330  EXPECT_GT(title_mid_word_score, protocol_mid_word_score);
331  EXPECT_GT(path_mid_word_score, tld_score);
332  EXPECT_GT(path_mid_word_score, tld_mid_word_score);
333  EXPECT_GT(title_mid_word_score, tld_score);
334  EXPECT_GT(title_mid_word_score, tld_mid_word_score);
335}
336
337}  // namespace history
338