1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/autocomplete/bookmark_provider.h"
6
7#include <algorithm>
8#include <functional>
9#include <vector>
10
11#include "base/prefs/pref_service.h"
12#include "base/strings/string_util.h"
13#include "base/strings/utf_string_conversions.h"
14#include "chrome/browser/autocomplete/chrome_autocomplete_scheme_classifier.h"
15#include "chrome/browser/autocomplete/history_provider.h"
16#include "chrome/browser/bookmarks/bookmark_model_factory.h"
17#include "chrome/browser/profiles/profile.h"
18#include "chrome/common/pref_names.h"
19#include "components/bookmarks/browser/bookmark_match.h"
20#include "components/bookmarks/browser/bookmark_model.h"
21#include "components/metrics/proto/omnibox_input_type.pb.h"
22#include "components/omnibox/autocomplete_result.h"
23#include "components/omnibox/url_prefix.h"
24#include "net/base/net_util.h"
25
26using bookmarks::BookmarkMatch;
27
28typedef std::vector<BookmarkMatch> BookmarkMatches;
29
30namespace {
31
32// Removes leading spaces from |title| before displaying, otherwise it looks
33// funny.  In the process, corrects |title_match_positions| so the correct
34// characters are highlighted.
35void CorrectTitleAndMatchPositions(
36    base::string16* title,
37    BookmarkMatch::MatchPositions* title_match_positions) {
38  size_t leading_whitespace_chars = title->length();
39  base::TrimWhitespace(*title, base::TRIM_LEADING, title);
40  leading_whitespace_chars-= title->length();
41  if (leading_whitespace_chars == 0)
42    return;
43  for (query_parser::Snippet::MatchPositions::iterator it =
44           title_match_positions->begin();
45       it != title_match_positions->end(); ++it) {
46    (*it) = query_parser::Snippet::MatchPosition(
47        it->first - leading_whitespace_chars,
48        it->second - leading_whitespace_chars);
49  }
50}
51
52}  // namespace
53
54// BookmarkProvider ------------------------------------------------------------
55
56BookmarkProvider::BookmarkProvider(Profile* profile)
57    : AutocompleteProvider(AutocompleteProvider::TYPE_BOOKMARK),
58      profile_(profile),
59      bookmark_model_(NULL) {
60  if (profile) {
61    bookmark_model_ = BookmarkModelFactory::GetForProfile(profile);
62    languages_ = profile_->GetPrefs()->GetString(prefs::kAcceptLanguages);
63  }
64}
65
66void BookmarkProvider::Start(const AutocompleteInput& input,
67                             bool minimal_changes) {
68  if (minimal_changes)
69    return;
70  matches_.clear();
71
72  if (input.text().empty() ||
73      (input.type() == metrics::OmniboxInputType::FORCED_QUERY))
74    return;
75
76  DoAutocomplete(input);
77}
78
79BookmarkProvider::~BookmarkProvider() {}
80
81void BookmarkProvider::DoAutocomplete(const AutocompleteInput& input) {
82  // We may not have a bookmark model for some unit tests.
83  if (!bookmark_model_)
84    return;
85
86  BookmarkMatches matches;
87  // Retrieve enough bookmarks so that we have a reasonable probability of
88  // suggesting the one that the user desires.
89  const size_t kMaxBookmarkMatches = 50;
90
91  // GetBookmarksMatching returns bookmarks matching the user's
92  // search terms using the following rules:
93  //  - The search text is broken up into search terms. Each term is searched
94  //    for separately.
95  //  - Term matches are always performed against the start of a word. 'def'
96  //    will match against 'define' but not against 'indefinite'.
97  //  - Terms must be at least three characters in length in order to perform
98  //    partial word matches. Any term of lesser length will only be used as an
99  //    exact match. 'def' will match against 'define' but 'de' will not match.
100  //  - A search containing multiple terms will return results with those words
101  //    occuring in any order.
102  //  - Terms enclosed in quotes comprises a phrase that must match exactly.
103  //  - Multiple terms enclosed in quotes will require those exact words in that
104  //    exact order to match.
105  //
106  // Please refer to the code for BookmarkIndex::GetBookmarksMatching for
107  // complete details of how searches are performed against the user's
108  // bookmarks.
109  bookmark_model_->GetBookmarksMatching(input.text(),
110                                        kMaxBookmarkMatches,
111                                        &matches);
112  if (matches.empty())
113    return;  // There were no matches.
114  const base::string16 fixed_up_input(FixupUserInput(input).second);
115  for (BookmarkMatches::const_iterator i = matches.begin(); i != matches.end();
116       ++i) {
117    // Create and score the AutocompleteMatch. If its score is 0 then the
118    // match is discarded.
119    AutocompleteMatch match(BookmarkMatchToACMatch(input, fixed_up_input, *i));
120    if (match.relevance > 0)
121      matches_.push_back(match);
122  }
123
124  // Sort and clip the resulting matches.
125  size_t num_matches =
126      std::min(matches_.size(), AutocompleteProvider::kMaxMatches);
127  std::partial_sort(matches_.begin(), matches_.begin() + num_matches,
128                    matches_.end(), AutocompleteMatch::MoreRelevant);
129  matches_.resize(num_matches);
130}
131
132namespace {
133
134// for_each helper functor that calculates a match factor for each query term
135// when calculating the final score.
136//
137// Calculate a 'factor' from 0 to the bookmark's title length for a match
138// based on 1) how many characters match and 2) where the match is positioned.
139class ScoringFunctor {
140 public:
141  // |title_length| is the length of the bookmark title against which this
142  // match will be scored.
143  explicit ScoringFunctor(size_t title_length)
144      : title_length_(static_cast<double>(title_length)),
145        scoring_factor_(0.0) {
146  }
147
148  void operator()(const query_parser::Snippet::MatchPosition& match) {
149    double term_length = static_cast<double>(match.second - match.first);
150    scoring_factor_ += term_length *
151        (title_length_ - match.first) / title_length_;
152  }
153
154  double ScoringFactor() { return scoring_factor_; }
155
156 private:
157  double title_length_;
158  double scoring_factor_;
159};
160
161}  // namespace
162
163AutocompleteMatch BookmarkProvider::BookmarkMatchToACMatch(
164    const AutocompleteInput& input,
165    const base::string16& fixed_up_input_text,
166    const BookmarkMatch& bookmark_match) {
167  // The AutocompleteMatch we construct is non-deletable because the only
168  // way to support this would be to delete the underlying bookmark, which is
169  // unlikely to be what the user intends.
170  AutocompleteMatch match(this, 0, false,
171                          AutocompleteMatchType::BOOKMARK_TITLE);
172  base::string16 title(bookmark_match.node->GetTitle());
173  BookmarkMatch::MatchPositions new_title_match_positions =
174      bookmark_match.title_match_positions;
175  CorrectTitleAndMatchPositions(&title, &new_title_match_positions);
176  const GURL& url(bookmark_match.node->url());
177  const base::string16& url_utf16 = base::UTF8ToUTF16(url.spec());
178  size_t inline_autocomplete_offset = URLPrefix::GetInlineAutocompleteOffset(
179      input.text(), fixed_up_input_text, false, url_utf16);
180  match.destination_url = url;
181  const size_t match_start = bookmark_match.url_match_positions.empty() ?
182      0 : bookmark_match.url_match_positions[0].first;
183  const bool trim_http = !AutocompleteInput::HasHTTPScheme(input.text()) &&
184      ((match_start == base::string16::npos) || (match_start != 0));
185  std::vector<size_t> offsets = BookmarkMatch::OffsetsFromMatchPositions(
186      bookmark_match.url_match_positions);
187  // In addition to knowing how |offsets| is transformed, we need to know how
188  // |inline_autocomplete_offset| is transformed.  We add it to the end of
189  // |offsets|, compute how everything is transformed, then remove it from the
190  // end.
191  offsets.push_back(inline_autocomplete_offset);
192  match.contents = net::FormatUrlWithOffsets(url, languages_,
193      net::kFormatUrlOmitAll & ~(trim_http ? 0 : net::kFormatUrlOmitHTTP),
194      net::UnescapeRule::SPACES, NULL, NULL, &offsets);
195  inline_autocomplete_offset = offsets.back();
196  offsets.pop_back();
197  BookmarkMatch::MatchPositions new_url_match_positions =
198      BookmarkMatch::ReplaceOffsetsInMatchPositions(
199          bookmark_match.url_match_positions, offsets);
200  match.contents_class =
201      ClassificationsFromMatch(new_url_match_positions,
202                               match.contents.size(),
203                               true);
204  match.fill_into_edit =
205      AutocompleteInput::FormattedStringWithEquivalentMeaning(
206          url, match.contents, ChromeAutocompleteSchemeClassifier(profile_));
207  if (inline_autocomplete_offset != base::string16::npos) {
208    // |inline_autocomplete_offset| may be beyond the end of the
209    // |fill_into_edit| if the user has typed an URL with a scheme and the
210    // last character typed is a slash.  That slash is removed by the
211    // FormatURLWithOffsets call above.
212    if (inline_autocomplete_offset < match.fill_into_edit.length()) {
213      match.inline_autocompletion =
214          match.fill_into_edit.substr(inline_autocomplete_offset);
215    }
216    match.allowed_to_be_default_match = match.inline_autocompletion.empty() ||
217        !HistoryProvider::PreventInlineAutocomplete(input);
218  }
219  match.description = title;
220  match.description_class =
221      ClassificationsFromMatch(bookmark_match.title_match_positions,
222                               match.description.size(),
223                               false);
224
225  // Summary on how a relevance score is determined for the match:
226  //
227  // For each match within the bookmark's title or URL (or both), calculate a
228  // 'factor', sum up those factors, then use the sum to figure out a value
229  // between the base score and the maximum score.
230  //
231  // The factor for each match is the product of:
232  //
233  //  1) how many characters in the bookmark's title/URL are part of this match.
234  //     This is capped at the length of the bookmark's title
235  //     to prevent terms that match in both the title and the URL from
236  //     scoring too strongly.
237  //
238  //  2) where the match occurs within the bookmark's title or URL,
239  //     giving more points for matches that appear earlier in the string:
240  //       ((string_length - position of match start) / string_length).
241  //
242  //  Example: Given a bookmark title of 'abcde fghijklm', with a title length
243  //     of 14, and two different search terms, 'abcde' and 'fghij', with
244  //     start positions of 0 and 6, respectively, 'abcde' will score higher
245  //     (with a a partial factor of (14-0)/14 = 1.000 ) than 'fghij' (with
246  //     a partial factor of (14-6)/14 = 0.571 ).  (In this example neither
247  //     term matches in the URL.)
248  //
249  // Once all match factors have been calculated they are summed.  If there
250  // are no URL matches, the resulting sum will never be greater than the
251  // length of the bookmark title because of the way the bookmark model matches
252  // and removes overlaps.  (In particular, the bookmark model only
253  // matches terms to the beginning of words and it removes all overlapping
254  // matches, keeping only the longest.  Together these mean that each
255  // character is included in at most one match.)  If there are matches in the
256  // URL, the sum can be greater.
257  //
258  // This sum is then normalized by the length of the bookmark title + 10
259  // and capped at 1.0.  The +10 is to expand the scoring range so fewer
260  // bookmarks will hit the 1.0 cap and hence lose all ability to distinguish
261  // between these high-quality bookmarks.
262  //
263  // The normalized value is multiplied against the scoring range available,
264  // which is 299.  The 299 is calculated by subtracting the minimum possible
265  // score, 900, from the maximum possible score, 1199.  This product, ranging
266  // from 0 to 299, is added to the minimum possible score, 900, giving the
267  // preliminary score.
268  //
269  // If the preliminary score is less than the maximum possible score, 1199,
270  // it can be boosted up to that maximum possible score if the URL referenced
271  // by the bookmark is also referenced by any of the user's other bookmarks.
272  // A count of how many times the bookmark's URL is referenced is determined
273  // and, for each additional reference beyond the one for the bookmark being
274  // scored up to a maximum of three, the score is boosted by a fixed amount
275  // given by |kURLCountBoost|, below.
276  //
277
278  // Pretend empty titles are identical to the URL.
279  if (title.empty())
280    title = base::ASCIIToUTF16(url.spec());
281  ScoringFunctor title_position_functor =
282      for_each(bookmark_match.title_match_positions.begin(),
283               bookmark_match.title_match_positions.end(),
284               ScoringFunctor(title.size()));
285  ScoringFunctor url_position_functor =
286      for_each(bookmark_match.url_match_positions.begin(),
287               bookmark_match.url_match_positions.end(),
288               ScoringFunctor(bookmark_match.node->url().spec().length()));
289  const double summed_factors = title_position_functor.ScoringFactor() +
290      url_position_functor.ScoringFactor();
291  const double normalized_sum =
292      std::min(summed_factors / (title.size() + 10), 1.0);
293  const int kBaseBookmarkScore = 900;
294  const int kMaxBookmarkScore = 1199;
295  const double kBookmarkScoreRange =
296      static_cast<double>(kMaxBookmarkScore - kBaseBookmarkScore);
297  match.relevance = static_cast<int>(normalized_sum * kBookmarkScoreRange) +
298      kBaseBookmarkScore;
299  // Don't waste any time searching for additional referenced URLs if we
300  // already have a perfect title match.
301  if (match.relevance >= kMaxBookmarkScore)
302    return match;
303  // Boost the score if the bookmark's URL is referenced by other bookmarks.
304  const int kURLCountBoost[4] = { 0, 75, 125, 150 };
305  std::vector<const BookmarkNode*> nodes;
306  bookmark_model_->GetNodesByURL(url, &nodes);
307  DCHECK_GE(std::min(arraysize(kURLCountBoost), nodes.size()), 1U);
308  match.relevance +=
309      kURLCountBoost[std::min(arraysize(kURLCountBoost), nodes.size()) - 1];
310  match.relevance = std::min(kMaxBookmarkScore, match.relevance);
311  return match;
312}
313
314// static
315ACMatchClassifications BookmarkProvider::ClassificationsFromMatch(
316    const query_parser::Snippet::MatchPositions& positions,
317    size_t text_length,
318    bool is_url) {
319  ACMatchClassification::Style url_style =
320      is_url ? ACMatchClassification::URL : ACMatchClassification::NONE;
321  ACMatchClassifications classifications;
322  if (positions.empty()) {
323    if (text_length > 0)
324      classifications.push_back(ACMatchClassification(0, url_style));
325    return classifications;
326  }
327
328  for (query_parser::Snippet::MatchPositions::const_iterator i =
329           positions.begin();
330       i != positions.end();
331       ++i) {
332    AutocompleteMatch::ACMatchClassifications new_class;
333    AutocompleteMatch::ClassifyLocationInString(i->first, i->second - i->first,
334        text_length, url_style, &new_class);
335    classifications = AutocompleteMatch::MergeClassifications(
336        classifications, new_class);
337  }
338  return classifications;
339}
340