in_memory_url_index.cc revision 3345a6884c488ff3a535c2c9acdd33d74b37e311
1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/history/in_memory_url_index.h"
6
7#include <algorithm>
8#include <limits>
9
10#include "app/l10n_util.h"
11#include "base/i18n/word_iterator.h"
12#include "base/string_util.h"
13#include "base/time.h"
14#include "base/utf_string_conversions.h"
15#include "chrome/browser/autocomplete/history_provider_util.h"
16#include "chrome/browser/history/url_database.h"
17#include "net/base/escape.h"
18#include "net/base/net_util.h"
19
20using base::Time;
21using base::TimeDelta;
22
23namespace history {
24
25ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {}
26
27ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info,
28                                       size_t input_location,
29                                       bool match_in_scheme,
30                                       bool innermost_match,
31                                       int score)
32    : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match),
33      raw_score(score) {
34}
35
36InMemoryURLIndex::InMemoryURLIndex() : history_item_count_(0) {}
37
38InMemoryURLIndex::~InMemoryURLIndex() {}
39
40static const int32_t kURLToLowerBufferSize = 2048;
41
42// Indexing
43
44bool InMemoryURLIndex::Init(history::URLDatabase* history_db,
45                            const std::string& languages) {
46  // TODO(mrossetti): Register for profile/language change notifications.
47  languages_ = languages;
48  // Reset our indexes.
49  char_word_map_.clear();
50  word_id_history_map_.clear();
51  if (!history_db)
52    return false;
53  URLDatabase::URLEnumerator history_enum;
54  if (history_db->InitURLEnumeratorForEverything(&history_enum)) {
55    URLRow row;
56    Time recent_threshold = InMemoryURLIndex::RecentThreshold();
57    while (history_enum.GetNextURL(&row)) {
58      // Do some filtering so that we only get history items which could
59      // possibly pass the HistoryURLProvider::CullPoorMatches filter later.
60      if ((row.typed_count() > kLowQualityMatchTypedLimit) ||
61          (row.visit_count() > kLowQualityMatchVisitLimit) ||
62          (row.last_visit() >= recent_threshold)) {
63        if (!IndexRow(row))
64          return false;
65      }
66    }
67  }
68  return true;
69}
70
71bool InMemoryURLIndex::IndexRow(URLRow row) {
72  const GURL& gurl(row.url());
73  string16 url(net::FormatUrl(gurl, languages_,
74      net::kFormatUrlOmitUsernamePassword,
75      UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS,
76      NULL, NULL, NULL));
77
78  // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max().
79  HistoryID history_id = static_cast<HistoryID>(row.id());
80
81  // Add the row for quick lookup in the history info store.
82  url = l10n_util::ToLower(url);
83  URLRow new_row(GURL(url), row.id());
84  new_row.set_visit_count(row.visit_count());
85  new_row.set_typed_count(row.typed_count());
86  new_row.set_last_visit(row.last_visit());
87  new_row.set_title(row.title());
88  history_info_map_.insert(std::make_pair(history_id, new_row));
89
90  // Split into individual, unique words.
91  String16Set words = WordSetFromString16(url);
92
93  // For each word, add a new entry into the word index referring to the
94  // associated history item.
95  for (String16Set::iterator iter = words.begin();
96       iter != words.end(); ++iter) {
97    String16Set::value_type uni_word = *iter;
98    AddWordToIndex(uni_word, history_id);
99  }
100  ++history_item_count_;
101  return true;
102}
103
104// Searching
105
106ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(
107    const String16Vector& terms) {
108  ScoredHistoryMatches scored_items;
109  if (!terms.empty()) {
110    // Reset used_ flags for term_char_word_set_cache_. We use a basic mark-
111    // and-sweep approach.
112    ResetTermCharWordSetCache();
113
114    // Lowercase the terms.
115    // TODO(mrossetti): Another opportunity for a transform algorithm.
116    String16Vector lower_terms;
117    for (String16Vector::const_iterator term_iter = terms.begin();
118         term_iter != terms.end(); ++term_iter) {
119      String16Vector::value_type lower_string(*term_iter);
120      std::transform(lower_string.begin(),
121                     lower_string.end(),
122                     lower_string.begin(),
123                     tolower);
124      lower_terms.push_back(lower_string);
125    }
126
127    String16Vector::value_type all_terms(JoinString(lower_terms, ' '));
128    HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);
129
130    // Pass over all of the candidates filtering out any without a proper
131    // substring match, inserting those which pass in order by score.
132    scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
133                                 AddHistoryMatch(*this,
134                                                 lower_terms)).ScoredMatches();
135  }
136
137  // Remove any stale TermCharWordSet's.
138  term_char_word_set_cache_.erase(
139      std::remove_if(term_char_word_set_cache_.begin(),
140                     term_char_word_set_cache_.end(),
141                     std::mem_fun_ref(&TermCharWordSet::IsNotUsed)),
142      term_char_word_set_cache_.end());
143  return scored_items;
144}
145
146void InMemoryURLIndex::ResetTermCharWordSetCache() {
147  // TODO(mrossetti): Consider keeping more of the cache around for possible
148  // repeat searches, except a 'shortcuts' approach might be better for that.
149  // TODO(mrossetti): Change TermCharWordSet to a class and use for_each.
150  for (TermCharWordSetVector::iterator iter =
151       term_char_word_set_cache_.begin();
152       iter != term_char_word_set_cache_.end(); ++iter)
153    iter->used_ = false;
154}
155
156InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(
157    const string16& uni_string) {
158  // Break the terms down into individual terms (words), get the candidate
159  // set for each term, and intersect each to get a final candidate list.
160  // Note that a single 'term' from the user's perspective might be
161  // a string like "http://www.somewebsite.com" which, from our perspective,
162  // is four words: 'http', 'www', 'somewebsite', and 'com'.
163  HistoryIDSet history_id_set;
164  String16Set words = WordSetFromString16(uni_string);
165  bool first_word = true;
166  for (String16Set::iterator iter = words.begin();
167       iter != words.end(); ++iter) {
168    String16Set::value_type uni_word = *iter;
169    HistoryIDSet term_history_id_set =
170        InMemoryURLIndex::HistoryIDsForTerm(uni_word);
171    if (first_word) {
172      history_id_set.swap(term_history_id_set);
173      first_word = false;
174    } else {
175      HistoryIDSet old_history_id_set(history_id_set);
176      history_id_set.clear();
177      std::set_intersection(old_history_id_set.begin(),
178                            old_history_id_set.end(),
179                            term_history_id_set.begin(),
180                            term_history_id_set.end(),
181                            std::inserter(history_id_set,
182                                          history_id_set.begin()));
183    }
184  }
185  return history_id_set;
186}
187
188InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(
189    const string16& uni_word) {
190  InMemoryURLIndex::HistoryIDSet history_id_set;
191
192  // For each character in the word, get the char/word_id map entry and
193  // intersect with the set in an incremental manner.
194  Char16Set uni_chars = CharactersFromString16(uni_word);
195  WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));
196
197  // If any words resulted then we can compose a set of history IDs by unioning
198  // the sets from each word.
199  if (!word_id_set.empty()) {
200    for (WordIDSet::iterator word_id_iter = word_id_set.begin();
201         word_id_iter != word_id_set.end(); ++word_id_iter) {
202      WordID word_id = *word_id_iter;
203      WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
204      if (word_iter != word_id_history_map_.end()) {
205        HistoryIDSet& word_history_id_set(word_iter->second);
206        history_id_set.insert(word_history_id_set.begin(),
207                              word_history_id_set.end());
208      }
209    }
210  }
211
212  return history_id_set;
213}
214
215// Utility Functions
216
217// static
218InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(
219    const string16& uni_string) {
220  String16Set words;
221  WordIterator iter(&uni_string, WordIterator::BREAK_WORD);
222  if (iter.Init()) {
223    while (iter.Advance()) {
224      if (iter.IsWord())
225        words.insert(iter.GetWord());
226    }
227  }
228  return words;
229}
230
231// static
232InMemoryURLIndex::Char16Set InMemoryURLIndex::CharactersFromString16(
233    const string16& uni_word) {
234  Char16Set characters;
235  for (string16::const_iterator iter = uni_word.begin();
236       iter != uni_word.end(); ++iter)
237    characters.insert(*iter);
238  return characters;
239}
240
241void InMemoryURLIndex::AddWordToIndex(const string16& uni_word,
242                                      HistoryID history_id) {
243  WordMap::iterator word_pos = word_map_.find(uni_word);
244  if (word_pos != word_map_.end())
245    UpdateWordHistory(word_pos->second, history_id);
246  else
247    AddWordHistory(uni_word, history_id);
248}
249
250void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) {
251    WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
252    DCHECK(history_pos != word_id_history_map_.end());
253    HistoryIDSet& history_id_set(history_pos->second);
254    history_id_set.insert(history_id);
255}
256
257// Add a new word to the word list and the word map, and then create a
258// new entry in the word/history map.
259void InMemoryURLIndex::AddWordHistory(const string16& uni_word,
260                                      HistoryID history_id) {
261  word_list_.push_back(uni_word);
262  WordID word_id = word_list_.size() - 1;
263  word_map_.insert(std::make_pair(uni_word, word_id));
264  HistoryIDSet history_id_set;
265  history_id_set.insert(history_id);
266  word_id_history_map_.insert(std::make_pair(word_id, history_id_set));
267  // For each character in the newly added word (i.e. a word that is not
268  // already in the word index), add the word to the character index.
269  Char16Set characters = CharactersFromString16(uni_word);
270  for (Char16Set::iterator uni_char_iter = characters.begin();
271       uni_char_iter != characters.end(); ++uni_char_iter) {
272    Char16Set::value_type uni_string = *uni_char_iter;
273    CharWordIDMap::iterator char_iter = char_word_map_.find(uni_string);
274    if (char_iter != char_word_map_.end()) {
275      // Update existing entry in the char/word index.
276      WordIDSet& word_id_set(char_iter->second);
277      word_id_set.insert(word_id);
278    } else {
279      // Create a new entry in the char/word index.
280      WordIDSet word_id_set;
281      word_id_set.insert(word_id);
282      char_word_map_.insert(std::make_pair(uni_string, word_id_set));
283    }
284  }
285}
286
287InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars(
288    InMemoryURLIndex::Char16Set const& uni_chars) {
289  TermCharWordSet* best_term_char_word_set = NULL;
290  bool set_found = false;
291  size_t outstanding_count = 0;
292  Char16Set outstanding_chars;
293  for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin();
294       iter != term_char_word_set_cache_.end(); ++iter) {
295    TermCharWordSetVector::value_type& term_char_word_set(*iter);
296    Char16Set& char_set(term_char_word_set.char_set_);
297    Char16Set exclusions;
298    std::set_difference(char_set.begin(), char_set.end(),
299                        uni_chars.begin(), uni_chars.end(),
300                        std::inserter(exclusions, exclusions.begin()));
301    if (exclusions.empty()) {
302      // Do a reverse difference so that we know which characters remain
303      // to be indexed. Then decide if this is a better match than any
304      // previous cached set.
305      std::set_difference(uni_chars.begin(), uni_chars.end(),
306                          char_set.begin(), char_set.end(),
307                          std::inserter(exclusions, exclusions.begin()));
308      if (!set_found || exclusions.size() < outstanding_count) {
309        outstanding_chars = exclusions;
310        best_term_char_word_set = &*iter;
311        outstanding_count = exclusions.size();
312        set_found = true;
313      }
314    }
315  }
316
317  WordIDSet word_id_set;
318  if (set_found && outstanding_count == 0) {
319    // If there were no oustanding characters then we can use the cached one.
320    best_term_char_word_set->used_ = true;
321    word_id_set = best_term_char_word_set->word_id_set_;
322  } else {
323    // Some or all of the characters remain to be indexed.
324    bool first_character = true;
325    if (set_found) {
326      first_character = false;
327      word_id_set = best_term_char_word_set->word_id_set_;
328    } else {
329      outstanding_chars = uni_chars;
330    }
331
332    for (Char16Set::iterator uni_char_iter = outstanding_chars.begin();
333         uni_char_iter != outstanding_chars.end(); ++uni_char_iter) {
334      Char16Set::value_type uni_char = *uni_char_iter;
335      CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
336      if (char_iter == char_word_map_.end()) {
337        // The character was not found so bail.
338        word_id_set.clear();
339        break;
340      }
341      WordIDSet& char_word_id_set(char_iter->second);
342      if (first_character) {
343        word_id_set = char_word_id_set;
344        first_character = false;
345      } else {
346        WordIDSet old_word_id_set(word_id_set);
347        word_id_set.clear();
348        std::set_intersection(old_word_id_set.begin(),
349                              old_word_id_set.end(),
350                              char_word_id_set.begin(),
351                              char_word_id_set.end(),
352                              std::inserter(word_id_set,
353                              word_id_set.begin()));
354      }
355    }
356    // Update the cache.
357    if (!set_found || outstanding_count) {
358      term_char_word_set_cache_.push_back(TermCharWordSet(uni_chars,
359          word_id_set, true));
360    }
361  }
362  return word_id_set;
363}
364
365// static
366int InMemoryURLIndex::RawScoreForURL(const URLRow& row,
367                                     const String16Vector& terms,
368                                     size_t* first_term_location) {
369  GURL gurl = row.url();
370  if (!gurl.is_valid())
371    return 0;
372
373  string16 url = UTF8ToUTF16(gurl.spec());
374
375  // Collect all term start positions so we can see if they appear in order.
376  std::vector<size_t> term_locations;
377  int out_of_order = 0;  // Count the terms which are out of order.
378  size_t start_location_total = 0;
379  size_t term_length_total = 0;
380  for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();
381       ++iter) {
382    string16 term = *iter;
383    size_t term_location = url.find(term);
384    if (term_location == string16::npos)
385      return 0;  // A term was not found.  This should not happen.
386    if (iter != terms.begin()) {
387      // See if this term is out-of-order.
388      for (std::vector<size_t>::const_iterator order_iter =
389               term_locations.begin(); order_iter != term_locations.end();
390               ++order_iter) {
391        if (term_location <= *order_iter)
392          ++out_of_order;
393      }
394    } else {
395      *first_term_location = term_location;
396    }
397    term_locations.push_back(term_location);
398    start_location_total += term_location;
399    term_length_total += term.size();
400  }
401
402  // Calculate a raw score.
403  // TODO(mrossetti): This is good enough for now but must be fine-tuned.
404  const float kOrderMaxValue = 10.0;
405  float order_value = 10.0;
406  if (terms.size() > 1) {
407    int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2;
408    order_value =
409        (static_cast<float>(max_possible_out_of_order - out_of_order) /
410         max_possible_out_of_order) * kOrderMaxValue;
411  }
412
413  const float kStartMaxValue = 10.0;
414  const size_t kMaxSignificantStart = 20;
415  float start_value =
416      (static_cast<float>(kMaxSignificantStart -
417       std::min(kMaxSignificantStart, start_location_total / terms.size()))) /
418      static_cast<float>(kMaxSignificantStart) * kStartMaxValue;
419
420  const float kCompleteMaxValue = 10.0;
421  float complete_value =
422      (static_cast<float>(term_length_total) / static_cast<float>(url.size())) *
423      kStartMaxValue;
424
425  const float kLastVisitMaxValue = 10.0;
426  const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);
427  int64 delta_time = (kMaxSignificantDay -
428      std::min((base::Time::Now() - row.last_visit()),
429               kMaxSignificantDay)).ToInternalValue();
430  float last_visit_value =
431      (static_cast<float>(delta_time) /
432       static_cast<float>(kMaxSignificantDay.ToInternalValue())) *
433      kLastVisitMaxValue;
434
435  const float kVisitCountMaxValue = 10.0;
436  const int kMaxSignificantVisits = 10;
437  float visit_count_value =
438      (static_cast<float>(std::min(row.visit_count(),
439       kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) *
440      kVisitCountMaxValue;
441  float raw_score = order_value + start_value + complete_value +
442      last_visit_value + visit_count_value;
443
444  // Normalize the score.
445  const float kMaxNormalizedRawScore = 1000.0;
446  raw_score =
447      (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue +
448                    kLastVisitMaxValue + kVisitCountMaxValue)) *
449      kMaxNormalizedRawScore;
450  return static_cast<int>(raw_score);
451}
452
453// static
454Time InMemoryURLIndex::RecentThreshold() {
455  return Time::Now() - TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays);
456}
457
458void InMemoryURLIndex::AddHistoryMatch::operator()(
459    const InMemoryURLIndex::HistoryID history_id) {
460  HistoryInfoMap::const_iterator hist_pos =
461      index_.history_info_map_.find(history_id);
462  if (hist_pos != index_.history_info_map_.end()) {
463    const URLRow& hist_item = hist_pos->second;
464    // TODO(mrossetti): Accommodate multiple term highlighting.
465    size_t input_location = 0;
466    int score = InMemoryURLIndex::RawScoreForURL(
467        hist_item, lower_terms_, &input_location);
468    if (score != 0) {
469      // We only retain the top 10 highest scoring results so
470      // see if this one fits into the top 10 and, if so, where.
471      ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin();
472      while (scored_iter != scored_matches_.end() &&
473             (*scored_iter).raw_score > score)
474        ++scored_iter;
475      if ((scored_matches_.size() < 10) ||
476          (scored_iter != scored_matches_.end())) {
477        // Create and insert the new item.
478        // TODO(mrossetti): Properly set |match_in_scheme| and
479        // |innermost_match|.
480        bool match_in_scheme = false;
481        bool innermost_match = true;
482        ScoredHistoryMatch match(hist_item, input_location,
483            match_in_scheme, innermost_match, score);
484        if (!scored_matches_.empty())
485          scored_matches_.insert(scored_iter, match);
486        else
487          scored_matches_.push_back(match);
488        // Trim any entries beyond 10.
489        if (scored_matches_.size() > 10) {
490          scored_matches_.erase(scored_matches_.begin() + 10,
491                                scored_matches_.end());
492        }
493      }
494    }
495  }
496}
497
498}  // namespace history
499