in_memory_url_index.cc revision 21d179b334e59e9a3bfcaed4c4430bef1bc5759d
1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/history/in_memory_url_index.h"
6
7#include <algorithm>
8#include <limits>
9
10#include "app/l10n_util.h"
11#include "base/i18n/break_iterator.h"
12#include "base/string_util.h"
13#include "base/time.h"
14#include "base/utf_string_conversions.h"
15#include "chrome/browser/autocomplete/history_provider_util.h"
16#include "chrome/browser/history/url_database.h"
17#include "net/base/escape.h"
18#include "net/base/net_util.h"
19#include "unicode/utypes.h"  // for int32_t
20
21using base::Time;
22using base::TimeDelta;
23
24namespace history {
25
26ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {}
27
28ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info,
29                                       size_t input_location,
30                                       bool match_in_scheme,
31                                       bool innermost_match,
32                                       int score)
33    : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match),
34      raw_score(score) {
35}
36
37struct InMemoryURLIndex::TermCharWordSet {
38  TermCharWordSet(const Char16Set& char_set,
39                  const WordIDSet& word_id_set,
40                  bool used)
41      : char_set_(char_set),
42        word_id_set_(word_id_set),
43        used_(used) {}
44
45  bool IsNotUsed() const { return !used_; }
46
47  Char16Set char_set_;
48  WordIDSet word_id_set_;
49  bool used_;  // true if this set has been used for the current term search.
50};
51
52InMemoryURLIndex::InMemoryURLIndex() : history_item_count_(0) {}
53
54InMemoryURLIndex::~InMemoryURLIndex() {}
55
56static const int32_t kURLToLowerBufferSize = 2048;
57
58// Indexing
59
60bool InMemoryURLIndex::Init(history::URLDatabase* history_db,
61                            const std::string& languages) {
62  // TODO(mrossetti): Register for profile/language change notifications.
63  languages_ = languages;
64  // Reset our indexes.
65  char_word_map_.clear();
66  word_id_history_map_.clear();
67  if (!history_db)
68    return false;
69  URLDatabase::URLEnumerator history_enum;
70  if (history_db->InitURLEnumeratorForEverything(&history_enum)) {
71    URLRow row;
72    Time recent_threshold = InMemoryURLIndex::RecentThreshold();
73    while (history_enum.GetNextURL(&row)) {
74      // Do some filtering so that we only get history items which could
75      // possibly pass the HistoryURLProvider::CullPoorMatches filter later.
76      if ((row.typed_count() > kLowQualityMatchTypedLimit) ||
77          (row.visit_count() > kLowQualityMatchVisitLimit) ||
78          (row.last_visit() >= recent_threshold)) {
79        if (!IndexRow(row))
80          return false;
81      }
82    }
83  }
84  return true;
85}
86
87bool InMemoryURLIndex::IndexRow(const URLRow& row) {
88  const GURL& gurl(row.url());
89  string16 url(net::FormatUrl(gurl, languages_,
90      net::kFormatUrlOmitUsernamePassword,
91      UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS,
92      NULL, NULL, NULL));
93
94  // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max().
95  HistoryID history_id = static_cast<HistoryID>(row.id());
96
97  // Add the row for quick lookup in the history info store.
98  url = l10n_util::ToLower(url);
99  URLRow new_row(GURL(url), row.id());
100  new_row.set_visit_count(row.visit_count());
101  new_row.set_typed_count(row.typed_count());
102  new_row.set_last_visit(row.last_visit());
103  new_row.set_title(row.title());
104  history_info_map_.insert(std::make_pair(history_id, new_row));
105
106  // Split into individual, unique words.
107  String16Set words = WordSetFromString16(url);
108
109  // For each word, add a new entry into the word index referring to the
110  // associated history item.
111  for (String16Set::iterator iter = words.begin();
112       iter != words.end(); ++iter) {
113    String16Set::value_type uni_word = *iter;
114    AddWordToIndex(uni_word, history_id);
115  }
116  ++history_item_count_;
117  return true;
118}
119
120// Searching
121
122ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(
123    const String16Vector& terms) {
124  ScoredHistoryMatches scored_items;
125  if (!terms.empty()) {
126    // Reset used_ flags for term_char_word_set_cache_. We use a basic mark-
127    // and-sweep approach.
128    ResetTermCharWordSetCache();
129
130    // Lowercase the terms.
131    // TODO(mrossetti): Another opportunity for a transform algorithm.
132    String16Vector lower_terms;
133    for (String16Vector::const_iterator term_iter = terms.begin();
134         term_iter != terms.end(); ++term_iter) {
135      String16Vector::value_type lower_string(*term_iter);
136      std::transform(lower_string.begin(),
137                     lower_string.end(),
138                     lower_string.begin(),
139                     tolower);
140      lower_terms.push_back(lower_string);
141    }
142
143    String16Vector::value_type all_terms(JoinString(lower_terms, ' '));
144    HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);
145
146    // Pass over all of the candidates filtering out any without a proper
147    // substring match, inserting those which pass in order by score.
148    scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
149                                 AddHistoryMatch(*this,
150                                                 lower_terms)).ScoredMatches();
151  }
152
153  // Remove any stale TermCharWordSet's.
154  term_char_word_set_cache_.erase(
155      std::remove_if(term_char_word_set_cache_.begin(),
156                     term_char_word_set_cache_.end(),
157                     std::mem_fun_ref(&TermCharWordSet::IsNotUsed)),
158      term_char_word_set_cache_.end());
159  return scored_items;
160}
161
162void InMemoryURLIndex::ResetTermCharWordSetCache() {
163  // TODO(mrossetti): Consider keeping more of the cache around for possible
164  // repeat searches, except a 'shortcuts' approach might be better for that.
165  // TODO(mrossetti): Change TermCharWordSet to a class and use for_each.
166  for (TermCharWordSetVector::iterator iter =
167       term_char_word_set_cache_.begin();
168       iter != term_char_word_set_cache_.end(); ++iter)
169    iter->used_ = false;
170}
171
172InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(
173    const string16& uni_string) {
174  // Break the terms down into individual terms (words), get the candidate
175  // set for each term, and intersect each to get a final candidate list.
176  // Note that a single 'term' from the user's perspective might be
177  // a string like "http://www.somewebsite.com" which, from our perspective,
178  // is four words: 'http', 'www', 'somewebsite', and 'com'.
179  HistoryIDSet history_id_set;
180  String16Set words = WordSetFromString16(uni_string);
181  bool first_word = true;
182  for (String16Set::iterator iter = words.begin();
183       iter != words.end(); ++iter) {
184    String16Set::value_type uni_word = *iter;
185    HistoryIDSet term_history_id_set =
186        InMemoryURLIndex::HistoryIDsForTerm(uni_word);
187    if (first_word) {
188      history_id_set.swap(term_history_id_set);
189      first_word = false;
190    } else {
191      HistoryIDSet old_history_id_set(history_id_set);
192      history_id_set.clear();
193      std::set_intersection(old_history_id_set.begin(),
194                            old_history_id_set.end(),
195                            term_history_id_set.begin(),
196                            term_history_id_set.end(),
197                            std::inserter(history_id_set,
198                                          history_id_set.begin()));
199    }
200  }
201  return history_id_set;
202}
203
204InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(
205    const string16& uni_word) {
206  InMemoryURLIndex::HistoryIDSet history_id_set;
207
208  // For each character in the word, get the char/word_id map entry and
209  // intersect with the set in an incremental manner.
210  Char16Set uni_chars = CharactersFromString16(uni_word);
211  WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));
212
213  // If any words resulted then we can compose a set of history IDs by unioning
214  // the sets from each word.
215  if (!word_id_set.empty()) {
216    for (WordIDSet::iterator word_id_iter = word_id_set.begin();
217         word_id_iter != word_id_set.end(); ++word_id_iter) {
218      WordID word_id = *word_id_iter;
219      WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
220      if (word_iter != word_id_history_map_.end()) {
221        HistoryIDSet& word_history_id_set(word_iter->second);
222        history_id_set.insert(word_history_id_set.begin(),
223                              word_history_id_set.end());
224      }
225    }
226  }
227
228  return history_id_set;
229}
230
231// Utility Functions
232
233// static
234InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(
235    const string16& uni_string) {
236  String16Set words;
237  base::BreakIterator iter(&uni_string, base::BreakIterator::BREAK_WORD);
238  if (iter.Init()) {
239    while (iter.Advance()) {
240      if (iter.IsWord())
241        words.insert(iter.GetString());
242    }
243  }
244  return words;
245}
246
247// static
248InMemoryURLIndex::Char16Set InMemoryURLIndex::CharactersFromString16(
249    const string16& uni_word) {
250  Char16Set characters;
251  for (string16::const_iterator iter = uni_word.begin();
252       iter != uni_word.end(); ++iter)
253    characters.insert(*iter);
254  return characters;
255}
256
257void InMemoryURLIndex::AddWordToIndex(const string16& uni_word,
258                                      HistoryID history_id) {
259  WordMap::iterator word_pos = word_map_.find(uni_word);
260  if (word_pos != word_map_.end())
261    UpdateWordHistory(word_pos->second, history_id);
262  else
263    AddWordHistory(uni_word, history_id);
264}
265
266void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) {
267    WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
268    DCHECK(history_pos != word_id_history_map_.end());
269    HistoryIDSet& history_id_set(history_pos->second);
270    history_id_set.insert(history_id);
271}
272
273// Add a new word to the word list and the word map, and then create a
274// new entry in the word/history map.
275void InMemoryURLIndex::AddWordHistory(const string16& uni_word,
276                                      HistoryID history_id) {
277  word_list_.push_back(uni_word);
278  WordID word_id = word_list_.size() - 1;
279  word_map_.insert(std::make_pair(uni_word, word_id));
280  HistoryIDSet history_id_set;
281  history_id_set.insert(history_id);
282  word_id_history_map_.insert(std::make_pair(word_id, history_id_set));
283  // For each character in the newly added word (i.e. a word that is not
284  // already in the word index), add the word to the character index.
285  Char16Set characters = CharactersFromString16(uni_word);
286  for (Char16Set::iterator uni_char_iter = characters.begin();
287       uni_char_iter != characters.end(); ++uni_char_iter) {
288    Char16Set::value_type uni_string = *uni_char_iter;
289    CharWordIDMap::iterator char_iter = char_word_map_.find(uni_string);
290    if (char_iter != char_word_map_.end()) {
291      // Update existing entry in the char/word index.
292      WordIDSet& word_id_set(char_iter->second);
293      word_id_set.insert(word_id);
294    } else {
295      // Create a new entry in the char/word index.
296      WordIDSet word_id_set;
297      word_id_set.insert(word_id);
298      char_word_map_.insert(std::make_pair(uni_string, word_id_set));
299    }
300  }
301}
302
303InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars(
304    InMemoryURLIndex::Char16Set const& uni_chars) {
305  TermCharWordSet* best_term_char_word_set = NULL;
306  bool set_found = false;
307  size_t outstanding_count = 0;
308  Char16Set outstanding_chars;
309  for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin();
310       iter != term_char_word_set_cache_.end(); ++iter) {
311    TermCharWordSetVector::value_type& term_char_word_set(*iter);
312    Char16Set& char_set(term_char_word_set.char_set_);
313    Char16Set exclusions;
314    std::set_difference(char_set.begin(), char_set.end(),
315                        uni_chars.begin(), uni_chars.end(),
316                        std::inserter(exclusions, exclusions.begin()));
317    if (exclusions.empty()) {
318      // Do a reverse difference so that we know which characters remain
319      // to be indexed. Then decide if this is a better match than any
320      // previous cached set.
321      std::set_difference(uni_chars.begin(), uni_chars.end(),
322                          char_set.begin(), char_set.end(),
323                          std::inserter(exclusions, exclusions.begin()));
324      if (!set_found || exclusions.size() < outstanding_count) {
325        outstanding_chars = exclusions;
326        best_term_char_word_set = &*iter;
327        outstanding_count = exclusions.size();
328        set_found = true;
329      }
330    }
331  }
332
333  WordIDSet word_id_set;
334  if (set_found && outstanding_count == 0) {
335    // If there were no oustanding characters then we can use the cached one.
336    best_term_char_word_set->used_ = true;
337    word_id_set = best_term_char_word_set->word_id_set_;
338  } else {
339    // Some or all of the characters remain to be indexed.
340    bool first_character = true;
341    if (set_found) {
342      first_character = false;
343      word_id_set = best_term_char_word_set->word_id_set_;
344    } else {
345      outstanding_chars = uni_chars;
346    }
347
348    for (Char16Set::iterator uni_char_iter = outstanding_chars.begin();
349         uni_char_iter != outstanding_chars.end(); ++uni_char_iter) {
350      Char16Set::value_type uni_char = *uni_char_iter;
351      CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
352      if (char_iter == char_word_map_.end()) {
353        // The character was not found so bail.
354        word_id_set.clear();
355        break;
356      }
357      WordIDSet& char_word_id_set(char_iter->second);
358      if (first_character) {
359        word_id_set = char_word_id_set;
360        first_character = false;
361      } else {
362        WordIDSet old_word_id_set(word_id_set);
363        word_id_set.clear();
364        std::set_intersection(old_word_id_set.begin(),
365                              old_word_id_set.end(),
366                              char_word_id_set.begin(),
367                              char_word_id_set.end(),
368                              std::inserter(word_id_set,
369                              word_id_set.begin()));
370      }
371    }
372    // Update the cache.
373    if (!set_found || outstanding_count) {
374      term_char_word_set_cache_.push_back(TermCharWordSet(uni_chars,
375          word_id_set, true));
376    }
377  }
378  return word_id_set;
379}
380
381// static
382int InMemoryURLIndex::RawScoreForURL(const URLRow& row,
383                                     const String16Vector& terms,
384                                     size_t* first_term_location) {
385  GURL gurl = row.url();
386  if (!gurl.is_valid())
387    return 0;
388
389  string16 url = UTF8ToUTF16(gurl.spec());
390
391  // Collect all term start positions so we can see if they appear in order.
392  std::vector<size_t> term_locations;
393  int out_of_order = 0;  // Count the terms which are out of order.
394  size_t start_location_total = 0;
395  size_t term_length_total = 0;
396  for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();
397       ++iter) {
398    string16 term = *iter;
399    size_t term_location = url.find(term);
400    if (term_location == string16::npos)
401      return 0;  // A term was not found.  This should not happen.
402    if (iter != terms.begin()) {
403      // See if this term is out-of-order.
404      for (std::vector<size_t>::const_iterator order_iter =
405               term_locations.begin(); order_iter != term_locations.end();
406               ++order_iter) {
407        if (term_location <= *order_iter)
408          ++out_of_order;
409      }
410    } else {
411      *first_term_location = term_location;
412    }
413    term_locations.push_back(term_location);
414    start_location_total += term_location;
415    term_length_total += term.size();
416  }
417
418  // Calculate a raw score.
419  // TODO(mrossetti): This is good enough for now but must be fine-tuned.
420  const float kOrderMaxValue = 10.0;
421  float order_value = 10.0;
422  if (terms.size() > 1) {
423    int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2;
424    order_value =
425        (static_cast<float>(max_possible_out_of_order - out_of_order) /
426         max_possible_out_of_order) * kOrderMaxValue;
427  }
428
429  const float kStartMaxValue = 10.0;
430  const size_t kMaxSignificantStart = 20;
431  float start_value =
432      (static_cast<float>(kMaxSignificantStart -
433       std::min(kMaxSignificantStart, start_location_total / terms.size()))) /
434      static_cast<float>(kMaxSignificantStart) * kStartMaxValue;
435
436  const float kCompleteMaxValue = 10.0;
437  float complete_value =
438      (static_cast<float>(term_length_total) / static_cast<float>(url.size())) *
439      kStartMaxValue;
440
441  const float kLastVisitMaxValue = 10.0;
442  const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);
443  int64 delta_time = (kMaxSignificantDay -
444      std::min((base::Time::Now() - row.last_visit()),
445               kMaxSignificantDay)).ToInternalValue();
446  float last_visit_value =
447      (static_cast<float>(delta_time) /
448       static_cast<float>(kMaxSignificantDay.ToInternalValue())) *
449      kLastVisitMaxValue;
450
451  const float kVisitCountMaxValue = 10.0;
452  const int kMaxSignificantVisits = 10;
453  float visit_count_value =
454      (static_cast<float>(std::min(row.visit_count(),
455       kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) *
456      kVisitCountMaxValue;
457
458  const float kTypedCountMaxValue = 20.0;
459  const int kMaxSignificantTyped = 10;
460  float typed_count_value =
461      (static_cast<float>(std::min(row.typed_count(),
462       kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) *
463      kTypedCountMaxValue;
464
465  float raw_score = order_value + start_value + complete_value +
466      last_visit_value + visit_count_value + typed_count_value;
467
468  // Normalize the score.
469  const float kMaxNormalizedRawScore = 1000.0;
470  raw_score =
471      (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue +
472                    kLastVisitMaxValue + kVisitCountMaxValue +
473                    kTypedCountMaxValue)) *
474      kMaxNormalizedRawScore;
475  return static_cast<int>(raw_score);
476}
477
478// static
479Time InMemoryURLIndex::RecentThreshold() {
480  return Time::Now() - TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays);
481}
482
483InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(
484    const InMemoryURLIndex& index,
485    const String16Vector& lower_terms)
486    : index_(index),
487      lower_terms_(lower_terms) {
488}
489
490InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}
491
492void InMemoryURLIndex::AddHistoryMatch::operator()(
493    const InMemoryURLIndex::HistoryID history_id) {
494  HistoryInfoMap::const_iterator hist_pos =
495      index_.history_info_map_.find(history_id);
496  if (hist_pos != index_.history_info_map_.end()) {
497    const URLRow& hist_item = hist_pos->second;
498    // TODO(mrossetti): Accommodate multiple term highlighting.
499    size_t input_location = 0;
500    int score = InMemoryURLIndex::RawScoreForURL(
501        hist_item, lower_terms_, &input_location);
502    if (score != 0) {
503      // We only retain the top 10 highest scoring results so
504      // see if this one fits into the top 10 and, if so, where.
505      ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin();
506      while (scored_iter != scored_matches_.end() &&
507             (*scored_iter).raw_score > score)
508        ++scored_iter;
509      if ((scored_matches_.size() < 10) ||
510          (scored_iter != scored_matches_.end())) {
511        // Create and insert the new item.
512        // TODO(mrossetti): Properly set |match_in_scheme| and
513        // |innermost_match|.
514        bool match_in_scheme = false;
515        bool innermost_match = true;
516        ScoredHistoryMatch match(hist_item, input_location,
517            match_in_scheme, innermost_match, score);
518        if (!scored_matches_.empty())
519          scored_matches_.insert(scored_iter, match);
520        else
521          scored_matches_.push_back(match);
522        // Trim any entries beyond 10.
523        if (scored_matches_.size() > 10) {
524          scored_matches_.erase(scored_matches_.begin() + 10,
525                                scored_matches_.end());
526        }
527      }
528    }
529  }
530}
531
532}  // namespace history
533