in_memory_url_index.cc revision 513209b27ff55e2841eac0e4120199c23acce758
1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/history/in_memory_url_index.h"
6
7#include <algorithm>
8#include <limits>
9
10#include "app/l10n_util.h"
11#include "base/i18n/word_iterator.h"
12#include "base/string_util.h"
13#include "base/time.h"
14#include "base/utf_string_conversions.h"
15#include "chrome/browser/autocomplete/history_provider_util.h"
16#include "chrome/browser/history/url_database.h"
17#include "net/base/escape.h"
18#include "net/base/net_util.h"
19
20using base::Time;
21using base::TimeDelta;
22
23namespace history {
24
25ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {}
26
27ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info,
28                                       size_t input_location,
29                                       bool match_in_scheme,
30                                       bool innermost_match,
31                                       int score)
32    : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match),
33      raw_score(score) {
34}
35
36struct InMemoryURLIndex::TermCharWordSet {
37  TermCharWordSet(const Char16Set& char_set,
38                  const WordIDSet& word_id_set,
39                  bool used)
40      : char_set_(char_set),
41        word_id_set_(word_id_set),
42        used_(used) {}
43
44  bool IsNotUsed() const { return !used_; }
45
46  Char16Set char_set_;
47  WordIDSet word_id_set_;
48  bool used_;  // true if this set has been used for the current term search.
49};
50
51InMemoryURLIndex::InMemoryURLIndex() : history_item_count_(0) {}
52
53InMemoryURLIndex::~InMemoryURLIndex() {}
54
55static const int32_t kURLToLowerBufferSize = 2048;
56
57// Indexing
58
59bool InMemoryURLIndex::Init(history::URLDatabase* history_db,
60                            const std::string& languages) {
61  // TODO(mrossetti): Register for profile/language change notifications.
62  languages_ = languages;
63  // Reset our indexes.
64  char_word_map_.clear();
65  word_id_history_map_.clear();
66  if (!history_db)
67    return false;
68  URLDatabase::URLEnumerator history_enum;
69  if (history_db->InitURLEnumeratorForEverything(&history_enum)) {
70    URLRow row;
71    Time recent_threshold = InMemoryURLIndex::RecentThreshold();
72    while (history_enum.GetNextURL(&row)) {
73      // Do some filtering so that we only get history items which could
74      // possibly pass the HistoryURLProvider::CullPoorMatches filter later.
75      if ((row.typed_count() > kLowQualityMatchTypedLimit) ||
76          (row.visit_count() > kLowQualityMatchVisitLimit) ||
77          (row.last_visit() >= recent_threshold)) {
78        if (!IndexRow(row))
79          return false;
80      }
81    }
82  }
83  return true;
84}
85
86bool InMemoryURLIndex::IndexRow(const URLRow& row) {
87  const GURL& gurl(row.url());
88  string16 url(net::FormatUrl(gurl, languages_,
89      net::kFormatUrlOmitUsernamePassword,
90      UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS,
91      NULL, NULL, NULL));
92
93  // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max().
94  HistoryID history_id = static_cast<HistoryID>(row.id());
95
96  // Add the row for quick lookup in the history info store.
97  url = l10n_util::ToLower(url);
98  URLRow new_row(GURL(url), row.id());
99  new_row.set_visit_count(row.visit_count());
100  new_row.set_typed_count(row.typed_count());
101  new_row.set_last_visit(row.last_visit());
102  new_row.set_title(row.title());
103  history_info_map_.insert(std::make_pair(history_id, new_row));
104
105  // Split into individual, unique words.
106  String16Set words = WordSetFromString16(url);
107
108  // For each word, add a new entry into the word index referring to the
109  // associated history item.
110  for (String16Set::iterator iter = words.begin();
111       iter != words.end(); ++iter) {
112    String16Set::value_type uni_word = *iter;
113    AddWordToIndex(uni_word, history_id);
114  }
115  ++history_item_count_;
116  return true;
117}
118
119// Searching
120
121ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(
122    const String16Vector& terms) {
123  ScoredHistoryMatches scored_items;
124  if (!terms.empty()) {
125    // Reset used_ flags for term_char_word_set_cache_. We use a basic mark-
126    // and-sweep approach.
127    ResetTermCharWordSetCache();
128
129    // Lowercase the terms.
130    // TODO(mrossetti): Another opportunity for a transform algorithm.
131    String16Vector lower_terms;
132    for (String16Vector::const_iterator term_iter = terms.begin();
133         term_iter != terms.end(); ++term_iter) {
134      String16Vector::value_type lower_string(*term_iter);
135      std::transform(lower_string.begin(),
136                     lower_string.end(),
137                     lower_string.begin(),
138                     tolower);
139      lower_terms.push_back(lower_string);
140    }
141
142    String16Vector::value_type all_terms(JoinString(lower_terms, ' '));
143    HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);
144
145    // Pass over all of the candidates filtering out any without a proper
146    // substring match, inserting those which pass in order by score.
147    scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
148                                 AddHistoryMatch(*this,
149                                                 lower_terms)).ScoredMatches();
150  }
151
152  // Remove any stale TermCharWordSet's.
153  term_char_word_set_cache_.erase(
154      std::remove_if(term_char_word_set_cache_.begin(),
155                     term_char_word_set_cache_.end(),
156                     std::mem_fun_ref(&TermCharWordSet::IsNotUsed)),
157      term_char_word_set_cache_.end());
158  return scored_items;
159}
160
161void InMemoryURLIndex::ResetTermCharWordSetCache() {
162  // TODO(mrossetti): Consider keeping more of the cache around for possible
163  // repeat searches, except a 'shortcuts' approach might be better for that.
164  // TODO(mrossetti): Change TermCharWordSet to a class and use for_each.
165  for (TermCharWordSetVector::iterator iter =
166       term_char_word_set_cache_.begin();
167       iter != term_char_word_set_cache_.end(); ++iter)
168    iter->used_ = false;
169}
170
171InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(
172    const string16& uni_string) {
173  // Break the terms down into individual terms (words), get the candidate
174  // set for each term, and intersect each to get a final candidate list.
175  // Note that a single 'term' from the user's perspective might be
176  // a string like "http://www.somewebsite.com" which, from our perspective,
177  // is four words: 'http', 'www', 'somewebsite', and 'com'.
178  HistoryIDSet history_id_set;
179  String16Set words = WordSetFromString16(uni_string);
180  bool first_word = true;
181  for (String16Set::iterator iter = words.begin();
182       iter != words.end(); ++iter) {
183    String16Set::value_type uni_word = *iter;
184    HistoryIDSet term_history_id_set =
185        InMemoryURLIndex::HistoryIDsForTerm(uni_word);
186    if (first_word) {
187      history_id_set.swap(term_history_id_set);
188      first_word = false;
189    } else {
190      HistoryIDSet old_history_id_set(history_id_set);
191      history_id_set.clear();
192      std::set_intersection(old_history_id_set.begin(),
193                            old_history_id_set.end(),
194                            term_history_id_set.begin(),
195                            term_history_id_set.end(),
196                            std::inserter(history_id_set,
197                                          history_id_set.begin()));
198    }
199  }
200  return history_id_set;
201}
202
203InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(
204    const string16& uni_word) {
205  InMemoryURLIndex::HistoryIDSet history_id_set;
206
207  // For each character in the word, get the char/word_id map entry and
208  // intersect with the set in an incremental manner.
209  Char16Set uni_chars = CharactersFromString16(uni_word);
210  WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));
211
212  // If any words resulted then we can compose a set of history IDs by unioning
213  // the sets from each word.
214  if (!word_id_set.empty()) {
215    for (WordIDSet::iterator word_id_iter = word_id_set.begin();
216         word_id_iter != word_id_set.end(); ++word_id_iter) {
217      WordID word_id = *word_id_iter;
218      WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
219      if (word_iter != word_id_history_map_.end()) {
220        HistoryIDSet& word_history_id_set(word_iter->second);
221        history_id_set.insert(word_history_id_set.begin(),
222                              word_history_id_set.end());
223      }
224    }
225  }
226
227  return history_id_set;
228}
229
230// Utility Functions
231
232// static
233InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(
234    const string16& uni_string) {
235  String16Set words;
236  WordIterator iter(&uni_string, WordIterator::BREAK_WORD);
237  if (iter.Init()) {
238    while (iter.Advance()) {
239      if (iter.IsWord())
240        words.insert(iter.GetWord());
241    }
242  }
243  return words;
244}
245
246// static
247InMemoryURLIndex::Char16Set InMemoryURLIndex::CharactersFromString16(
248    const string16& uni_word) {
249  Char16Set characters;
250  for (string16::const_iterator iter = uni_word.begin();
251       iter != uni_word.end(); ++iter)
252    characters.insert(*iter);
253  return characters;
254}
255
256void InMemoryURLIndex::AddWordToIndex(const string16& uni_word,
257                                      HistoryID history_id) {
258  WordMap::iterator word_pos = word_map_.find(uni_word);
259  if (word_pos != word_map_.end())
260    UpdateWordHistory(word_pos->second, history_id);
261  else
262    AddWordHistory(uni_word, history_id);
263}
264
265void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) {
266    WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
267    DCHECK(history_pos != word_id_history_map_.end());
268    HistoryIDSet& history_id_set(history_pos->second);
269    history_id_set.insert(history_id);
270}
271
272// Add a new word to the word list and the word map, and then create a
273// new entry in the word/history map.
274void InMemoryURLIndex::AddWordHistory(const string16& uni_word,
275                                      HistoryID history_id) {
276  word_list_.push_back(uni_word);
277  WordID word_id = word_list_.size() - 1;
278  word_map_.insert(std::make_pair(uni_word, word_id));
279  HistoryIDSet history_id_set;
280  history_id_set.insert(history_id);
281  word_id_history_map_.insert(std::make_pair(word_id, history_id_set));
282  // For each character in the newly added word (i.e. a word that is not
283  // already in the word index), add the word to the character index.
284  Char16Set characters = CharactersFromString16(uni_word);
285  for (Char16Set::iterator uni_char_iter = characters.begin();
286       uni_char_iter != characters.end(); ++uni_char_iter) {
287    Char16Set::value_type uni_string = *uni_char_iter;
288    CharWordIDMap::iterator char_iter = char_word_map_.find(uni_string);
289    if (char_iter != char_word_map_.end()) {
290      // Update existing entry in the char/word index.
291      WordIDSet& word_id_set(char_iter->second);
292      word_id_set.insert(word_id);
293    } else {
294      // Create a new entry in the char/word index.
295      WordIDSet word_id_set;
296      word_id_set.insert(word_id);
297      char_word_map_.insert(std::make_pair(uni_string, word_id_set));
298    }
299  }
300}
301
302InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars(
303    InMemoryURLIndex::Char16Set const& uni_chars) {
304  TermCharWordSet* best_term_char_word_set = NULL;
305  bool set_found = false;
306  size_t outstanding_count = 0;
307  Char16Set outstanding_chars;
308  for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin();
309       iter != term_char_word_set_cache_.end(); ++iter) {
310    TermCharWordSetVector::value_type& term_char_word_set(*iter);
311    Char16Set& char_set(term_char_word_set.char_set_);
312    Char16Set exclusions;
313    std::set_difference(char_set.begin(), char_set.end(),
314                        uni_chars.begin(), uni_chars.end(),
315                        std::inserter(exclusions, exclusions.begin()));
316    if (exclusions.empty()) {
317      // Do a reverse difference so that we know which characters remain
318      // to be indexed. Then decide if this is a better match than any
319      // previous cached set.
320      std::set_difference(uni_chars.begin(), uni_chars.end(),
321                          char_set.begin(), char_set.end(),
322                          std::inserter(exclusions, exclusions.begin()));
323      if (!set_found || exclusions.size() < outstanding_count) {
324        outstanding_chars = exclusions;
325        best_term_char_word_set = &*iter;
326        outstanding_count = exclusions.size();
327        set_found = true;
328      }
329    }
330  }
331
332  WordIDSet word_id_set;
333  if (set_found && outstanding_count == 0) {
334    // If there were no oustanding characters then we can use the cached one.
335    best_term_char_word_set->used_ = true;
336    word_id_set = best_term_char_word_set->word_id_set_;
337  } else {
338    // Some or all of the characters remain to be indexed.
339    bool first_character = true;
340    if (set_found) {
341      first_character = false;
342      word_id_set = best_term_char_word_set->word_id_set_;
343    } else {
344      outstanding_chars = uni_chars;
345    }
346
347    for (Char16Set::iterator uni_char_iter = outstanding_chars.begin();
348         uni_char_iter != outstanding_chars.end(); ++uni_char_iter) {
349      Char16Set::value_type uni_char = *uni_char_iter;
350      CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
351      if (char_iter == char_word_map_.end()) {
352        // The character was not found so bail.
353        word_id_set.clear();
354        break;
355      }
356      WordIDSet& char_word_id_set(char_iter->second);
357      if (first_character) {
358        word_id_set = char_word_id_set;
359        first_character = false;
360      } else {
361        WordIDSet old_word_id_set(word_id_set);
362        word_id_set.clear();
363        std::set_intersection(old_word_id_set.begin(),
364                              old_word_id_set.end(),
365                              char_word_id_set.begin(),
366                              char_word_id_set.end(),
367                              std::inserter(word_id_set,
368                              word_id_set.begin()));
369      }
370    }
371    // Update the cache.
372    if (!set_found || outstanding_count) {
373      term_char_word_set_cache_.push_back(TermCharWordSet(uni_chars,
374          word_id_set, true));
375    }
376  }
377  return word_id_set;
378}
379
380// static
381int InMemoryURLIndex::RawScoreForURL(const URLRow& row,
382                                     const String16Vector& terms,
383                                     size_t* first_term_location) {
384  GURL gurl = row.url();
385  if (!gurl.is_valid())
386    return 0;
387
388  string16 url = UTF8ToUTF16(gurl.spec());
389
390  // Collect all term start positions so we can see if they appear in order.
391  std::vector<size_t> term_locations;
392  int out_of_order = 0;  // Count the terms which are out of order.
393  size_t start_location_total = 0;
394  size_t term_length_total = 0;
395  for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();
396       ++iter) {
397    string16 term = *iter;
398    size_t term_location = url.find(term);
399    if (term_location == string16::npos)
400      return 0;  // A term was not found.  This should not happen.
401    if (iter != terms.begin()) {
402      // See if this term is out-of-order.
403      for (std::vector<size_t>::const_iterator order_iter =
404               term_locations.begin(); order_iter != term_locations.end();
405               ++order_iter) {
406        if (term_location <= *order_iter)
407          ++out_of_order;
408      }
409    } else {
410      *first_term_location = term_location;
411    }
412    term_locations.push_back(term_location);
413    start_location_total += term_location;
414    term_length_total += term.size();
415  }
416
417  // Calculate a raw score.
418  // TODO(mrossetti): This is good enough for now but must be fine-tuned.
419  const float kOrderMaxValue = 10.0;
420  float order_value = 10.0;
421  if (terms.size() > 1) {
422    int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2;
423    order_value =
424        (static_cast<float>(max_possible_out_of_order - out_of_order) /
425         max_possible_out_of_order) * kOrderMaxValue;
426  }
427
428  const float kStartMaxValue = 10.0;
429  const size_t kMaxSignificantStart = 20;
430  float start_value =
431      (static_cast<float>(kMaxSignificantStart -
432       std::min(kMaxSignificantStart, start_location_total / terms.size()))) /
433      static_cast<float>(kMaxSignificantStart) * kStartMaxValue;
434
435  const float kCompleteMaxValue = 10.0;
436  float complete_value =
437      (static_cast<float>(term_length_total) / static_cast<float>(url.size())) *
438      kStartMaxValue;
439
440  const float kLastVisitMaxValue = 10.0;
441  const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);
442  int64 delta_time = (kMaxSignificantDay -
443      std::min((base::Time::Now() - row.last_visit()),
444               kMaxSignificantDay)).ToInternalValue();
445  float last_visit_value =
446      (static_cast<float>(delta_time) /
447       static_cast<float>(kMaxSignificantDay.ToInternalValue())) *
448      kLastVisitMaxValue;
449
450  const float kVisitCountMaxValue = 10.0;
451  const int kMaxSignificantVisits = 10;
452  float visit_count_value =
453      (static_cast<float>(std::min(row.visit_count(),
454       kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) *
455      kVisitCountMaxValue;
456
457  const float kTypedCountMaxValue = 20.0;
458  const int kMaxSignificantTyped = 10;
459  float typed_count_value =
460      (static_cast<float>(std::min(row.typed_count(),
461       kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) *
462      kTypedCountMaxValue;
463
464  float raw_score = order_value + start_value + complete_value +
465      last_visit_value + visit_count_value + typed_count_value;
466
467  // Normalize the score.
468  const float kMaxNormalizedRawScore = 1000.0;
469  raw_score =
470      (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue +
471                    kLastVisitMaxValue + kVisitCountMaxValue +
472                    kTypedCountMaxValue)) *
473      kMaxNormalizedRawScore;
474  return static_cast<int>(raw_score);
475}
476
477// static
478Time InMemoryURLIndex::RecentThreshold() {
479  return Time::Now() - TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays);
480}
481
482InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(
483    const InMemoryURLIndex& index,
484    const String16Vector& lower_terms)
485    : index_(index),
486      lower_terms_(lower_terms) {
487}
488
489InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}
490
491void InMemoryURLIndex::AddHistoryMatch::operator()(
492    const InMemoryURLIndex::HistoryID history_id) {
493  HistoryInfoMap::const_iterator hist_pos =
494      index_.history_info_map_.find(history_id);
495  if (hist_pos != index_.history_info_map_.end()) {
496    const URLRow& hist_item = hist_pos->second;
497    // TODO(mrossetti): Accommodate multiple term highlighting.
498    size_t input_location = 0;
499    int score = InMemoryURLIndex::RawScoreForURL(
500        hist_item, lower_terms_, &input_location);
501    if (score != 0) {
502      // We only retain the top 10 highest scoring results so
503      // see if this one fits into the top 10 and, if so, where.
504      ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin();
505      while (scored_iter != scored_matches_.end() &&
506             (*scored_iter).raw_score > score)
507        ++scored_iter;
508      if ((scored_matches_.size() < 10) ||
509          (scored_iter != scored_matches_.end())) {
510        // Create and insert the new item.
511        // TODO(mrossetti): Properly set |match_in_scheme| and
512        // |innermost_match|.
513        bool match_in_scheme = false;
514        bool innermost_match = true;
515        ScoredHistoryMatch match(hist_item, input_location,
516            match_in_scheme, innermost_match, score);
517        if (!scored_matches_.empty())
518          scored_matches_.insert(scored_iter, match);
519        else
520          scored_matches_.push_back(match);
521        // Trim any entries beyond 10.
522        if (scored_matches_.size() > 10) {
523          scored_matches_.erase(scored_matches_.begin() + 10,
524                                scored_matches_.end());
525        }
526      }
527    }
528  }
529}
530
531}  // namespace history
532