1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/history/in_memory_url_index.h"
6
7#include <algorithm>
8#include <iterator>
9#include <limits>
10#include <numeric>
11
12#include "base/file_util.h"
13#include "base/i18n/break_iterator.h"
14#include "base/metrics/histogram.h"
15#include "base/string_util.h"
16#include "base/time.h"
17#include "base/utf_string_conversions.h"
18#include "chrome/browser/autocomplete/autocomplete.h"
19#include "chrome/browser/autocomplete/history_provider_util.h"
20#include "chrome/browser/history/url_database.h"
21#include "chrome/browser/profiles/profile.h"
22#include "chrome/common/url_constants.h"
23#include "googleurl/src/url_util.h"
24#include "net/base/escape.h"
25#include "net/base/net_util.h"
26#include "third_party/protobuf/src/google/protobuf/repeated_field.h"
27#include "ui/base/l10n/l10n_util.h"
28
29using google::protobuf::RepeatedField;
30using google::protobuf::RepeatedPtrField;
31using in_memory_url_index::InMemoryURLIndexCacheItem;
32
33namespace history {
34
35typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
36typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;
37typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
38typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;
39typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry
40    CharWordMapEntry;
41typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
42    WordIDHistoryMapItem;
43typedef imui::
44    InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
45    WordIDHistoryMapEntry;
46typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
47typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
48    HistoryInfoMapEntry;
49
50const size_t InMemoryURLIndex::kNoCachedResultForTerm = -1;
51
52// Score ranges used to get a 'base' score for each of the scoring factors
53// (such as recency of last visit, times visited, times the URL was typed,
54// and the quality of the string match). There is a matching value range for
55// each of these scores for each factor.
56const int kScoreRank[] = { 1425, 1200, 900, 400 };
57
58ScoredHistoryMatch::ScoredHistoryMatch()
59    : raw_score(0),
60      prefix_adjust(0) {}
61
62ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info)
63    : HistoryMatch(url_info, 0, false, false),
64      raw_score(0),
65      prefix_adjust(0) {}
66
67ScoredHistoryMatch::~ScoredHistoryMatch() {}
68
69// Comparison function for sorting ScoredMatches by their scores.
70bool ScoredHistoryMatch::MatchScoreGreater(const ScoredHistoryMatch& m1,
71                                           const ScoredHistoryMatch& m2) {
72  return m1.raw_score >= m2.raw_score;
73}
74
75struct InMemoryURLIndex::TermCharWordSet {
76  TermCharWordSet()  // Required for STL resize().
77      : char_(0),
78        word_id_set_(),
79        used_(false) {}
80  TermCharWordSet(const char16& uni_char,
81                  const WordIDSet& word_id_set,
82                  bool used)
83      : char_(uni_char),
84        word_id_set_(word_id_set),
85        used_(used) {}
86
87  // Predicate for STL algorithm use.
88  bool is_not_used() const { return !used_; }
89
90  char16 char_;
91  WordIDSet word_id_set_;
92  bool used_;  // true if this set has been used for the current term search.
93};
94
95// Comparison function for sorting TermMatches by their offsets.
96bool MatchOffsetLess(const TermMatch& m1, const TermMatch& m2) {
97  return m1.offset < m2.offset;
98}
99
100// std::accumulate helper function to add up TermMatches' lengths.
101int AccumulateMatchLength(int total, const TermMatch& match) {
102  return total + match.length;
103}
104
105// Converts a raw value for some particular scoring factor into a score
106// component for that factor.  The conversion function is piecewise linear, with
107// input values provided in |value_ranks| and resulting output scores from
108// |kScoreRank| (mathematically, f(value_rank[i]) = kScoreRank[i]).  A score
109// cannot be higher than kScoreRank[0], and drops directly to 0 if lower than
110// kScoreRank[3].
111//
112// For example, take |value| == 70 and |value_ranks| == { 100, 50, 30, 10 }.
113// Because 70 falls between ranks 0 (100) and 1 (50), the score is given by the
114// linear function:
115//   score = m * value + b, where
116//   m = (kScoreRank[0] - kScoreRank[1]) / (value_ranks[0] - value_ranks[1])
117//   b = value_ranks[1]
118// Any value higher than 100 would be scored as if it were 100, and any value
119// lower than 10 scored 0.
120int ScoreForValue(int value, const int* value_ranks) {
121  int i = 0;
122  int rank_count = arraysize(kScoreRank);
123  while ((i < rank_count) && ((value_ranks[0] < value_ranks[1]) ?
124         (value > value_ranks[i]) : (value < value_ranks[i])))
125    ++i;
126  if (i >= rank_count)
127    return 0;
128  int score = kScoreRank[i];
129  if (i > 0) {
130    score += (value - value_ranks[i]) *
131        (kScoreRank[i - 1] - kScoreRank[i]) /
132        (value_ranks[i - 1] - value_ranks[i]);
133  }
134  return score;
135}
136
137InMemoryURLIndex::InMemoryURLIndex(const FilePath& history_dir)
138    : history_dir_(history_dir),
139      history_item_count_(0) {
140}
141
142// Called only by unit tests.
143InMemoryURLIndex::InMemoryURLIndex()
144    : history_item_count_(0) {
145}
146
147InMemoryURLIndex::~InMemoryURLIndex() {}
148
149// Indexing
150
151bool InMemoryURLIndex::Init(history::URLDatabase* history_db,
152                            const std::string& languages) {
153  // TODO(mrossetti): Register for profile/language change notifications.
154  languages_ = languages;
155  return ReloadFromHistory(history_db, false);
156}
157
158void InMemoryURLIndex::ShutDown() {
159  // Write our cache.
160  SaveToCacheFile();
161}
162
163bool InMemoryURLIndex::IndexRow(const URLRow& row) {
164  const GURL& gurl(row.url());
165  string16 url(net::FormatUrl(gurl, languages_,
166      net::kFormatUrlOmitUsernamePassword,
167      UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS,
168      NULL, NULL, NULL));
169
170  HistoryID history_id = static_cast<HistoryID>(row.id());
171  DCHECK_LT(row.id(), std::numeric_limits<HistoryID>::max());
172
173  // Add the row for quick lookup in the history info store.
174  URLRow new_row(GURL(url), row.id());
175  new_row.set_visit_count(row.visit_count());
176  new_row.set_typed_count(row.typed_count());
177  new_row.set_last_visit(row.last_visit());
178  new_row.set_title(row.title());
179  history_info_map_[history_id] = new_row;
180
181  // Split URL into individual, unique words then add in the title words.
182  url = l10n_util::ToLower(url);
183  String16Set url_words = WordSetFromString16(url);
184  String16Set title_words = WordSetFromString16(row.title());
185  String16Set words;
186  std::set_union(url_words.begin(), url_words.end(),
187                 title_words.begin(), title_words.end(),
188                 std::insert_iterator<String16Set>(words, words.begin()));
189  for (String16Set::iterator word_iter = words.begin();
190       word_iter != words.end(); ++word_iter)
191    AddWordToIndex(*word_iter, history_id);
192
193  ++history_item_count_;
194  return true;
195}
196
197bool InMemoryURLIndex::ReloadFromHistory(history::URLDatabase* history_db,
198                                         bool clear_cache) {
199  ClearPrivateData();
200
201  if (!history_db)
202    return false;
203
204  if (clear_cache || !RestoreFromCacheFile()) {
205    base::TimeTicks beginning_time = base::TimeTicks::Now();
206    // The index has to be built from scratch.
207    URLDatabase::URLEnumerator history_enum;
208    if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
209      return false;
210    URLRow row;
211    while (history_enum.GetNextURL(&row)) {
212      if (!IndexRow(row))
213        return false;
214    }
215    UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
216                        base::TimeTicks::Now() - beginning_time);
217    SaveToCacheFile();
218  }
219  return true;
220}
221
222void InMemoryURLIndex::ClearPrivateData() {
223  history_item_count_ = 0;
224  word_list_.clear();
225  word_map_.clear();
226  char_word_map_.clear();
227  word_id_history_map_.clear();
228  term_char_word_set_cache_.clear();
229  history_info_map_.clear();
230}
231
232bool InMemoryURLIndex::RestoreFromCacheFile() {
233  // TODO(mrossetti): Figure out how to determine if the cache is up-to-date.
234  // That is: ensure that the database has not been modified since the cache
235  // was last saved. DB file modification date is inadequate. There are no
236  // SQLite table checksums automatically stored.
237  base::TimeTicks beginning_time = base::TimeTicks::Now();
238  FilePath file_path;
239  if (!GetCacheFilePath(&file_path) || !file_util::PathExists(file_path))
240    return false;
241  std::string data;
242  if (!file_util::ReadFileToString(file_path, &data)) {
243    LOG(WARNING) << "Failed to read InMemoryURLIndex cache from "
244                 << file_path.value();
245    return false;
246  }
247
248  InMemoryURLIndexCacheItem index_cache;
249  if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
250    LOG(WARNING) << "Failed to parse InMemoryURLIndex cache data read from "
251                 << file_path.value();
252    return false;
253  }
254
255  if (!RestorePrivateData(index_cache)) {
256    ClearPrivateData();  // Back to square one -- must build from scratch.
257    return false;
258  }
259
260  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
261                      base::TimeTicks::Now() - beginning_time);
262  UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", history_item_count_);
263  UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
264  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", word_map_.size());
265  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", char_word_map_.size());
266  return true;
267}
268
269bool InMemoryURLIndex::SaveToCacheFile() {
270  base::TimeTicks beginning_time = base::TimeTicks::Now();
271  InMemoryURLIndexCacheItem index_cache;
272  SavePrivateData(&index_cache);
273  std::string data;
274  if (!index_cache.SerializeToString(&data)) {
275    LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
276    return false;
277  }
278
279  // Write the cache to a file then swap it for the old cache, if any, and
280  // delete the old cache.
281  FilePath file_path;
282  if (!GetCacheFilePath(&file_path))
283    return false;
284  file_util::ScopedFILE file(file_util::OpenFile(file_path, "w"));
285  if (!file.get())
286    return false;
287
288  int size = data.size();
289  if (file_util::WriteFile(file_path, data.c_str(), size) != size) {
290    LOG(WARNING) << "Failed to write " << file_path.value();
291    return false;
292  }
293  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
294                      base::TimeTicks::Now() - beginning_time);
295  return true;
296}
297
298void InMemoryURLIndex::UpdateURL(URLID row_id, const URLRow& row) {
299  // The row may or may not already be in our index. If it is not already
300  // indexed and it qualifies then it gets indexed. If it is already
301  // indexed and still qualifies then it gets updated, otherwise it
302  // is deleted from the index.
303  HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
304  if (row_pos == history_info_map_.end()) {
305    // This new row should be indexed if it qualifies.
306    if (RowQualifiesAsSignificant(row, base::Time()))
307      IndexRow(row);
308  } else if (RowQualifiesAsSignificant(row, base::Time())) {
309    // This indexed row still qualifies and will be re-indexed.
310    // The url won't have changed but the title, visit count, etc.
311    // might have changed.
312    URLRow& old_row = row_pos->second;
313    old_row.set_visit_count(row.visit_count());
314    old_row.set_typed_count(row.typed_count());
315    old_row.set_last_visit(row.last_visit());
316    // TODO(mrossetti): When we start indexing the title the next line
317    // will need attention.
318    old_row.set_title(row.title());
319  } else {
320    // This indexed row no longer qualifies and will be de-indexed.
321    history_info_map_.erase(row_id);
322  }
323  // This invalidates the cache.
324  term_char_word_set_cache_.clear();
325  // TODO(mrossetti): Record this transaction in the cache.
326}
327
328void InMemoryURLIndex::DeleteURL(URLID row_id) {
329  // Note that this does not remove any reference to this row from the
330  // word_id_history_map_. That map will continue to contain (and return)
331  // hits against this row until that map is rebuilt, but since the
332  // history_info_map_ no longer references the row no erroneous results
333  // will propagate to the user.
334  history_info_map_.erase(row_id);
335  // This invalidates the word cache.
336  term_char_word_set_cache_.clear();
337  // TODO(mrossetti): Record this transaction in the cache.
338}
339
340// Searching
341
342ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(
343    const String16Vector& terms) {
344  ScoredHistoryMatches scored_items;
345  if (!terms.empty()) {
346    // Reset used_ flags for term_char_word_set_cache_. We use a basic mark-
347    // and-sweep approach.
348    ResetTermCharWordSetCache();
349
350    // Lowercase the terms.
351    // TODO(mrossetti): Another opportunity for a transform algorithm.
352    String16Vector lower_terms;
353    for (String16Vector::const_iterator term_iter = terms.begin();
354         term_iter != terms.end(); ++term_iter)
355      lower_terms.push_back(l10n_util::ToLower(*term_iter));
356
357    String16Vector::value_type all_terms(JoinString(lower_terms, ' '));
358    HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);
359
360    // Don't perform any scoring (and don't return any matches) if the
361    // candidate pool is large. (See comments in header.)
362    const size_t kItemsToScoreLimit = 500;
363    if (history_id_set.size() <= kItemsToScoreLimit) {
364      // Pass over all of the candidates filtering out any without a proper
365      // substring match, inserting those which pass in order by score.
366      scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
367          AddHistoryMatch(*this, lower_terms)).ScoredMatches();
368
369      // Select and sort only the top kMaxMatches results.
370      if (scored_items.size() > AutocompleteProvider::kMaxMatches) {
371        std::partial_sort(scored_items.begin(),
372                          scored_items.begin() +
373                              AutocompleteProvider::kMaxMatches,
374                          scored_items.end(),
375                          ScoredHistoryMatch::MatchScoreGreater);
376          scored_items.resize(AutocompleteProvider::kMaxMatches);
377      } else {
378        std::sort(scored_items.begin(), scored_items.end(),
379                  ScoredHistoryMatch::MatchScoreGreater);
380      }
381    }
382  }
383
384  // Remove any stale TermCharWordSet's.
385  term_char_word_set_cache_.erase(
386      std::remove_if(term_char_word_set_cache_.begin(),
387                     term_char_word_set_cache_.end(),
388                     std::mem_fun_ref(&TermCharWordSet::is_not_used)),
389      term_char_word_set_cache_.end());
390  return scored_items;
391}
392
393void InMemoryURLIndex::ResetTermCharWordSetCache() {
394  // TODO(mrossetti): Consider keeping more of the cache around for possible
395  // repeat searches, except a 'shortcuts' approach might be better for that.
396  // TODO(mrossetti): Change TermCharWordSet to a class and use for_each.
397  for (TermCharWordSetVector::iterator iter =
398       term_char_word_set_cache_.begin();
399       iter != term_char_word_set_cache_.end(); ++iter)
400    iter->used_ = false;
401}
402
403InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(
404    const string16& uni_string) {
405  // Break the terms down into individual terms (words), get the candidate
406  // set for each term, and intersect each to get a final candidate list.
407  // Note that a single 'term' from the user's perspective might be
408  // a string like "http://www.somewebsite.com" which, from our perspective,
409  // is four words: 'http', 'www', 'somewebsite', and 'com'.
410  HistoryIDSet history_id_set;
411  String16Set words = WordSetFromString16(uni_string);
412  bool first_word = true;
413  for (String16Set::iterator iter = words.begin();
414       iter != words.end(); ++iter) {
415    String16Set::value_type uni_word = *iter;
416    HistoryIDSet term_history_id_set =
417        InMemoryURLIndex::HistoryIDsForTerm(uni_word);
418    if (first_word) {
419      history_id_set.swap(term_history_id_set);
420      first_word = false;
421    } else {
422      HistoryIDSet old_history_id_set(history_id_set);
423      history_id_set.clear();
424      std::set_intersection(old_history_id_set.begin(),
425                            old_history_id_set.end(),
426                            term_history_id_set.begin(),
427                            term_history_id_set.end(),
428                            std::inserter(history_id_set,
429                                          history_id_set.begin()));
430    }
431  }
432  return history_id_set;
433}
434
435InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(
436    const string16& uni_word) {
437  InMemoryURLIndex::HistoryIDSet history_id_set;
438
439  // For each unique character in the word, in order of first appearance, get
440  // the char/word_id map entry and intersect with the set in an incremental
441  // manner.
442  Char16Vector uni_chars = Char16VectorFromString16(uni_word);
443  WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));
444
445  // TODO(mrossetti): At this point, as a possible optimization, we could
446  // scan through all candidate words and make sure the |uni_word| is a
447  // substring within the candidate words, eliminating those which aren't.
448  // I'm not sure it would be worth the effort. And remember, we've got to do
449  // a final substring match in order to get the highlighting ranges later
450  // in the process in any case.
451
452  // If any words resulted then we can compose a set of history IDs by unioning
453  // the sets from each word.
454  if (!word_id_set.empty()) {
455    for (WordIDSet::iterator word_id_iter = word_id_set.begin();
456         word_id_iter != word_id_set.end(); ++word_id_iter) {
457      WordID word_id = *word_id_iter;
458      WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
459      if (word_iter != word_id_history_map_.end()) {
460        HistoryIDSet& word_history_id_set(word_iter->second);
461        history_id_set.insert(word_history_id_set.begin(),
462                              word_history_id_set.end());
463      }
464    }
465  }
466
467  return history_id_set;
468}
469
470// Utility Functions
471
472// static
473InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(
474    const string16& uni_string) {
475  String16Vector words = WordVectorFromString16(uni_string, false);
476  String16Set word_set;
477  for (String16Vector::const_iterator iter = words.begin(); iter != words.end();
478       ++iter)
479    word_set.insert(l10n_util::ToLower(*iter));
480  return word_set;
481}
482
483// static
484InMemoryURLIndex::String16Vector InMemoryURLIndex::WordVectorFromString16(
485    const string16& uni_string,
486    bool break_on_space) {
487  base::BreakIterator iter(&uni_string, break_on_space ?
488      base::BreakIterator::BREAK_SPACE : base::BreakIterator::BREAK_WORD);
489  String16Vector words;
490  if (!iter.Init())
491    return words;
492  while (iter.Advance()) {
493    if (break_on_space || iter.IsWord()) {
494      string16 word = iter.GetString();
495      if (break_on_space)
496        TrimWhitespace(word, TRIM_ALL, &word);
497      if (!word.empty())
498        words.push_back(word);
499    }
500  }
501  return words;
502}
503
504// static
505InMemoryURLIndex::Char16Vector InMemoryURLIndex::Char16VectorFromString16(
506    const string16& uni_word) {
507  InMemoryURLIndex::Char16Vector characters;
508  InMemoryURLIndex::Char16Set unique_characters;
509  for (string16::const_iterator iter = uni_word.begin();
510       iter != uni_word.end(); ++iter) {
511    if (!unique_characters.count(*iter)) {
512      unique_characters.insert(*iter);
513      characters.push_back(*iter);
514    }
515  }
516  return characters;
517}
518
519// static
520InMemoryURLIndex::Char16Set InMemoryURLIndex::Char16SetFromString16(
521    const string16& uni_word) {
522  Char16Set characters;
523  for (string16::const_iterator iter = uni_word.begin();
524       iter != uni_word.end(); ++iter)
525    characters.insert(*iter);
526  return characters;
527}
528
529void InMemoryURLIndex::AddWordToIndex(const string16& uni_word,
530                                      HistoryID history_id) {
531  WordMap::iterator word_pos = word_map_.find(uni_word);
532  if (word_pos != word_map_.end())
533    UpdateWordHistory(word_pos->second, history_id);
534  else
535    AddWordHistory(uni_word, history_id);
536}
537
538void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) {
539    WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
540    DCHECK(history_pos != word_id_history_map_.end());
541    HistoryIDSet& history_id_set(history_pos->second);
542    history_id_set.insert(history_id);
543}
544
545// Add a new word to the word list and the word map, and then create a
546// new entry in the word/history map.
547void InMemoryURLIndex::AddWordHistory(const string16& uni_word,
548                                      HistoryID history_id) {
549  word_list_.push_back(uni_word);
550  WordID word_id = word_list_.size() - 1;
551  word_map_[uni_word] = word_id;
552  HistoryIDSet history_id_set;
553  history_id_set.insert(history_id);
554  word_id_history_map_[word_id] = history_id_set;
555  // For each character in the newly added word (i.e. a word that is not
556  // already in the word index), add the word to the character index.
557  Char16Set characters = Char16SetFromString16(uni_word);
558  for (Char16Set::iterator uni_char_iter = characters.begin();
559       uni_char_iter != characters.end(); ++uni_char_iter) {
560    Char16Set::value_type uni_char = *uni_char_iter;
561    CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
562    if (char_iter != char_word_map_.end()) {
563      // Update existing entry in the char/word index.
564      WordIDSet& word_id_set(char_iter->second);
565      word_id_set.insert(word_id);
566    } else {
567      // Create a new entry in the char/word index.
568      WordIDSet word_id_set;
569      word_id_set.insert(word_id);
570      char_word_map_[uni_char] = word_id_set;
571    }
572  }
573}
574
575InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars(
576    const InMemoryURLIndex::Char16Vector& uni_chars) {
577  size_t index = CachedResultsIndexForTerm(uni_chars);
578
579  // If there were no unprocessed characters in the search term |uni_chars|
580  // then we can use the cached one as-is as the results with no further
581  // filtering.
582  if (index != kNoCachedResultForTerm && index == uni_chars.size() - 1)
583    return term_char_word_set_cache_[index].word_id_set_;
584
585  // Some or all of the characters remain to be indexed so trim the cache.
586  if (index + 1 < term_char_word_set_cache_.size())
587    term_char_word_set_cache_.resize(index + 1);
588  WordIDSet word_id_set;
589  // Take advantage of our cached starting point, if any.
590  Char16Vector::const_iterator c_iter = uni_chars.begin();
591  if (index != kNoCachedResultForTerm) {
592    word_id_set = term_char_word_set_cache_[index].word_id_set_;
593    c_iter += index + 1;
594  }
595  // Now process the remaining characters in the search term.
596  for (; c_iter != uni_chars.end(); ++c_iter) {
597    Char16Vector::value_type uni_char = *c_iter;
598    CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
599    if (char_iter == char_word_map_.end()) {
600      // A character was not found so there are no matching results: bail.
601      word_id_set.clear();
602      break;
603    }
604    WordIDSet& char_word_id_set(char_iter->second);
605    // It is possible for there to no longer be any words associated with
606    // a particular character. Give up in that case.
607    if (char_word_id_set.empty()) {
608      word_id_set.clear();
609      break;
610    }
611
612    if (word_id_set.empty()) {
613      word_id_set = char_word_id_set;
614    } else {
615      WordIDSet old_word_id_set(word_id_set);
616      word_id_set.clear();
617      std::set_intersection(old_word_id_set.begin(),
618                            old_word_id_set.end(),
619                            char_word_id_set.begin(),
620                            char_word_id_set.end(),
621                            std::inserter(word_id_set,
622                            word_id_set.begin()));
623    }
624    // Add this new char/set instance to the cache.
625    term_char_word_set_cache_.push_back(TermCharWordSet(
626        uni_char, word_id_set, true));
627  }
628  return word_id_set;
629}
630
631size_t InMemoryURLIndex::CachedResultsIndexForTerm(
632    const InMemoryURLIndex::Char16Vector& uni_chars) {
633  TermCharWordSetVector::iterator t_iter = term_char_word_set_cache_.begin();
634  for (Char16Vector::const_iterator c_iter = uni_chars.begin();
635       (c_iter != uni_chars.end()) &&
636       (t_iter != term_char_word_set_cache_.end()) &&
637       (*c_iter == t_iter->char_);
638       ++c_iter, ++t_iter)
639    t_iter->used_ = true;  // Update the cache.
640  return t_iter - term_char_word_set_cache_.begin() - 1;
641}
642
643// static
644TermMatches InMemoryURLIndex::MatchTermInString(const string16& term,
645                                                const string16& string,
646                                                int term_num) {
647  TermMatches matches;
648  for (size_t location = string.find(term); location != string16::npos;
649       location = string.find(term, location + 1))
650    matches.push_back(TermMatch(term_num, location, term.size()));
651  return matches;
652}
653
654// static
655TermMatches InMemoryURLIndex::SortAndDeoverlap(const TermMatches& matches) {
656  if (matches.empty())
657    return matches;
658  TermMatches sorted_matches = matches;
659  std::sort(sorted_matches.begin(), sorted_matches.end(),
660            MatchOffsetLess);
661  TermMatches clean_matches;
662  TermMatch last_match = sorted_matches[0];
663  clean_matches.push_back(last_match);
664  for (TermMatches::const_iterator iter = sorted_matches.begin() + 1;
665       iter != sorted_matches.end(); ++iter) {
666    if (iter->offset >= last_match.offset + last_match.length) {
667      last_match = *iter;
668      clean_matches.push_back(last_match);
669    }
670  }
671  return clean_matches;
672}
673
674// static
675std::vector<size_t> InMemoryURLIndex::OffsetsFromTermMatches(
676    const TermMatches& matches) {
677  std::vector<size_t> offsets;
678  for (TermMatches::const_iterator i = matches.begin(); i != matches.end(); ++i)
679    offsets.push_back(i->offset);
680  return offsets;
681}
682
683// static
684TermMatches InMemoryURLIndex::ReplaceOffsetsInTermMatches(
685    const TermMatches& matches,
686    const std::vector<size_t>& offsets) {
687  DCHECK_EQ(matches.size(), offsets.size());
688  TermMatches new_matches;
689  std::vector<size_t>::const_iterator offset_iter = offsets.begin();
690  for (TermMatches::const_iterator term_iter = matches.begin();
691       term_iter != matches.end(); ++term_iter, ++offset_iter) {
692    if (*offset_iter != string16::npos) {
693      TermMatch new_match(*term_iter);
694      new_match.offset = *offset_iter;
695      new_matches.push_back(new_match);
696    }
697  }
698  return new_matches;
699}
700
701// static
702ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL(
703    const URLRow& row,
704    const String16Vector& terms) {
705  ScoredHistoryMatch match(row);
706  GURL gurl = row.url();
707  if (!gurl.is_valid())
708    return match;
709
710  // Figure out where each search term appears in the URL and/or page title
711  // so that we can score as well as provide autocomplete highlighting.
712  string16 url = l10n_util::ToLower(UTF8ToUTF16(gurl.spec()));
713  // Strip any 'http://' prefix before matching.
714  if (url_util::FindAndCompareScheme(url, chrome::kHttpScheme, NULL)) {
715    match.prefix_adjust = strlen(chrome::kHttpScheme) + 3;  // Allow for '://'.
716    url = url.substr(match.prefix_adjust);
717  }
718
719  string16 title = l10n_util::ToLower(row.title());
720  int term_num = 0;
721  for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();
722       ++iter, ++term_num) {
723    string16 term = *iter;
724    TermMatches url_term_matches = MatchTermInString(term, url, term_num);
725    TermMatches title_term_matches = MatchTermInString(term, title, term_num);
726    if (url_term_matches.empty() && title_term_matches.empty())
727      return match;  // A term was not found in either URL or title - reject.
728    match.url_matches.insert(match.url_matches.end(), url_term_matches.begin(),
729                             url_term_matches.end());
730    match.title_matches.insert(match.title_matches.end(),
731                               title_term_matches.begin(),
732                               title_term_matches.end());
733  }
734
735  // Sort matches by offset and eliminate any which overlap.
736  match.url_matches = SortAndDeoverlap(match.url_matches);
737  match.title_matches = SortAndDeoverlap(match.title_matches);
738
739  // Get partial scores based on term matching. Note that the score for
740  // each of the URL and title are adjusted by the fraction of the
741  // terms appearing in each.
742  int url_score = ScoreComponentForMatches(match.url_matches, url.size()) *
743      match.url_matches.size() / terms.size();
744  int title_score =
745      ScoreComponentForMatches(match.title_matches, title.size()) *
746      static_cast<int>(match.title_matches.size()) /
747      static_cast<int>(terms.size());
748  // Arbitrarily pick the best.
749  // TODO(mrossetti): It might make sense that a term which appears in both the
750  // URL and the Title should boost the score a bit.
751  int term_score = std::max(url_score, title_score);
752  if (term_score == 0)
753    return match;
754
755  // Factor in recency of visit, visit count and typed count attributes of the
756  // URLRow.
757  const int kDaysAgoLevel[] = { 0, 10, 20, 30 };
758  int score = ScoreForValue((base::Time::Now() -
759      row.last_visit()).InDays(), kDaysAgoLevel);
760  const int kVisitCountLevel[] = { 30, 10, 5, 3 };
761  int visit_count_value = ScoreForValue(row.visit_count(), kVisitCountLevel);
762  const int kTypedCountLevel[] = { 10, 5, 3, 1 };
763  int typed_count_value = ScoreForValue(row.typed_count(), kTypedCountLevel);
764
765  // Determine how many of the factors comprising the final score are
766  // significant by summing the relative factors for each and subtracting how
767  // many will be 'discarded' even if they are low.
768  const int kVisitCountMultiplier = 2;
769  const int kTypedCountMultiplier = 3;
770  const int kSignificantFactors =
771      kVisitCountMultiplier +  // Visit count factor plus
772      kTypedCountMultiplier +  // typed count factor plus
773      2 -                      // one each for string match and last visit
774      2;                       // minus 2 insignificant factors.
775  // The following, in effect, discards up to |kSignificantFactors| low scoring
776  // elements which contribute little to the score but which can inordinately
777  // drag down an otherwise good score.
778  match.raw_score = std::min(kScoreRank[0], (term_score + score +
779      (visit_count_value * kVisitCountMultiplier) + (typed_count_value *
780      kTypedCountMultiplier)) / kSignificantFactors);
781
782  return match;
783}
784
785int InMemoryURLIndex::ScoreComponentForMatches(const TermMatches& matches,
786                                               size_t max_length) {
787  // TODO(mrossetti): This is good enough for now but must be fine-tuned.
788  if (matches.empty())
789    return 0;
790
791  // Score component for whether the input terms (if more than one) were found
792  // in the same order in the match.  Start with kOrderMaxValue points divided
793  // equally among (number of terms - 1); then discount each of those terms that
794  // is out-of-order in the match.
795  const int kOrderMaxValue = 250;
796  int order_value = kOrderMaxValue;
797  if (matches.size() > 1) {
798    int max_possible_out_of_order = matches.size() - 1;
799    int out_of_order = 0;
800    for (size_t i = 1; i < matches.size(); ++i) {
801      if (matches[i - 1].term_num > matches[i].term_num)
802        ++out_of_order;
803    }
804    order_value = (max_possible_out_of_order - out_of_order) * kOrderMaxValue /
805        max_possible_out_of_order;
806  }
807
808  // Score component for how early in the match string the first search term
809  // appears.  Start with kStartMaxValue points and discount by
810  // 1/kMaxSignificantStart points for each character later than the first at
811  // which the term begins. No points are earned if the start of the match
812  // occurs at or after kMaxSignificantStart.
813  const size_t kMaxSignificantStart = 20;
814  const int kStartMaxValue = 250;
815  int start_value = (kMaxSignificantStart -
816      std::min(kMaxSignificantStart, matches[0].offset)) * kStartMaxValue /
817      kMaxSignificantStart;
818
819  // Score component for how much of the matched string the input terms cover.
820  // kCompleteMaxValue points times the fraction of the URL/page title string
821  // that was matched.
822  size_t term_length_total = std::accumulate(matches.begin(), matches.end(),
823                                             0, AccumulateMatchLength);
824  const int kCompleteMaxValue = 500;
825  int complete_value = term_length_total * kCompleteMaxValue / max_length;
826
827  int raw_score = order_value + start_value + complete_value;
828  const int kTermScoreLevel[] = { 1000, 650, 500, 200 };
829
830  // Scale the sum of the three components above into a single score component
831  // on the same scale as that used in ScoredMatchForURL().
832  return ScoreForValue(raw_score, kTermScoreLevel);
833}
834
835InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch(
836    const InMemoryURLIndex& index,
837    const String16Vector& lower_terms)
838  : index_(index),
839    lower_terms_(lower_terms) {
840}
841
842InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {}
843
844void InMemoryURLIndex::AddHistoryMatch::operator()(
845    const InMemoryURLIndex::HistoryID history_id) {
846  HistoryInfoMap::const_iterator hist_pos =
847      index_.history_info_map_.find(history_id);
848  // Note that a history_id may be present in the word_id_history_map_ yet not
849  // be found in the history_info_map_. This occurs when an item has been
850  // deleted by the user or the item no longer qualifies as a quick result.
851  if (hist_pos != index_.history_info_map_.end()) {
852    const URLRow& hist_item = hist_pos->second;
853    ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_terms_));
854    if (match.raw_score > 0)
855      scored_matches_.push_back(match);
856  }
857}
858
859bool InMemoryURLIndex::GetCacheFilePath(FilePath* file_path) {
860  if (history_dir_.empty())
861    return false;
862  *file_path = history_dir_.Append(FILE_PATH_LITERAL("History Provider Cache"));
863  return true;
864}
865
866void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const {
867  DCHECK(cache);
868  cache->set_timestamp(base::Time::Now().ToInternalValue());
869  cache->set_history_item_count(history_item_count_);
870  SaveWordList(cache);
871  SaveWordMap(cache);
872  SaveCharWordMap(cache);
873  SaveWordIDHistoryMap(cache);
874  SaveHistoryInfoMap(cache);
875}
876
877bool InMemoryURLIndex::RestorePrivateData(
878    const InMemoryURLIndexCacheItem& cache) {
879  last_saved_ = base::Time::FromInternalValue(cache.timestamp());
880  history_item_count_ = cache.history_item_count();
881  return (history_item_count_ == 0) || (RestoreWordList(cache) &&
882      RestoreWordMap(cache) && RestoreCharWordMap(cache) &&
883      RestoreWordIDHistoryMap(cache) && RestoreHistoryInfoMap(cache));
884}
885
886
887void InMemoryURLIndex::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
888  if (word_list_.empty())
889    return;
890  WordListItem* list_item = cache->mutable_word_list();
891  list_item->set_word_count(word_list_.size());
892  for (String16Vector::const_iterator iter = word_list_.begin();
893       iter != word_list_.end(); ++iter)
894    list_item->add_word(UTF16ToUTF8(*iter));
895}
896
897bool InMemoryURLIndex::RestoreWordList(const InMemoryURLIndexCacheItem& cache) {
898  if (!cache.has_word_list())
899    return false;
900  const WordListItem& list_item(cache.word_list());
901  uint32 expected_item_count = list_item.word_count();
902  uint32 actual_item_count = list_item.word_size();
903  if (actual_item_count == 0 || actual_item_count != expected_item_count)
904    return false;
905  const RepeatedPtrField<std::string>& words(list_item.word());
906  for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
907       iter != words.end(); ++iter)
908    word_list_.push_back(UTF8ToUTF16(*iter));
909  return true;
910}
911
912void InMemoryURLIndex::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
913  if (word_map_.empty())
914    return;
915  WordMapItem* map_item = cache->mutable_word_map();
916  map_item->set_item_count(word_map_.size());
917  for (WordMap::const_iterator iter = word_map_.begin();
918       iter != word_map_.end(); ++iter) {
919    WordMapEntry* map_entry = map_item->add_word_map_entry();
920    map_entry->set_word(UTF16ToUTF8(iter->first));
921    map_entry->set_word_id(iter->second);
922  }
923}
924
925bool InMemoryURLIndex::RestoreWordMap(const InMemoryURLIndexCacheItem& cache) {
926  if (!cache.has_word_map())
927    return false;
928  const WordMapItem& list_item(cache.word_map());
929  uint32 expected_item_count = list_item.item_count();
930  uint32 actual_item_count = list_item.word_map_entry_size();
931  if (actual_item_count == 0 || actual_item_count != expected_item_count)
932    return false;
933  const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
934  for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
935       iter != entries.end(); ++iter)
936    word_map_[UTF8ToUTF16(iter->word())] = iter->word_id();
937  return true;
938}
939
940void InMemoryURLIndex::SaveCharWordMap(InMemoryURLIndexCacheItem* cache) const {
941  if (char_word_map_.empty())
942    return;
943  CharWordMapItem* map_item = cache->mutable_char_word_map();
944  map_item->set_item_count(char_word_map_.size());
945  for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
946       iter != char_word_map_.end(); ++iter) {
947    CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
948    map_entry->set_char_16(iter->first);
949    const WordIDSet& word_id_set(iter->second);
950    map_entry->set_item_count(word_id_set.size());
951    for (WordIDSet::const_iterator set_iter = word_id_set.begin();
952         set_iter != word_id_set.end(); ++set_iter)
953      map_entry->add_word_id(*set_iter);
954  }
955}
956
957bool InMemoryURLIndex::RestoreCharWordMap(
958    const InMemoryURLIndexCacheItem& cache) {
959  if (!cache.has_char_word_map())
960    return false;
961  const CharWordMapItem& list_item(cache.char_word_map());
962  uint32 expected_item_count = list_item.item_count();
963  uint32 actual_item_count = list_item.char_word_map_entry_size();
964  if (actual_item_count == 0 || actual_item_count != expected_item_count)
965    return false;
966  const RepeatedPtrField<CharWordMapEntry>&
967      entries(list_item.char_word_map_entry());
968  for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
969       entries.begin(); iter != entries.end(); ++iter) {
970    expected_item_count = iter->item_count();
971    actual_item_count = iter->word_id_size();
972    if (actual_item_count == 0 || actual_item_count != expected_item_count)
973      return false;
974    char16 uni_char = static_cast<char16>(iter->char_16());
975    WordIDSet word_id_set;
976    const RepeatedField<int32>& word_ids(iter->word_id());
977    for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
978         jiter != word_ids.end(); ++jiter)
979      word_id_set.insert(*jiter);
980    char_word_map_[uni_char] = word_id_set;
981  }
982  return true;
983}
984
985void InMemoryURLIndex::SaveWordIDHistoryMap(InMemoryURLIndexCacheItem* cache)
986    const {
987  if (word_id_history_map_.empty())
988    return;
989  WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
990  map_item->set_item_count(word_id_history_map_.size());
991  for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
992       iter != word_id_history_map_.end(); ++iter) {
993    WordIDHistoryMapEntry* map_entry =
994        map_item->add_word_id_history_map_entry();
995    map_entry->set_word_id(iter->first);
996    const HistoryIDSet& history_id_set(iter->second);
997    map_entry->set_item_count(history_id_set.size());
998    for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
999         set_iter != history_id_set.end(); ++set_iter)
1000      map_entry->add_history_id(*set_iter);
1001  }
1002}
1003
1004bool InMemoryURLIndex::RestoreWordIDHistoryMap(
1005    const InMemoryURLIndexCacheItem& cache) {
1006  if (!cache.has_word_id_history_map())
1007    return false;
1008  const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
1009  uint32 expected_item_count = list_item.item_count();
1010  uint32 actual_item_count = list_item.word_id_history_map_entry_size();
1011  if (actual_item_count == 0 || actual_item_count != expected_item_count)
1012    return false;
1013  const RepeatedPtrField<WordIDHistoryMapEntry>&
1014      entries(list_item.word_id_history_map_entry());
1015  for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
1016       entries.begin(); iter != entries.end(); ++iter) {
1017    expected_item_count = iter->item_count();
1018    actual_item_count = iter->history_id_size();
1019    if (actual_item_count == 0 || actual_item_count != expected_item_count)
1020      return false;
1021    WordID word_id = iter->word_id();
1022    HistoryIDSet history_id_set;
1023    const RepeatedField<int64>& history_ids(iter->history_id());
1024    for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
1025         jiter != history_ids.end(); ++jiter)
1026      history_id_set.insert(*jiter);
1027    word_id_history_map_[word_id] = history_id_set;
1028  }
1029  return true;
1030}
1031
1032void InMemoryURLIndex::SaveHistoryInfoMap(
1033    InMemoryURLIndexCacheItem* cache) const {
1034  if (history_info_map_.empty())
1035    return;
1036  HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
1037  map_item->set_item_count(history_info_map_.size());
1038  for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
1039       iter != history_info_map_.end(); ++iter) {
1040    HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
1041    map_entry->set_history_id(iter->first);
1042    const URLRow& url_row(iter->second);
1043    // Note: We only save information that contributes to the index so there
1044    // is no need to save term_char_word_set_cache_ (not persistent),
1045    // languages_, etc.
1046    map_entry->set_visit_count(url_row.visit_count());
1047    map_entry->set_typed_count(url_row.typed_count());
1048    map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
1049    map_entry->set_url(url_row.url().spec());
1050    map_entry->set_title(UTF16ToUTF8(url_row.title()));
1051  }
1052}
1053
1054bool InMemoryURLIndex::RestoreHistoryInfoMap(
1055    const InMemoryURLIndexCacheItem& cache) {
1056  if (!cache.has_history_info_map())
1057    return false;
1058  const HistoryInfoMapItem& list_item(cache.history_info_map());
1059  uint32 expected_item_count = list_item.item_count();
1060  uint32 actual_item_count = list_item.history_info_map_entry_size();
1061  if (actual_item_count == 0 || actual_item_count != expected_item_count)
1062    return false;
1063  const RepeatedPtrField<HistoryInfoMapEntry>&
1064      entries(list_item.history_info_map_entry());
1065  for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
1066       entries.begin(); iter != entries.end(); ++iter) {
1067    HistoryID history_id = iter->history_id();
1068    GURL url(iter->url());
1069    URLRow url_row(url, history_id);
1070    url_row.set_visit_count(iter->visit_count());
1071    url_row.set_typed_count(iter->typed_count());
1072    url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
1073    if (iter->has_title()) {
1074      string16 title(UTF8ToUTF16(iter->title()));
1075      url_row.set_title(title);
1076    }
1077    history_info_map_[history_id] = url_row;
1078  }
1079  return true;
1080}
1081
1082}  // namespace history
1083