1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/history/url_index_private_data.h"
6
7#include <algorithm>
8#include <functional>
9#include <iterator>
10#include <limits>
11#include <numeric>
12#include <string>
13#include <vector>
14
15#include "base/basictypes.h"
16#include "base/file_util.h"
17#include "base/i18n/case_conversion.h"
18#include "base/metrics/histogram.h"
19#include "base/strings/string_util.h"
20#include "base/strings/utf_string_conversions.h"
21#include "base/time/time.h"
22#include "chrome/browser/autocomplete/autocomplete_provider.h"
23#include "chrome/browser/autocomplete/url_prefix.h"
24#include "chrome/browser/bookmarks/bookmark_service.h"
25#include "chrome/browser/history/history_database.h"
26#include "chrome/browser/history/history_db_task.h"
27#include "chrome/browser/history/history_service.h"
28#include "chrome/browser/history/in_memory_url_index.h"
29#include "content/public/browser/browser_thread.h"
30#include "content/public/browser/notification_details.h"
31#include "content/public/browser/notification_service.h"
32#include "content/public/browser/notification_source.h"
33#include "net/base/net_util.h"
34
35#if defined(USE_SYSTEM_PROTOBUF)
36#include <google/protobuf/repeated_field.h>
37#else
38#include "third_party/protobuf/src/google/protobuf/repeated_field.h"
39#endif
40
41using google::protobuf::RepeatedField;
42using google::protobuf::RepeatedPtrField;
43using in_memory_url_index::InMemoryURLIndexCacheItem;
44
45namespace {
46static const size_t kMaxVisitsToStoreInCache = 10u;
47}  // anonymous namespace
48
49namespace history {
50
51typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
52typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;
53typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
54typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;
55typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry
56    CharWordMapEntry;
57typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
58    WordIDHistoryMapItem;
59typedef imui::
60    InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
61    WordIDHistoryMapEntry;
62typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
63typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
64    HistoryInfoMapEntry;
65typedef imui::
66    InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
67    HistoryInfoMapEntry_VisitInfo;
68typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem;
69typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
70    WordStartsMapEntry;
71
72
73// Algorithm Functions ---------------------------------------------------------
74
75// Comparison function for sorting search terms by descending length.
76bool LengthGreater(const base::string16& string_a,
77                   const base::string16& string_b) {
78  return string_a.length() > string_b.length();
79}
80
81
82// UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
83
84// HistoryDBTask used to update the recent visit data for a particular
85// row from the history database.
86class UpdateRecentVisitsFromHistoryDBTask : public HistoryDBTask {
87 public:
88  explicit UpdateRecentVisitsFromHistoryDBTask(
89      URLIndexPrivateData* private_data,
90      URLID url_id);
91
92  virtual bool RunOnDBThread(HistoryBackend* backend,
93                             history::HistoryDatabase* db) OVERRIDE;
94  virtual void DoneRunOnMainThread() OVERRIDE;
95
96 private:
97  virtual ~UpdateRecentVisitsFromHistoryDBTask();
98
99  // The URLIndexPrivateData that gets updated after the historyDB
100  // task returns.
101  URLIndexPrivateData* private_data_;
102  // The ID of the URL to get visits for and then update.
103  URLID url_id_;
104  // Whether fetching the recent visits for the URL succeeded.
105  bool succeeded_;
106  // The awaited data that's shown to private_data_ for it to copy and
107  // store.
108  VisitVector recent_visits_;
109
110  DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
111};
112
113UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
114    URLIndexPrivateData* private_data,
115    URLID url_id)
116    : private_data_(private_data),
117      url_id_(url_id),
118      succeeded_(false) {
119}
120
121bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
122    HistoryBackend* backend,
123    HistoryDatabase* db) {
124  // Make sure the private data is going to get as many recent visits as
125  // ScoredHistoryMatch::GetFrecency() hopes to use.
126  DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
127  succeeded_ = db->GetMostRecentVisitsForURL(url_id_,
128                                             kMaxVisitsToStoreInCache,
129                                             &recent_visits_);
130  if (!succeeded_)
131    recent_visits_.clear();
132  return true;  // Always claim to be done; do not retry failures.
133}
134
135void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
136  if (succeeded_)
137    private_data_->UpdateRecentVisits(url_id_, recent_visits_);
138}
139
140UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
141}
142
143
144// URLIndexPrivateData ---------------------------------------------------------
145
146URLIndexPrivateData::URLIndexPrivateData()
147    : restored_cache_version_(0),
148      saved_cache_version_(kCurrentCacheFileVersion),
149      pre_filter_item_count_(0),
150      post_filter_item_count_(0),
151      post_scoring_item_count_(0) {
152}
153
154ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
155    base::string16 search_string,
156    size_t cursor_position,
157    const std::string& languages,
158    BookmarkService* bookmark_service) {
159  // If cursor position is set and useful (not at either end of the
160  // string), allow the search string to be broken at cursor position.
161  // We do this by pretending there's a space where the cursor is.
162  if ((cursor_position != base::string16::npos) &&
163      (cursor_position < search_string.length()) &&
164      (cursor_position > 0)) {
165    search_string.insert(cursor_position, ASCIIToUTF16(" "));
166  }
167  pre_filter_item_count_ = 0;
168  post_filter_item_count_ = 0;
169  post_scoring_item_count_ = 0;
170  // The search string we receive may contain escaped characters. For reducing
171  // the index we need individual, lower-cased words, ignoring escapings. For
172  // the final filtering we need whitespace separated substrings possibly
173  // containing escaped characters.
174  base::string16 lower_raw_string(base::i18n::ToLower(search_string));
175  base::string16 lower_unescaped_string =
176      net::UnescapeURLComponent(lower_raw_string,
177          net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
178  // Extract individual 'words' (as opposed to 'terms'; see below) from the
179  // search string. When the user types "colspec=ID%20Mstone Release" we get
180  // four 'words': "colspec", "id", "mstone" and "release".
181  String16Vector lower_words(
182      history::String16VectorFromString16(lower_unescaped_string, false, NULL));
183  ScoredHistoryMatches scored_items;
184
185  // Do nothing if we have indexed no words (probably because we've not been
186  // initialized yet) or the search string has no words.
187  if (word_list_.empty() || lower_words.empty()) {
188    search_term_cache_.clear();  // Invalidate the term cache.
189    return scored_items;
190  }
191
192  // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
193  // approach.
194  ResetSearchTermCache();
195
196  HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);
197
198  // Trim the candidate pool if it is large. Note that we do not filter out
199  // items that do not contain the search terms as proper substrings -- doing
200  // so is the performance-costly operation we are trying to avoid in order
201  // to maintain omnibox responsiveness.
202  const size_t kItemsToScoreLimit = 500;
203  pre_filter_item_count_ = history_id_set.size();
204  // If we trim the results set we do not want to cache the results for next
205  // time as the user's ultimately desired result could easily be eliminated
206  // in this early rough filter.
207  bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
208  if (was_trimmed) {
209    HistoryIDVector history_ids;
210    std::copy(history_id_set.begin(), history_id_set.end(),
211              std::back_inserter(history_ids));
212    // Trim down the set by sorting by typed-count, visit-count, and last
213    // visit.
214    HistoryItemFactorGreater
215        item_factor_functor(history_info_map_);
216    std::partial_sort(history_ids.begin(),
217                      history_ids.begin() + kItemsToScoreLimit,
218                      history_ids.end(),
219                      item_factor_functor);
220    history_id_set.clear();
221    std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
222              std::inserter(history_id_set, history_id_set.end()));
223    post_filter_item_count_ = history_id_set.size();
224  }
225
226  // Pass over all of the candidates filtering out any without a proper
227  // substring match, inserting those which pass in order by score. Note that
228  // in this step we are using the raw search string complete with escaped
229  // URL elements. When the user has specifically typed something akin to
230  // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
231  // specific substring appears in the URL or page title.
232
233  // We call these 'terms' (as opposed to 'words'; see above) as in this case
234  // we only want to break up the search string on 'true' whitespace rather than
235  // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
236  // get two 'terms': "colspec=id%20mstone" and "release".
237  history::String16Vector lower_raw_terms;
238  if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,
239               &lower_raw_terms) == 0) {
240    // Don't score matches when there are no terms to score against.  (It's
241    // possible that the word break iterater that extracts words to search
242    // for in the database allows some whitespace "words" whereas Tokenize
243    // excludes a long list of whitespace.)  One could write a scoring
244    // function that gives a reasonable order to matches when there
245    // are no terms (i.e., all the words are some form of whitespace),
246    // but this is such a rare edge case that it's not worth the time.
247    return scored_items;
248  }
249  scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
250      AddHistoryMatch(*this, languages, bookmark_service, lower_raw_string,
251                      lower_raw_terms, base::Time::Now())).ScoredMatches();
252
253  // Select and sort only the top kMaxMatches results.
254  if (scored_items.size() > AutocompleteProvider::kMaxMatches) {
255    std::partial_sort(scored_items.begin(),
256                      scored_items.begin() +
257                          AutocompleteProvider::kMaxMatches,
258                      scored_items.end(),
259                      ScoredHistoryMatch::MatchScoreGreater);
260      scored_items.resize(AutocompleteProvider::kMaxMatches);
261  } else {
262    std::sort(scored_items.begin(), scored_items.end(),
263              ScoredHistoryMatch::MatchScoreGreater);
264  }
265  post_scoring_item_count_ = scored_items.size();
266
267  if (was_trimmed) {
268    search_term_cache_.clear();  // Invalidate the term cache.
269  } else {
270    // Remove any stale SearchTermCacheItems.
271    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
272         cache_iter != search_term_cache_.end(); ) {
273      if (!cache_iter->second.used_)
274        search_term_cache_.erase(cache_iter++);
275      else
276        ++cache_iter;
277    }
278  }
279
280  return scored_items;
281}
282
283bool URLIndexPrivateData::UpdateURL(
284    HistoryService* history_service,
285    const URLRow& row,
286    const std::string& languages,
287    const std::set<std::string>& scheme_whitelist) {
288  // The row may or may not already be in our index. If it is not already
289  // indexed and it qualifies then it gets indexed. If it is already
290  // indexed and still qualifies then it gets updated, otherwise it
291  // is deleted from the index.
292  bool row_was_updated = false;
293  URLID row_id = row.id();
294  HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
295  if (row_pos == history_info_map_.end()) {
296    // This new row should be indexed if it qualifies.
297    URLRow new_row(row);
298    new_row.set_id(row_id);
299    row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
300        IndexRow(NULL, history_service, new_row, languages, scheme_whitelist);
301  } else if (RowQualifiesAsSignificant(row, base::Time())) {
302    // This indexed row still qualifies and will be re-indexed.
303    // The url won't have changed but the title, visit count, etc.
304    // might have changed.
305    URLRow& row_to_update = row_pos->second.url_row;
306    bool title_updated = row_to_update.title() != row.title();
307    if (row_to_update.visit_count() != row.visit_count() ||
308        row_to_update.typed_count() != row.typed_count() ||
309        row_to_update.last_visit() != row.last_visit() || title_updated) {
310      row_to_update.set_visit_count(row.visit_count());
311      row_to_update.set_typed_count(row.typed_count());
312      row_to_update.set_last_visit(row.last_visit());
313      // If something appears to have changed, update the recent visits
314      // information.
315      ScheduleUpdateRecentVisits(history_service, row_id);
316      // While the URL is guaranteed to remain stable, the title may have
317      // changed. If so, then update the index with the changed words.
318      if (title_updated) {
319        // Clear all words associated with this row and re-index both the
320        // URL and title.
321        RemoveRowWordsFromIndex(row_to_update);
322        row_to_update.set_title(row.title());
323        RowWordStarts word_starts;
324        AddRowWordsToIndex(row_to_update, &word_starts, languages);
325        word_starts_map_[row_id] = word_starts;
326      }
327      row_was_updated = true;
328    }
329  } else {
330    // This indexed row no longer qualifies and will be de-indexed by
331    // clearing all words associated with this row.
332    RemoveRowFromIndex(row);
333    row_was_updated = true;
334  }
335  if (row_was_updated)
336    search_term_cache_.clear();  // This invalidates the cache.
337  return row_was_updated;
338}
339
340void URLIndexPrivateData::UpdateRecentVisits(
341    URLID url_id,
342    const VisitVector& recent_visits) {
343  HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);
344  if (row_pos != history_info_map_.end()) {
345    VisitInfoVector* visits = &row_pos->second.visits;
346    visits->clear();
347    const size_t size =
348        std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
349    visits->reserve(size);
350    for (size_t i = 0; i < size; i++) {
351      // Copy from the VisitVector the only fields visits needs.
352      visits->push_back(std::make_pair(recent_visits[i].visit_time,
353                                       recent_visits[i].transition));
354    }
355  }
356  // Else: Oddly, the URL doesn't seem to exist in the private index.
357  // Ignore this update.  This can happen if, for instance, the user
358  // removes the URL from URLIndexPrivateData before the historyDB call
359  // returns.
360}
361
362void URLIndexPrivateData::ScheduleUpdateRecentVisits(
363    HistoryService* history_service,
364    URLID url_id) {
365  history_service->ScheduleDBTask(
366      new UpdateRecentVisitsFromHistoryDBTask(this, url_id),
367      &recent_visits_consumer_);
368}
369
370// Helper functor for DeleteURL.
371class HistoryInfoMapItemHasURL {
372 public:
373  explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}
374
375  bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
376    return item.second.url_row.url() == url_;
377  }
378
379 private:
380  const GURL& url_;
381};
382
383bool URLIndexPrivateData::DeleteURL(const GURL& url) {
384  // Find the matching entry in the history_info_map_.
385  HistoryInfoMap::iterator pos = std::find_if(
386      history_info_map_.begin(),
387      history_info_map_.end(),
388      HistoryInfoMapItemHasURL(url));
389  if (pos == history_info_map_.end())
390    return false;
391  RemoveRowFromIndex(pos->second.url_row);
392  search_term_cache_.clear();  // This invalidates the cache.
393  return true;
394}
395
396// static
397scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
398    const base::FilePath& file_path,
399    const std::string& languages) {
400  base::TimeTicks beginning_time = base::TimeTicks::Now();
401  if (!base::PathExists(file_path))
402    return NULL;
403  std::string data;
404  // If there is no cache file then simply give up. This will cause us to
405  // attempt to rebuild from the history database.
406  if (!base::ReadFileToString(file_path, &data))
407    return NULL;
408
409  scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
410  InMemoryURLIndexCacheItem index_cache;
411  if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
412    LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
413                 << file_path.value();
414    return restored_data;
415  }
416
417  if (!restored_data->RestorePrivateData(index_cache, languages))
418    return NULL;
419
420  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
421                      base::TimeTicks::Now() - beginning_time);
422  UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
423                       restored_data->history_id_word_map_.size());
424  UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
425  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
426                             restored_data->word_map_.size());
427  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
428                             restored_data->char_word_map_.size());
429  if (restored_data->Empty())
430    return NULL;  // 'No data' is the same as a failed reload.
431  return restored_data;
432}
433
434// static
435scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
436    HistoryDatabase* history_db,
437    const std::string& languages,
438    const std::set<std::string>& scheme_whitelist) {
439  if (!history_db)
440    return NULL;
441
442  base::TimeTicks beginning_time = base::TimeTicks::Now();
443
444  scoped_refptr<URLIndexPrivateData>
445      rebuilt_data(new URLIndexPrivateData);
446  URLDatabase::URLEnumerator history_enum;
447  if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
448    return NULL;
449  rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
450  for (URLRow row; history_enum.GetNextURL(&row); ) {
451    rebuilt_data->IndexRow(history_db, NULL, row, languages,
452                           scheme_whitelist);
453  }
454
455  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
456                      base::TimeTicks::Now() - beginning_time);
457  UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
458                       rebuilt_data->history_id_word_map_.size());
459  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
460                             rebuilt_data->word_map_.size());
461  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
462                             rebuilt_data->char_word_map_.size());
463  return rebuilt_data;
464}
465
466// static
467bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
468    scoped_refptr<URLIndexPrivateData> private_data,
469    const base::FilePath& file_path) {
470  DCHECK(private_data.get());
471  DCHECK(!file_path.empty());
472  return private_data->SaveToFile(file_path);
473}
474
475void URLIndexPrivateData::CancelPendingUpdates() {
476  recent_visits_consumer_.CancelAllRequests();
477}
478
479scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
480  scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
481  data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
482  data_copy->word_list_ = word_list_;
483  data_copy->available_words_ = available_words_;
484  data_copy->word_map_ = word_map_;
485  data_copy->char_word_map_ = char_word_map_;
486  data_copy->word_id_history_map_ = word_id_history_map_;
487  data_copy->history_id_word_map_ = history_id_word_map_;
488  data_copy->history_info_map_ = history_info_map_;
489  data_copy->word_starts_map_ = word_starts_map_;
490  return data_copy;
491  // Not copied:
492  //    search_term_cache_
493  //    pre_filter_item_count_
494  //    post_filter_item_count_
495  //    post_scoring_item_count_
496};
497
498bool URLIndexPrivateData::Empty() const {
499  return history_info_map_.empty();
500}
501
502void URLIndexPrivateData::Clear() {
503  last_time_rebuilt_from_history_ = base::Time();
504  word_list_.clear();
505  available_words_.clear();
506  word_map_.clear();
507  char_word_map_.clear();
508  word_id_history_map_.clear();
509  history_id_word_map_.clear();
510  history_info_map_.clear();
511  word_starts_map_.clear();
512}
513
514URLIndexPrivateData::~URLIndexPrivateData() {}
515
516HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
517    const String16Vector& unsorted_words) {
518  // Break the terms down into individual terms (words), get the candidate
519  // set for each term, and intersect each to get a final candidate list.
520  // Note that a single 'term' from the user's perspective might be
521  // a string like "http://www.somewebsite.com" which, from our perspective,
522  // is four words: 'http', 'www', 'somewebsite', and 'com'.
523  HistoryIDSet history_id_set;
524  String16Vector words(unsorted_words);
525  // Sort the words into the longest first as such are likely to narrow down
526  // the results quicker. Also, single character words are the most expensive
527  // to process so save them for last.
528  std::sort(words.begin(), words.end(), LengthGreater);
529  for (String16Vector::iterator iter = words.begin(); iter != words.end();
530       ++iter) {
531    base::string16 uni_word = *iter;
532    HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
533    if (term_history_set.empty()) {
534      history_id_set.clear();
535      break;
536    }
537    if (iter == words.begin()) {
538      history_id_set.swap(term_history_set);
539    } else {
540      HistoryIDSet new_history_id_set;
541      std::set_intersection(history_id_set.begin(), history_id_set.end(),
542                            term_history_set.begin(), term_history_set.end(),
543                            std::inserter(new_history_id_set,
544                                          new_history_id_set.begin()));
545      history_id_set.swap(new_history_id_set);
546    }
547  }
548  return history_id_set;
549}
550
551HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
552    const base::string16& term) {
553  if (term.empty())
554    return HistoryIDSet();
555
556  // TODO(mrossetti): Consider optimizing for very common terms such as
557  // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
558  // occuring words in the user's searches.
559
560  size_t term_length = term.length();
561  WordIDSet word_id_set;
562  if (term_length > 1) {
563    // See if this term or a prefix thereof is present in the cache.
564    SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
565    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
566         cache_iter != search_term_cache_.end(); ++cache_iter) {
567      if (StartsWith(term, cache_iter->first, false) &&
568          (best_prefix == search_term_cache_.end() ||
569           cache_iter->first.length() > best_prefix->first.length()))
570        best_prefix = cache_iter;
571    }
572
573    // If a prefix was found then determine the leftover characters to be used
574    // for further refining the results from that prefix.
575    Char16Set prefix_chars;
576    base::string16 leftovers(term);
577    if (best_prefix != search_term_cache_.end()) {
578      // If the prefix is an exact match for the term then grab the cached
579      // results and we're done.
580      size_t prefix_length = best_prefix->first.length();
581      if (prefix_length == term_length) {
582        best_prefix->second.used_ = true;
583        return best_prefix->second.history_id_set_;
584      }
585
586      // Otherwise we have a handy starting point.
587      // If there are no history results for this prefix then we can bail early
588      // as there will be no history results for the full term.
589      if (best_prefix->second.history_id_set_.empty()) {
590        search_term_cache_[term] = SearchTermCacheItem();
591        return HistoryIDSet();
592      }
593      word_id_set = best_prefix->second.word_id_set_;
594      prefix_chars = Char16SetFromString16(best_prefix->first);
595      leftovers = term.substr(prefix_length);
596    }
597
598    // Filter for each remaining, unique character in the term.
599    Char16Set leftover_chars = Char16SetFromString16(leftovers);
600    Char16Set unique_chars =
601        base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);
602
603    // Reduce the word set with any leftover, unprocessed characters.
604    if (!unique_chars.empty()) {
605      WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
606      // We might come up empty on the leftovers.
607      if (leftover_set.empty()) {
608        search_term_cache_[term] = SearchTermCacheItem();
609        return HistoryIDSet();
610      }
611      // Or there may not have been a prefix from which to start.
612      if (prefix_chars.empty()) {
613        word_id_set.swap(leftover_set);
614      } else {
615        WordIDSet new_word_id_set;
616        std::set_intersection(word_id_set.begin(), word_id_set.end(),
617                              leftover_set.begin(), leftover_set.end(),
618                              std::inserter(new_word_id_set,
619                                            new_word_id_set.begin()));
620        word_id_set.swap(new_word_id_set);
621      }
622    }
623
624    // We must filter the word list because the resulting word set surely
625    // contains words which do not have the search term as a proper subset.
626    for (WordIDSet::iterator word_set_iter = word_id_set.begin();
627         word_set_iter != word_id_set.end(); ) {
628      if (word_list_[*word_set_iter].find(term) == base::string16::npos)
629        word_id_set.erase(word_set_iter++);
630      else
631        ++word_set_iter;
632    }
633  } else {
634    word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
635  }
636
637  // If any words resulted then we can compose a set of history IDs by unioning
638  // the sets from each word.
639  HistoryIDSet history_id_set;
640  if (!word_id_set.empty()) {
641    for (WordIDSet::iterator word_id_iter = word_id_set.begin();
642         word_id_iter != word_id_set.end(); ++word_id_iter) {
643      WordID word_id = *word_id_iter;
644      WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
645      if (word_iter != word_id_history_map_.end()) {
646        HistoryIDSet& word_history_id_set(word_iter->second);
647        history_id_set.insert(word_history_id_set.begin(),
648                              word_history_id_set.end());
649      }
650    }
651  }
652
653  // Record a new cache entry for this word if the term is longer than
654  // a single character.
655  if (term_length > 1)
656    search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);
657
658  return history_id_set;
659}
660
661WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
662    const Char16Set& term_chars) {
663  WordIDSet word_id_set;
664  for (Char16Set::const_iterator c_iter = term_chars.begin();
665       c_iter != term_chars.end(); ++c_iter) {
666    CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
667    if (char_iter == char_word_map_.end()) {
668      // A character was not found so there are no matching results: bail.
669      word_id_set.clear();
670      break;
671    }
672    WordIDSet& char_word_id_set(char_iter->second);
673    // It is possible for there to no longer be any words associated with
674    // a particular character. Give up in that case.
675    if (char_word_id_set.empty()) {
676      word_id_set.clear();
677      break;
678    }
679
680    if (c_iter == term_chars.begin()) {
681      // First character results becomes base set of results.
682      word_id_set = char_word_id_set;
683    } else {
684      // Subsequent character results get intersected in.
685      WordIDSet new_word_id_set;
686      std::set_intersection(word_id_set.begin(), word_id_set.end(),
687                            char_word_id_set.begin(), char_word_id_set.end(),
688                            std::inserter(new_word_id_set,
689                                          new_word_id_set.begin()));
690      word_id_set.swap(new_word_id_set);
691    }
692  }
693  return word_id_set;
694}
695
696bool URLIndexPrivateData::IndexRow(
697    HistoryDatabase* history_db,
698    HistoryService* history_service,
699    const URLRow& row,
700    const std::string& languages,
701    const std::set<std::string>& scheme_whitelist) {
702  const GURL& gurl(row.url());
703
704  // Index only URLs with a whitelisted scheme.
705  if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
706    return false;
707
708  URLID row_id = row.id();
709  // Strip out username and password before saving and indexing.
710  base::string16 url(net::FormatUrl(gurl, languages,
711      net::kFormatUrlOmitUsernamePassword,
712      net::UnescapeRule::NONE,
713      NULL, NULL, NULL));
714
715  HistoryID history_id = static_cast<HistoryID>(row_id);
716  DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());
717
718  // Add the row for quick lookup in the history info store.
719  URLRow new_row(GURL(url), row_id);
720  new_row.set_visit_count(row.visit_count());
721  new_row.set_typed_count(row.typed_count());
722  new_row.set_last_visit(row.last_visit());
723  new_row.set_title(row.title());
724  history_info_map_[history_id].url_row = new_row;
725
726  // Index the words contained in the URL and title of the row.
727  RowWordStarts word_starts;
728  AddRowWordsToIndex(new_row, &word_starts, languages);
729  word_starts_map_[history_id] = word_starts;
730
731  // Update the recent visits information or schedule the update
732  // as appropriate.
733  if (history_db) {
734    // We'd like to check that we're on the history DB thread.
735    // However, unittest code actually calls this on the UI thread.
736    // So we don't do any thread checks.
737    VisitVector recent_visits;
738    // Make sure the private data is going to get as many recent visits as
739    // ScoredHistoryMatch::GetFrecency() hopes to use.
740    DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
741    if (history_db->GetMostRecentVisitsForURL(row_id,
742                                              kMaxVisitsToStoreInCache,
743                                              &recent_visits))
744      UpdateRecentVisits(row_id, recent_visits);
745  } else {
746    DCHECK(history_service);
747    ScheduleUpdateRecentVisits(history_service, row_id);
748  }
749
750  return true;
751}
752
753void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row,
754                                             RowWordStarts* word_starts,
755                                             const std::string& languages) {
756  HistoryID history_id = static_cast<HistoryID>(row.id());
757  // Split URL into individual, unique words then add in the title words.
758  const GURL& gurl(row.url());
759  const base::string16& url = CleanUpUrlForMatching(gurl, languages);
760  String16Set url_words = String16SetFromString16(url,
761      word_starts ? &word_starts->url_word_starts_ : NULL);
762  const base::string16& title = CleanUpTitleForMatching(row.title());
763  String16Set title_words = String16SetFromString16(title,
764      word_starts ? &word_starts->title_word_starts_ : NULL);
765  String16Set words;
766  std::set_union(url_words.begin(), url_words.end(),
767                 title_words.begin(), title_words.end(),
768                 std::insert_iterator<String16Set>(words, words.begin()));
769  for (String16Set::iterator word_iter = words.begin();
770       word_iter != words.end(); ++word_iter)
771    AddWordToIndex(*word_iter, history_id);
772
773  search_term_cache_.clear();  // Invalidate the term cache.
774}
775
776void URLIndexPrivateData::AddWordToIndex(const base::string16& term,
777                                         HistoryID history_id) {
778  WordMap::iterator word_pos = word_map_.find(term);
779  if (word_pos != word_map_.end())
780    UpdateWordHistory(word_pos->second, history_id);
781  else
782    AddWordHistory(term, history_id);
783}
784
785void URLIndexPrivateData::AddWordHistory(const base::string16& term,
786                                         HistoryID history_id) {
787  WordID word_id = word_list_.size();
788  if (available_words_.empty()) {
789    word_list_.push_back(term);
790  } else {
791    word_id = *(available_words_.begin());
792    word_list_[word_id] = term;
793    available_words_.erase(word_id);
794  }
795  word_map_[term] = word_id;
796
797  HistoryIDSet history_id_set;
798  history_id_set.insert(history_id);
799  word_id_history_map_[word_id] = history_id_set;
800  AddToHistoryIDWordMap(history_id, word_id);
801
802  // For each character in the newly added word (i.e. a word that is not
803  // already in the word index), add the word to the character index.
804  Char16Set characters = Char16SetFromString16(term);
805  for (Char16Set::iterator uni_char_iter = characters.begin();
806       uni_char_iter != characters.end(); ++uni_char_iter) {
807    char16 uni_char = *uni_char_iter;
808    CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
809    if (char_iter != char_word_map_.end()) {
810      // Update existing entry in the char/word index.
811      WordIDSet& word_id_set(char_iter->second);
812      word_id_set.insert(word_id);
813    } else {
814      // Create a new entry in the char/word index.
815      WordIDSet word_id_set;
816      word_id_set.insert(word_id);
817      char_word_map_[uni_char] = word_id_set;
818    }
819  }
820}
821
822void URLIndexPrivateData::UpdateWordHistory(WordID word_id,
823                                            HistoryID history_id) {
824  WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
825  DCHECK(history_pos != word_id_history_map_.end());
826  HistoryIDSet& history_id_set(history_pos->second);
827  history_id_set.insert(history_id);
828  AddToHistoryIDWordMap(history_id, word_id);
829}
830
831void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
832                                                WordID word_id) {
833  HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);
834  if (iter != history_id_word_map_.end()) {
835    WordIDSet& word_id_set(iter->second);
836    word_id_set.insert(word_id);
837  } else {
838    WordIDSet word_id_set;
839    word_id_set.insert(word_id);
840    history_id_word_map_[history_id] = word_id_set;
841  }
842}
843
844void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) {
845  RemoveRowWordsFromIndex(row);
846  HistoryID history_id = static_cast<HistoryID>(row.id());
847  history_info_map_.erase(history_id);
848  word_starts_map_.erase(history_id);
849}
850
851void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) {
852  // Remove the entries in history_id_word_map_ and word_id_history_map_ for
853  // this row.
854  HistoryID history_id = static_cast<HistoryID>(row.id());
855  WordIDSet word_id_set = history_id_word_map_[history_id];
856  history_id_word_map_.erase(history_id);
857
858  // Reconcile any changes to word usage.
859  for (WordIDSet::iterator word_id_iter = word_id_set.begin();
860       word_id_iter != word_id_set.end(); ++word_id_iter) {
861    WordID word_id = *word_id_iter;
862    word_id_history_map_[word_id].erase(history_id);
863    if (!word_id_history_map_[word_id].empty())
864      continue;  // The word is still in use.
865
866    // The word is no longer in use. Reconcile any changes to character usage.
867    base::string16 word = word_list_[word_id];
868    Char16Set characters = Char16SetFromString16(word);
869    for (Char16Set::iterator uni_char_iter = characters.begin();
870         uni_char_iter != characters.end(); ++uni_char_iter) {
871      char16 uni_char = *uni_char_iter;
872      char_word_map_[uni_char].erase(word_id);
873      if (char_word_map_[uni_char].empty())
874        char_word_map_.erase(uni_char);  // No longer in use.
875    }
876
877    // Complete the removal of references to the word.
878    word_id_history_map_.erase(word_id);
879    word_map_.erase(word);
880    word_list_[word_id] = base::string16();
881    available_words_.insert(word_id);
882  }
883}
884
885void URLIndexPrivateData::ResetSearchTermCache() {
886  for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
887       iter != search_term_cache_.end(); ++iter)
888    iter->second.used_ = false;
889}
890
891bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {
892  base::TimeTicks beginning_time = base::TimeTicks::Now();
893  InMemoryURLIndexCacheItem index_cache;
894  SavePrivateData(&index_cache);
895  std::string data;
896  if (!index_cache.SerializeToString(&data)) {
897    LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
898    return false;
899  }
900
901  int size = data.size();
902  if (file_util::WriteFile(file_path, data.c_str(), size) != size) {
903    LOG(WARNING) << "Failed to write " << file_path.value();
904    return false;
905  }
906  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
907                      base::TimeTicks::Now() - beginning_time);
908  return true;
909}
910
911void URLIndexPrivateData::SavePrivateData(
912    InMemoryURLIndexCacheItem* cache) const {
913  DCHECK(cache);
914  cache->set_last_rebuild_timestamp(
915      last_time_rebuilt_from_history_.ToInternalValue());
916  cache->set_version(saved_cache_version_);
917  // history_item_count_ is no longer used but rather than change the protobuf
918  // definition use a placeholder. This will go away with the switch to SQLite.
919  cache->set_history_item_count(0);
920  SaveWordList(cache);
921  SaveWordMap(cache);
922  SaveCharWordMap(cache);
923  SaveWordIDHistoryMap(cache);
924  SaveHistoryInfoMap(cache);
925  SaveWordStartsMap(cache);
926}
927
928void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
929  if (word_list_.empty())
930    return;
931  WordListItem* list_item = cache->mutable_word_list();
932  list_item->set_word_count(word_list_.size());
933  for (String16Vector::const_iterator iter = word_list_.begin();
934       iter != word_list_.end(); ++iter)
935    list_item->add_word(UTF16ToUTF8(*iter));
936}
937
938void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
939  if (word_map_.empty())
940    return;
941  WordMapItem* map_item = cache->mutable_word_map();
942  map_item->set_item_count(word_map_.size());
943  for (WordMap::const_iterator iter = word_map_.begin();
944       iter != word_map_.end(); ++iter) {
945    WordMapEntry* map_entry = map_item->add_word_map_entry();
946    map_entry->set_word(UTF16ToUTF8(iter->first));
947    map_entry->set_word_id(iter->second);
948  }
949}
950
951void URLIndexPrivateData::SaveCharWordMap(
952    InMemoryURLIndexCacheItem* cache) const {
953  if (char_word_map_.empty())
954    return;
955  CharWordMapItem* map_item = cache->mutable_char_word_map();
956  map_item->set_item_count(char_word_map_.size());
957  for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
958       iter != char_word_map_.end(); ++iter) {
959    CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
960    map_entry->set_char_16(iter->first);
961    const WordIDSet& word_id_set(iter->second);
962    map_entry->set_item_count(word_id_set.size());
963    for (WordIDSet::const_iterator set_iter = word_id_set.begin();
964         set_iter != word_id_set.end(); ++set_iter)
965      map_entry->add_word_id(*set_iter);
966  }
967}
968
969void URLIndexPrivateData::SaveWordIDHistoryMap(
970    InMemoryURLIndexCacheItem* cache) const {
971  if (word_id_history_map_.empty())
972    return;
973  WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
974  map_item->set_item_count(word_id_history_map_.size());
975  for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
976       iter != word_id_history_map_.end(); ++iter) {
977    WordIDHistoryMapEntry* map_entry =
978        map_item->add_word_id_history_map_entry();
979    map_entry->set_word_id(iter->first);
980    const HistoryIDSet& history_id_set(iter->second);
981    map_entry->set_item_count(history_id_set.size());
982    for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
983         set_iter != history_id_set.end(); ++set_iter)
984      map_entry->add_history_id(*set_iter);
985  }
986}
987
988void URLIndexPrivateData::SaveHistoryInfoMap(
989    InMemoryURLIndexCacheItem* cache) const {
990  if (history_info_map_.empty())
991    return;
992  HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
993  map_item->set_item_count(history_info_map_.size());
994  for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
995       iter != history_info_map_.end(); ++iter) {
996    HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
997    map_entry->set_history_id(iter->first);
998    const URLRow& url_row(iter->second.url_row);
999    // Note: We only save information that contributes to the index so there
1000    // is no need to save search_term_cache_ (not persistent).
1001    map_entry->set_visit_count(url_row.visit_count());
1002    map_entry->set_typed_count(url_row.typed_count());
1003    map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
1004    map_entry->set_url(url_row.url().spec());
1005    map_entry->set_title(UTF16ToUTF8(url_row.title()));
1006    const VisitInfoVector& visits(iter->second.visits);
1007    for (VisitInfoVector::const_iterator visit_iter = visits.begin();
1008         visit_iter != visits.end(); ++visit_iter) {
1009      HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();
1010      visit_info->set_visit_time(visit_iter->first.ToInternalValue());
1011      visit_info->set_transition_type(visit_iter->second);
1012    }
1013  }
1014}
1015
1016void URLIndexPrivateData::SaveWordStartsMap(
1017    InMemoryURLIndexCacheItem* cache) const {
1018  if (word_starts_map_.empty())
1019    return;
1020  // For unit testing: Enable saving of the cache as an earlier version to
1021  // allow testing of cache file upgrading in ReadFromFile().
1022  // TODO(mrossetti): Instead of intruding on production code with this kind of
1023  // test harness, save a copy of an older version cache with known results.
1024  // Implement this when switching the caching over to SQLite.
1025  if (saved_cache_version_ < 1)
1026    return;
1027
1028  WordStartsMapItem* map_item = cache->mutable_word_starts_map();
1029  map_item->set_item_count(word_starts_map_.size());
1030  for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
1031       iter != word_starts_map_.end(); ++iter) {
1032    WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
1033    map_entry->set_history_id(iter->first);
1034    const RowWordStarts& word_starts(iter->second);
1035    for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
1036         i != word_starts.url_word_starts_.end(); ++i)
1037      map_entry->add_url_word_starts(*i);
1038    for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
1039         i != word_starts.title_word_starts_.end(); ++i)
1040      map_entry->add_title_word_starts(*i);
1041  }
1042}
1043
1044bool URLIndexPrivateData::RestorePrivateData(
1045    const InMemoryURLIndexCacheItem& cache,
1046    const std::string& languages) {
1047  last_time_rebuilt_from_history_ =
1048      base::Time::FromInternalValue(cache.last_rebuild_timestamp());
1049  const base::TimeDelta rebuilt_ago =
1050      base::Time::Now() - last_time_rebuilt_from_history_;
1051  if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
1052      (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
1053    // Cache is more than a week old or, somehow, from some time in the future.
1054    // It's probably a good time to rebuild the index from history to
1055    // allow synced entries to now appear, expired entries to disappear, etc.
1056    // Allow one day in the future to make the cache not rebuild on simple
1057    // system clock changes such as time zone changes.
1058    return false;
1059  }
1060  if (cache.has_version()) {
1061    if (cache.version() < kCurrentCacheFileVersion) {
1062      // Don't try to restore an old format cache file.  (This will cause
1063      // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
1064      // from history.)
1065      return false;
1066    }
1067    restored_cache_version_ = cache.version();
1068  }
1069  return RestoreWordList(cache) && RestoreWordMap(cache) &&
1070      RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
1071      RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
1072}
1073
1074bool URLIndexPrivateData::RestoreWordList(
1075    const InMemoryURLIndexCacheItem& cache) {
1076  if (!cache.has_word_list())
1077    return false;
1078  const WordListItem& list_item(cache.word_list());
1079  uint32 expected_item_count = list_item.word_count();
1080  uint32 actual_item_count = list_item.word_size();
1081  if (actual_item_count == 0 || actual_item_count != expected_item_count)
1082    return false;
1083  const RepeatedPtrField<std::string>& words(list_item.word());
1084  for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
1085       iter != words.end(); ++iter)
1086    word_list_.push_back(UTF8ToUTF16(*iter));
1087  return true;
1088}
1089
1090bool URLIndexPrivateData::RestoreWordMap(
1091    const InMemoryURLIndexCacheItem& cache) {
1092  if (!cache.has_word_map())
1093    return false;
1094  const WordMapItem& list_item(cache.word_map());
1095  uint32 expected_item_count = list_item.item_count();
1096  uint32 actual_item_count = list_item.word_map_entry_size();
1097  if (actual_item_count == 0 || actual_item_count != expected_item_count)
1098    return false;
1099  const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
1100  for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
1101       iter != entries.end(); ++iter)
1102    word_map_[UTF8ToUTF16(iter->word())] = iter->word_id();
1103  return true;
1104}
1105
1106bool URLIndexPrivateData::RestoreCharWordMap(
1107    const InMemoryURLIndexCacheItem& cache) {
1108  if (!cache.has_char_word_map())
1109    return false;
1110  const CharWordMapItem& list_item(cache.char_word_map());
1111  uint32 expected_item_count = list_item.item_count();
1112  uint32 actual_item_count = list_item.char_word_map_entry_size();
1113  if (actual_item_count == 0 || actual_item_count != expected_item_count)
1114    return false;
1115  const RepeatedPtrField<CharWordMapEntry>&
1116      entries(list_item.char_word_map_entry());
1117  for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
1118       entries.begin(); iter != entries.end(); ++iter) {
1119    expected_item_count = iter->item_count();
1120    actual_item_count = iter->word_id_size();
1121    if (actual_item_count == 0 || actual_item_count != expected_item_count)
1122      return false;
1123    char16 uni_char = static_cast<char16>(iter->char_16());
1124    WordIDSet word_id_set;
1125    const RepeatedField<int32>& word_ids(iter->word_id());
1126    for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
1127         jiter != word_ids.end(); ++jiter)
1128      word_id_set.insert(*jiter);
1129    char_word_map_[uni_char] = word_id_set;
1130  }
1131  return true;
1132}
1133
1134bool URLIndexPrivateData::RestoreWordIDHistoryMap(
1135    const InMemoryURLIndexCacheItem& cache) {
1136  if (!cache.has_word_id_history_map())
1137    return false;
1138  const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
1139  uint32 expected_item_count = list_item.item_count();
1140  uint32 actual_item_count = list_item.word_id_history_map_entry_size();
1141  if (actual_item_count == 0 || actual_item_count != expected_item_count)
1142    return false;
1143  const RepeatedPtrField<WordIDHistoryMapEntry>&
1144      entries(list_item.word_id_history_map_entry());
1145  for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
1146       entries.begin(); iter != entries.end(); ++iter) {
1147    expected_item_count = iter->item_count();
1148    actual_item_count = iter->history_id_size();
1149    if (actual_item_count == 0 || actual_item_count != expected_item_count)
1150      return false;
1151    WordID word_id = iter->word_id();
1152    HistoryIDSet history_id_set;
1153    const RepeatedField<int64>& history_ids(iter->history_id());
1154    for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
1155         jiter != history_ids.end(); ++jiter) {
1156      history_id_set.insert(*jiter);
1157      AddToHistoryIDWordMap(*jiter, word_id);
1158    }
1159    word_id_history_map_[word_id] = history_id_set;
1160  }
1161  return true;
1162}
1163
1164bool URLIndexPrivateData::RestoreHistoryInfoMap(
1165    const InMemoryURLIndexCacheItem& cache) {
1166  if (!cache.has_history_info_map())
1167    return false;
1168  const HistoryInfoMapItem& list_item(cache.history_info_map());
1169  uint32 expected_item_count = list_item.item_count();
1170  uint32 actual_item_count = list_item.history_info_map_entry_size();
1171  if (actual_item_count == 0 || actual_item_count != expected_item_count)
1172    return false;
1173  const RepeatedPtrField<HistoryInfoMapEntry>&
1174      entries(list_item.history_info_map_entry());
1175  for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
1176       entries.begin(); iter != entries.end(); ++iter) {
1177    HistoryID history_id = iter->history_id();
1178    GURL url(iter->url());
1179    URLRow url_row(url, history_id);
1180    url_row.set_visit_count(iter->visit_count());
1181    url_row.set_typed_count(iter->typed_count());
1182    url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
1183    if (iter->has_title()) {
1184      base::string16 title(UTF8ToUTF16(iter->title()));
1185      url_row.set_title(title);
1186    }
1187    history_info_map_[history_id].url_row = url_row;
1188
1189    // Restore visits list.
1190    VisitInfoVector visits;
1191    visits.reserve(iter->visits_size());
1192    for (int i = 0; i < iter->visits_size(); ++i) {
1193      visits.push_back(std::make_pair(
1194          base::Time::FromInternalValue(iter->visits(i).visit_time()),
1195          static_cast<content::PageTransition>(iter->visits(i).
1196                                               transition_type())));
1197    }
1198    history_info_map_[history_id].visits = visits;
1199  }
1200  return true;
1201}
1202
1203bool URLIndexPrivateData::RestoreWordStartsMap(
1204    const InMemoryURLIndexCacheItem& cache,
1205    const std::string& languages) {
1206  // Note that this function must be called after RestoreHistoryInfoMap() has
1207  // been run as the word starts may have to be recalculated from the urls and
1208  // page titles.
1209  if (cache.has_word_starts_map()) {
1210    const WordStartsMapItem& list_item(cache.word_starts_map());
1211    uint32 expected_item_count = list_item.item_count();
1212    uint32 actual_item_count = list_item.word_starts_map_entry_size();
1213    if (actual_item_count == 0 || actual_item_count != expected_item_count)
1214      return false;
1215    const RepeatedPtrField<WordStartsMapEntry>&
1216        entries(list_item.word_starts_map_entry());
1217    for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
1218         entries.begin(); iter != entries.end(); ++iter) {
1219      HistoryID history_id = iter->history_id();
1220      RowWordStarts word_starts;
1221      // Restore the URL word starts.
1222      const RepeatedField<int32>& url_starts(iter->url_word_starts());
1223      for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
1224           jiter != url_starts.end(); ++jiter)
1225        word_starts.url_word_starts_.push_back(*jiter);
1226      // Restore the page title word starts.
1227      const RepeatedField<int32>& title_starts(iter->title_word_starts());
1228      for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
1229           jiter != title_starts.end(); ++jiter)
1230        word_starts.title_word_starts_.push_back(*jiter);
1231      word_starts_map_[history_id] = word_starts;
1232    }
1233  } else {
1234    // Since the cache did not contain any word starts we must rebuild then from
1235    // the URL and page titles.
1236    for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
1237         iter != history_info_map_.end(); ++iter) {
1238      RowWordStarts word_starts;
1239      const URLRow& row(iter->second.url_row);
1240      const base::string16& url = CleanUpUrlForMatching(row.url(), languages);
1241      String16VectorFromString16(url, false, &word_starts.url_word_starts_);
1242      const base::string16& title = CleanUpTitleForMatching(row.title());
1243      String16VectorFromString16(title, false, &word_starts.title_word_starts_);
1244      word_starts_map_[iter->first] = word_starts;
1245    }
1246  }
1247  return true;
1248}
1249
1250// static
1251bool URLIndexPrivateData::URLSchemeIsWhitelisted(
1252    const GURL& gurl,
1253    const std::set<std::string>& whitelist) {
1254  return whitelist.find(gurl.scheme()) != whitelist.end();
1255}
1256
1257
1258// SearchTermCacheItem ---------------------------------------------------------
1259
1260URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
1261    const WordIDSet& word_id_set,
1262    const HistoryIDSet& history_id_set)
1263    : word_id_set_(word_id_set),
1264      history_id_set_(history_id_set),
1265      used_(true) {}
1266
1267URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem()
1268    : used_(true) {}
1269
1270URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {}
1271
1272
1273// URLIndexPrivateData::AddHistoryMatch ----------------------------------------
1274
1275URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
1276    const URLIndexPrivateData& private_data,
1277    const std::string& languages,
1278    BookmarkService* bookmark_service,
1279    const base::string16& lower_string,
1280    const String16Vector& lower_terms,
1281    const base::Time now)
1282  : private_data_(private_data),
1283    languages_(languages),
1284    bookmark_service_(bookmark_service),
1285    lower_string_(lower_string),
1286    lower_terms_(lower_terms),
1287    now_(now) {}
1288
1289URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {}
1290
1291void URLIndexPrivateData::AddHistoryMatch::operator()(
1292    const HistoryID history_id) {
1293  HistoryInfoMap::const_iterator hist_pos =
1294      private_data_.history_info_map_.find(history_id);
1295  if (hist_pos != private_data_.history_info_map_.end()) {
1296    const URLRow& hist_item = hist_pos->second.url_row;
1297    const VisitInfoVector& visits = hist_pos->second.visits;
1298    WordStartsMap::const_iterator starts_pos =
1299        private_data_.word_starts_map_.find(history_id);
1300    DCHECK(starts_pos != private_data_.word_starts_map_.end());
1301    ScoredHistoryMatch match(hist_item, visits, languages_, lower_string_,
1302                             lower_terms_, starts_pos->second, now_,
1303                             bookmark_service_);
1304    if (match.raw_score() > 0)
1305      scored_matches_.push_back(match);
1306  }
1307}
1308
1309
1310// URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
1311
1312URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
1313    const HistoryInfoMap& history_info_map)
1314    : history_info_map_(history_info_map) {
1315}
1316
1317URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}
1318
1319bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
1320    const HistoryID h1,
1321    const HistoryID h2) {
1322  HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
1323  if (entry1 == history_info_map_.end())
1324    return false;
1325  HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
1326  if (entry2 == history_info_map_.end())
1327    return true;
1328  const URLRow& r1(entry1->second.url_row);
1329  const URLRow& r2(entry2->second.url_row);
1330  // First cut: typed count, visit count, recency.
1331  // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1332  // recently visited (within the last 12/24 hours) as highly important. Get
1333  // input from mpearson.
1334  if (r1.typed_count() != r2.typed_count())
1335    return (r1.typed_count() > r2.typed_count());
1336  if (r1.visit_count() != r2.visit_count())
1337    return (r1.visit_count() > r2.visit_count());
1338  return (r1.last_visit() > r2.last_visit());
1339}
1340
1341}  // namespace history
1342