// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/history/url_index_private_data.h"

#include <functional>
#include <iterator>
#include <limits>
#include <numeric>
#include <string>
#include <vector>

#include "base/basictypes.h"
#include "base/files/file_util.h"
#include "base/i18n/break_iterator.h"
#include "base/i18n/case_conversion.h"
#include "base/metrics/histogram.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "chrome/browser/history/history_database.h"
#include "chrome/browser/history/history_db_task.h"
#include "chrome/browser/history/history_service.h"
#include "chrome/browser/history/in_memory_url_index.h"
#include "components/bookmarks/browser/bookmark_utils.h"
#include "components/history/core/browser/history_client.h"
#include "net/base/net_util.h"

#if defined(USE_SYSTEM_PROTOBUF)
#include <google/protobuf/repeated_field.h>
#else
#include "third_party/protobuf/src/google/protobuf/repeated_field.h"
#endif

using google::protobuf::RepeatedField;
using google::protobuf::RepeatedPtrField;
using in_memory_url_index::InMemoryURLIndexCacheItem;

namespace {
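// Maximum number of recent visits remembered per indexed URL. Kept at least
// as large as ScoredHistoryMatch::kMaxVisitsToScore (enforced by DCHECKs
// below) so scoring always has enough visit data to work with.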
static const size_t kMaxVisitsToStoreInCache = 10u;
}  // anonymous namespace

namespace history {

typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;
typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;
typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry
    CharWordMapEntry;
typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
    WordIDHistoryMapItem;
typedef imui::
    InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
    WordIDHistoryMapEntry;
typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
    HistoryInfoMapEntry;
typedef imui::
    InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
    HistoryInfoMapEntry_VisitInfo;
typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem;
typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
    WordStartsMapEntry;


// Algorithm Functions --------------------------------------------------------

// Comparison function for sorting search terms by descending length.
bool LengthGreater(const base::string16& string_a,
                   const base::string16& string_b) {
  return string_a.length() > string_b.length();
}


// UpdateRecentVisitsFromHistoryDBTask ----------------------------------------

// HistoryDBTask used to update the recent visit data for a particular
// row from the history database.
class UpdateRecentVisitsFromHistoryDBTask : public HistoryDBTask {
 public:
  explicit UpdateRecentVisitsFromHistoryDBTask(
      URLIndexPrivateData* private_data,
      URLID url_id);

  virtual bool RunOnDBThread(HistoryBackend* backend,
                             history::HistoryDatabase* db) OVERRIDE;
  virtual void DoneRunOnMainThread() OVERRIDE;

 private:
  virtual ~UpdateRecentVisitsFromHistoryDBTask();

  // The URLIndexPrivateData that gets updated after the historyDB
  // task returns.
  URLIndexPrivateData* private_data_;
  // The ID of the URL to get visits for and then update.
  URLID url_id_;
  // Whether fetching the recent visits for the URL succeeded.
  bool succeeded_;
  // The most recent visits fetched from the history DB, handed to
  // private_data_ on the main thread for it to copy and store.
  VisitVector recent_visits_;

  DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
};

UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
    URLIndexPrivateData* private_data,
    URLID url_id)
    : private_data_(private_data),
      url_id_(url_id),
      succeeded_(false) {
}

bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
    HistoryBackend* backend,
    HistoryDatabase* db) {
  // Make sure the private data is going to get as many recent visits as
  // ScoredHistoryMatch::GetFrequency() hopes to use.
  DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
  succeeded_ = db->GetMostRecentVisitsForURL(url_id_,
                                             kMaxVisitsToStoreInCache,
                                             &recent_visits_);
  if (!succeeded_)
    recent_visits_.clear();
  return true;  // Always claim to be done; do not retry failures.
}

void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
  if (succeeded_)
    private_data_->UpdateRecentVisits(url_id_, recent_visits_);
}

UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
}


// URLIndexPrivateData --------------------------------------------------------

URLIndexPrivateData::URLIndexPrivateData()
    : restored_cache_version_(0),
      saved_cache_version_(kCurrentCacheFileVersion),
      pre_filter_item_count_(0),
      post_filter_item_count_(0),
      post_scoring_item_count_(0) {
}

ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
    base::string16 search_string,
    size_t cursor_position,
    size_t max_matches,
    const std::string& languages,
    HistoryClient* history_client) {
  // If cursor position is set and useful (not at either end of the
  // string), allow the search string to be broken at cursor position.
  // We do this by pretending there's a space where the cursor is.
  if ((cursor_position != base::string16::npos) &&
      (cursor_position < search_string.length()) &&
      (cursor_position > 0)) {
    search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
  }
  pre_filter_item_count_ = 0;
  post_filter_item_count_ = 0;
  post_scoring_item_count_ = 0;
  // The search string we receive may contain escaped characters. To narrow
  // the index we need individual, lower-cased words with escaping removed.
  // For the final filtering we need whitespace-separated substrings that may
  // still contain escaped characters.
  base::string16 lower_raw_string(base::i18n::ToLower(search_string));
  base::string16 lower_unescaped_string =
      net::UnescapeURLComponent(lower_raw_string,
          net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
  // Extract individual 'words' (as opposed to 'terms'; see below) from the
  // search string. When the user types "colspec=ID%20Mstone Release" we get
  // four 'words': "colspec", "id", "mstone" and "release".
  String16Vector lower_words(
      history::String16VectorFromString16(lower_unescaped_string, false, NULL));
  ScoredHistoryMatches scored_items;

  // Do nothing if we have indexed no words (probably because we've not been
  // initialized yet) or the search string has no words.
  if (word_list_.empty() || lower_words.empty()) {
    search_term_cache_.clear();  // Invalidate the term cache.
    return scored_items;
  }

  // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
  // approach.
  ResetSearchTermCache();

  HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);

  // Trim the candidate pool if it is large. Note that we do not filter out
  // items that do not contain the search terms as proper substrings -- doing
  // so is the performance-costly operation we are trying to avoid in order
  // to maintain omnibox responsiveness.
  const size_t kItemsToScoreLimit = 500;
  pre_filter_item_count_ = history_id_set.size();
  // If we trim the results set we do not want to cache the results for next
  // time as the user's ultimately desired result could easily be eliminated
  // in this early rough filter.
  bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
  if (was_trimmed) {
    HistoryIDVector history_ids;
    std::copy(history_id_set.begin(), history_id_set.end(),
              std::back_inserter(history_ids));
    // Trim down the set by sorting by typed-count, visit-count, and last
    // visit.
    HistoryItemFactorGreater item_factor_functor(history_info_map_);
    std::partial_sort(history_ids.begin(),
                      history_ids.begin() + kItemsToScoreLimit,
                      history_ids.end(),
                      item_factor_functor);
    history_id_set.clear();
    std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
              std::inserter(history_id_set, history_id_set.end()));
    post_filter_item_count_ = history_id_set.size();
  }

  // Pass over all of the candidates filtering out any without a proper
  // substring match, inserting those which pass in order by score. Note that
  // in this step we are using the raw search string complete with escaped
  // URL elements. When the user has specifically typed something akin to
  // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
  // specific substring appears in the URL or page title.

  // We call these 'terms' (as opposed to 'words'; see above) as in this case
  // we only want to break up the search string on 'true' whitespace rather
  // than escaped whitespace. When the user types "colspec=ID%20Mstone Release"
  // we get two 'terms': "colspec=id%20mstone" and "release".
  history::String16Vector lower_raw_terms;
  if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,
               &lower_raw_terms) == 0) {
    // Don't score matches when there are no terms to score against.  (It's
    // possible that the word break iterator that extracts words to search
    // for in the database allows some whitespace "words" whereas Tokenize
    // excludes a long list of whitespace.)  One could write a scoring
    // function that gives a reasonable order to matches when there
    // are no terms (i.e., all the words are some form of whitespace),
    // but this is such a rare edge case that it's not worth the time.
    return scored_items;
  }
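  // AddHistoryMatch is a functor that scores each history item it is applied
  // to; std::for_each returns a copy of that functor, from which the
  // accumulated matches are then retrieved via ScoredMatches().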
  scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
      AddHistoryMatch(*this, languages, history_client, lower_raw_string,
                      lower_raw_terms, base::Time::Now())).ScoredMatches();

  // Select and sort only the top |max_matches| results.
  if (scored_items.size() > max_matches) {
    std::partial_sort(scored_items.begin(),
                      scored_items.begin() + max_matches,
                      scored_items.end(),
                      ScoredHistoryMatch::MatchScoreGreater);
    scored_items.resize(max_matches);
  } else {
    std::sort(scored_items.begin(), scored_items.end(),
              ScoredHistoryMatch::MatchScoreGreater);
  }
  post_scoring_item_count_ = scored_items.size();

  if (was_trimmed) {
    search_term_cache_.clear();  // Invalidate the term cache.
  } else {
    // Remove any stale SearchTermCacheItems.
    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
         cache_iter != search_term_cache_.end(); ) {
      if (!cache_iter->second.used_)
        search_term_cache_.erase(cache_iter++);
      else
        ++cache_iter;
    }
  }

  return scored_items;
}

bool URLIndexPrivateData::UpdateURL(
    HistoryService* history_service,
    const URLRow& row,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist,
    base::CancelableTaskTracker* tracker) {
  // The row may or may not already be in our index. If it is not already
  // indexed and it qualifies then it gets indexed. If it is already
  // indexed and still qualifies then it gets updated, otherwise it
  // is deleted from the index.
  bool row_was_updated = false;
  URLID row_id = row.id();
  HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
  if (row_pos == history_info_map_.end()) {
    // This new row should be indexed if it qualifies.
    URLRow new_row(row);
    new_row.set_id(row_id);
    row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
                      IndexRow(NULL,
                               history_service,
                               new_row,
                               languages,
                               scheme_whitelist,
                               tracker);
  } else if (RowQualifiesAsSignificant(row, base::Time())) {
    // This indexed row still qualifies and will be re-indexed.
    // The URL won't have changed but the title, visit count, etc.
    // might have changed.
    URLRow& row_to_update = row_pos->second.url_row;
    bool title_updated = row_to_update.title() != row.title();
    if (row_to_update.visit_count() != row.visit_count() ||
        row_to_update.typed_count() != row.typed_count() ||
        row_to_update.last_visit() != row.last_visit() || title_updated) {
      row_to_update.set_visit_count(row.visit_count());
      row_to_update.set_typed_count(row.typed_count());
      row_to_update.set_last_visit(row.last_visit());
      // If something appears to have changed, update the recent visits
      // information.
      ScheduleUpdateRecentVisits(history_service, row_id, tracker);
      // While the URL is guaranteed to remain stable, the title may have
      // changed. If so, then update the index with the changed words.
      if (title_updated) {
        // Clear all words associated with this row and re-index both the
        // URL and title.
        RemoveRowWordsFromIndex(row_to_update);
        row_to_update.set_title(row.title());
        RowWordStarts word_starts;
        AddRowWordsToIndex(row_to_update, &word_starts, languages);
        word_starts_map_[row_id] = word_starts;
      }
      row_was_updated = true;
    }
  } else {
    // This indexed row no longer qualifies and will be de-indexed by
    // clearing all words associated with this row.
    RemoveRowFromIndex(row);
    row_was_updated = true;
  }
  if (row_was_updated)
    search_term_cache_.clear();  // This invalidates the cache.
  return row_was_updated;
}

void URLIndexPrivateData::UpdateRecentVisits(
    URLID url_id,
    const VisitVector& recent_visits) {
  HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);
  if (row_pos != history_info_map_.end()) {
    VisitInfoVector* visits = &row_pos->second.visits;
    visits->clear();
    const size_t size =
        std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
    visits->reserve(size);
    for (size_t i = 0; i < size; i++) {
      // Copy only the fields that |visits| needs from the VisitVector.
      visits->push_back(std::make_pair(recent_visits[i].visit_time,
                                       recent_visits[i].transition));
    }
  }
  // Else: Oddly, the URL doesn't seem to exist in the private index.
  // Ignore this update.  This can happen if, for instance, the user
  // removes the URL from URLIndexPrivateData before the historyDB call
  // returns.
}

void URLIndexPrivateData::ScheduleUpdateRecentVisits(
    HistoryService* history_service,
    URLID url_id,
    base::CancelableTaskTracker* tracker) {
  history_service->ScheduleDBTask(
      scoped_ptr<history::HistoryDBTask>(
          new UpdateRecentVisitsFromHistoryDBTask(this, url_id)), tracker);
}

// Helper functor for DeleteURL.
class HistoryInfoMapItemHasURL {
 public:
  explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}

  bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
    return item.second.url_row.url() == url_;
  }

 private:
  const GURL& url_;
};

bool URLIndexPrivateData::DeleteURL(const GURL& url) {
  // Find the matching entry in the history_info_map_.
  HistoryInfoMap::iterator pos = std::find_if(
      history_info_map_.begin(),
      history_info_map_.end(),
      HistoryInfoMapItemHasURL(url));
  if (pos == history_info_map_.end())
    return false;
  RemoveRowFromIndex(pos->second.url_row);
  search_term_cache_.clear();  // This invalidates the cache.
  return true;
}

// static
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
    const base::FilePath& file_path,
    const std::string& languages) {
  base::TimeTicks beginning_time = base::TimeTicks::Now();
  if (!base::PathExists(file_path))
    return NULL;
  std::string data;
  // If the cache file cannot be read then simply give up. This will cause us
  // to attempt to rebuild from the history database.
  if (!base::ReadFileToString(file_path, &data))
    return NULL;

  scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
  InMemoryURLIndexCacheItem index_cache;
  if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
    LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
                 << file_path.value();
    return restored_data;
  }

  if (!restored_data->RestorePrivateData(index_cache, languages))
    return NULL;

  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
                      base::TimeTicks::Now() - beginning_time);
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
                       restored_data->history_id_word_map_.size());
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
                             restored_data->word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
                             restored_data->char_word_map_.size());
  if (restored_data->Empty())
    return NULL;  // 'No data' is the same as a failed reload.
  return restored_data;
}

// static
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
    HistoryDatabase* history_db,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist) {
  if (!history_db)
    return NULL;

  base::TimeTicks beginning_time = base::TimeTicks::Now();

  scoped_refptr<URLIndexPrivateData> rebuilt_data(new URLIndexPrivateData);
  URLDatabase::URLEnumerator history_enum;
  if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
    return NULL;
  rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
  for (URLRow row; history_enum.GetNextURL(&row); ) {
    rebuilt_data->IndexRow(
        history_db, NULL, row, languages, scheme_whitelist, NULL);
  }

  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
                      base::TimeTicks::Now() - beginning_time);
  UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
                       rebuilt_data->history_id_word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
                             rebuilt_data->word_map_.size());
  UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
                             rebuilt_data->char_word_map_.size());
  return rebuilt_data;
}

// static
bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
    scoped_refptr<URLIndexPrivateData> private_data,
    const base::FilePath& file_path) {
  DCHECK(private_data.get());
  DCHECK(!file_path.empty());
  return private_data->SaveToFile(file_path);
}

scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
  scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
  data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
  data_copy->word_list_ = word_list_;
  data_copy->available_words_ = available_words_;
  data_copy->word_map_ = word_map_;
  data_copy->char_word_map_ = char_word_map_;
  data_copy->word_id_history_map_ = word_id_history_map_;
  data_copy->history_id_word_map_ = history_id_word_map_;
  data_copy->history_info_map_ = history_info_map_;
  data_copy->word_starts_map_ = word_starts_map_;
  return data_copy;
  // Not copied:
  //    search_term_cache_
  //    pre_filter_item_count_
  //    post_filter_item_count_
  //    post_scoring_item_count_
}

bool URLIndexPrivateData::Empty() const {
  return history_info_map_.empty();
}

void URLIndexPrivateData::Clear() {
  last_time_rebuilt_from_history_ = base::Time();
  word_list_.clear();
  available_words_.clear();
  word_map_.clear();
  char_word_map_.clear();
  word_id_history_map_.clear();
  history_id_word_map_.clear();
  history_info_map_.clear();
  word_starts_map_.clear();
}

URLIndexPrivateData::~URLIndexPrivateData() {}

HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
    const String16Vector& unsorted_words) {
  // Break the search string down into individual words, get the candidate
  // set for each word, and intersect those sets to get the final candidate
  // list. Note that a single 'term' from the user's perspective might be
  // a string like "http://www.somewebsite.com" which, from our perspective,
  // is four words: 'http', 'www', 'somewebsite', and 'com'.
  HistoryIDSet history_id_set;
  String16Vector words(unsorted_words);
  // Sort the words longest first, as longer words are likely to narrow down
  // the results more quickly. Also, single-character words are the most
  // expensive to process, so save them for last.
  std::sort(words.begin(), words.end(), LengthGreater);
  for (String16Vector::iterator iter = words.begin(); iter != words.end();
       ++iter) {
    base::string16 uni_word = *iter;
    HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
    if (term_history_set.empty()) {
      history_id_set.clear();
      break;
    }
    if (iter == words.begin()) {
      history_id_set.swap(term_history_set);
    } else {
      HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>(
          history_id_set, term_history_set);
      history_id_set.swap(new_history_id_set);
    }
  }
  return history_id_set;
}

HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
    const base::string16& term) {
  if (term.empty())
    return HistoryIDSet();

  // TODO(mrossetti): Consider optimizing for very common terms such as
  // 'http[s]', 'www', 'com', etc. Or collect the 100 most frequently
  // occurring words in the user's searches.

  size_t term_length = term.length();
  WordIDSet word_id_set;
  if (term_length > 1) {
    // See if this term or a prefix thereof is present in the cache.
    SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
         cache_iter != search_term_cache_.end(); ++cache_iter) {
      if (StartsWith(term, cache_iter->first, false) &&
          (best_prefix == search_term_cache_.end() ||
           cache_iter->first.length() > best_prefix->first.length()))
        best_prefix = cache_iter;
    }

    // If a prefix was found then determine the leftover characters to be used
    // for further refining the results from that prefix.
    Char16Set prefix_chars;
    base::string16 leftovers(term);
    if (best_prefix != search_term_cache_.end()) {
      // If the prefix is an exact match for the term then grab the cached
      // results and we're done.
      size_t prefix_length = best_prefix->first.length();
      if (prefix_length == term_length) {
        best_prefix->second.used_ = true;
        return best_prefix->second.history_id_set_;
      }

      // Otherwise we have a handy starting point.
      // If there are no history results for this prefix then we can bail early
      // as there will be no history results for the full term.
      if (best_prefix->second.history_id_set_.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      }
      word_id_set = best_prefix->second.word_id_set_;
      prefix_chars = Char16SetFromString16(best_prefix->first);
      leftovers = term.substr(prefix_length);
    }

    // Filter for each remaining, unique character in the term.
    Char16Set leftover_chars = Char16SetFromString16(leftovers);
    Char16Set unique_chars =
        base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);

    // Reduce the word set with any leftover, unprocessed characters.
    if (!unique_chars.empty()) {
      WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
      // We might come up empty on the leftovers.
      if (leftover_set.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      }
      // Or there may not have been a prefix from which to start.
      if (prefix_chars.empty()) {
        word_id_set.swap(leftover_set);
      } else {
        WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
            word_id_set, leftover_set);
        word_id_set.swap(new_word_id_set);
      }
    }

    // We must filter the word list because the resulting word set surely
    // contains words which do not contain the search term as a substring.
    for (WordIDSet::iterator word_set_iter = word_id_set.begin();
         word_set_iter != word_id_set.end(); ) {
      if (word_list_[*word_set_iter].find(term) == base::string16::npos)
        word_id_set.erase(word_set_iter++);
      else
        ++word_set_iter;
    }
  } else {
    word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
  }

  // If any words resulted then we can compose a set of history IDs by unioning
  // the sets from each word.
  HistoryIDSet history_id_set;
  if (!word_id_set.empty()) {
    for (WordIDSet::iterator word_id_iter = word_id_set.begin();
         word_id_iter != word_id_set.end(); ++word_id_iter) {
      WordID word_id = *word_id_iter;
      WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
      if (word_iter != word_id_history_map_.end()) {
        HistoryIDSet& word_history_id_set(word_iter->second);
        history_id_set.insert(word_history_id_set.begin(),
                              word_history_id_set.end());
      }
    }
  }

  // Record a new cache entry for this word if the term is longer than
  // a single character.
  if (term_length > 1)
    search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);

  return history_id_set;
}

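// Returns the set of word IDs whose words contain every character in
// |term_chars|, computed by intersecting the per-character word-ID sets from
// char_word_map_. An empty result means no indexed word contains them all.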
WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
    const Char16Set& term_chars) {
  WordIDSet word_id_set;
  for (Char16Set::const_iterator c_iter = term_chars.begin();
       c_iter != term_chars.end(); ++c_iter) {
    CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
    if (char_iter == char_word_map_.end()) {
      // A character was not found so there are no matching results: bail.
      word_id_set.clear();
      break;
    }
    WordIDSet& char_word_id_set(char_iter->second);
    // It is possible for there to no longer be any words associated with
    // a particular character. Give up in that case.
    if (char_word_id_set.empty()) {
      word_id_set.clear();
      break;
    }

    if (c_iter == term_chars.begin()) {
      // The results for the first character become the base set of results.
      word_id_set = char_word_id_set;
    } else {
      // Subsequent character results get intersected in.
      WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
          word_id_set, char_word_id_set);
      word_id_set.swap(new_word_id_set);
    }
  }
  return word_id_set;
}

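// Indexes one history row: rejects rows whose scheme is not whitelisted,
// strips the username/password from the URL, records the row in
// history_info_map_, indexes the URL and title words, and then either reads
// the row's recent visits directly from |history_db| or schedules an
// asynchronous fetch through |history_service|.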
bool URLIndexPrivateData::IndexRow(
    HistoryDatabase* history_db,
    HistoryService* history_service,
    const URLRow& row,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist,
    base::CancelableTaskTracker* tracker) {
  const GURL& gurl(row.url());

  // Index only URLs with a whitelisted scheme.
  if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
    return false;

  URLID row_id = row.id();
  // Strip out username and password before saving and indexing.
  base::string16 url(net::FormatUrl(gurl, languages,
      net::kFormatUrlOmitUsernamePassword,
      net::UnescapeRule::NONE,
      NULL, NULL, NULL));

  HistoryID history_id = static_cast<HistoryID>(row_id);
  DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());

  // Add the row for quick lookup in the history info store.
  URLRow new_row(GURL(url), row_id);
  new_row.set_visit_count(row.visit_count());
  new_row.set_typed_count(row.typed_count());
  new_row.set_last_visit(row.last_visit());
  new_row.set_title(row.title());
  history_info_map_[history_id].url_row = new_row;

  // Index the words contained in the URL and title of the row.
  RowWordStarts word_starts;
  AddRowWordsToIndex(new_row, &word_starts, languages);
  word_starts_map_[history_id] = word_starts;

  // Update the recent visits information or schedule the update
  // as appropriate.
  if (history_db) {
    // We'd like to check that we're on the history DB thread.
    // However, unittest code actually calls this on the UI thread.
    // So we don't do any thread checks.
    VisitVector recent_visits;
    // Make sure the private data is going to get as many recent visits as
    // ScoredHistoryMatch::GetFrequency() hopes to use.
    DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
    if (history_db->GetMostRecentVisitsForURL(row_id,
                                              kMaxVisitsToStoreInCache,
                                              &recent_visits))
      UpdateRecentVisits(row_id, recent_visits);
  } else {
    DCHECK(tracker);
    DCHECK(history_service);
    ScheduleUpdateRecentVisits(history_service, row_id, tracker);
  }

  return true;
}

void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row,
                                             RowWordStarts* word_starts,
                                             const std::string& languages) {
  HistoryID history_id = static_cast<HistoryID>(row.id());
  // Split URL into individual, unique words then add in the title words.
  const GURL& gurl(row.url());
  const base::string16& url =
      bookmarks::CleanUpUrlForMatching(gurl, languages, NULL);
  String16Set url_words = String16SetFromString16(url,
      word_starts ? &word_starts->url_word_starts_ : NULL);
  const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title());
  String16Set title_words = String16SetFromString16(title,
      word_starts ? &word_starts->title_word_starts_ : NULL);
  String16Set words = base::STLSetUnion<String16Set>(url_words, title_words);
  for (String16Set::iterator word_iter = words.begin();
       word_iter != words.end(); ++word_iter)
    AddWordToIndex(*word_iter, history_id);

  search_term_cache_.clear();  // Invalidate the term cache.
}

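// Associates |term| with |history_id|, either by updating the existing
// word's history set or by adding a brand-new word to the index.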
void URLIndexPrivateData::AddWordToIndex(const base::string16& term,
                                         HistoryID history_id) {
  WordMap::iterator word_pos = word_map_.find(term);
  if (word_pos != word_map_.end())
    UpdateWordHistory(word_pos->second, history_id);
  else
    AddWordHistory(term, history_id);
}

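// Adds a word that is not yet in the index. The word is given the next
// available slot in word_list_ (reusing a previously freed slot from
// available_words_ when possible), and the word-to-ID, word-ID-to-history,
// history-to-word-ID and character-to-word-ID maps are all updated.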
void URLIndexPrivateData::AddWordHistory(const base::string16& term,
                                         HistoryID history_id) {
  WordID word_id = word_list_.size();
  if (available_words_.empty()) {
    word_list_.push_back(term);
  } else {
    word_id = *(available_words_.begin());
    word_list_[word_id] = term;
    available_words_.erase(word_id);
  }
  word_map_[term] = word_id;

  HistoryIDSet history_id_set;
  history_id_set.insert(history_id);
  word_id_history_map_[word_id] = history_id_set;
  AddToHistoryIDWordMap(history_id, word_id);

  // For each character in the newly added word (i.e. a word that is not
  // already in the word index), add the word to the character index.
  Char16Set characters = Char16SetFromString16(term);
  for (Char16Set::iterator uni_char_iter = characters.begin();
       uni_char_iter != characters.end(); ++uni_char_iter) {
    base::char16 uni_char = *uni_char_iter;
    CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
    if (char_iter != char_word_map_.end()) {
      // Update existing entry in the char/word index.
      WordIDSet& word_id_set(char_iter->second);
      word_id_set.insert(word_id);
    } else {
      // Create a new entry in the char/word index.
      WordIDSet word_id_set;
      word_id_set.insert(word_id);
      char_word_map_[uni_char] = word_id_set;
    }
  }
}

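// Records that an already-indexed word also appears in the history item
// identified by |history_id|.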
void URLIndexPrivateData::UpdateWordHistory(WordID word_id,
                                            HistoryID history_id) {
  WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
  DCHECK(history_pos != word_id_history_map_.end());
  HistoryIDSet& history_id_set(history_pos->second);
  history_id_set.insert(history_id);
  AddToHistoryIDWordMap(history_id, word_id);
}

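// Adds |word_id| to the reverse map entry for |history_id|, creating the
// entry if this is the first word recorded for that history item.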
void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
                                                WordID word_id) {
  HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);
  if (iter != history_id_word_map_.end()) {
    WordIDSet& word_id_set(iter->second);
    word_id_set.insert(word_id);
  } else {
    WordIDSet word_id_set;
    word_id_set.insert(word_id);
    history_id_word_map_[history_id] = word_id_set;
  }
}

void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) {
  RemoveRowWordsFromIndex(row);
  HistoryID history_id = static_cast<HistoryID>(row.id());
  history_info_map_.erase(history_id);
  word_starts_map_.erase(history_id);
}

void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) {
  // Remove the entries in history_id_word_map_ and word_id_history_map_ for
  // this row.
  HistoryID history_id = static_cast<HistoryID>(row.id());
  WordIDSet word_id_set = history_id_word_map_[history_id];
  history_id_word_map_.erase(history_id);

  // Reconcile any changes to word usage.
  for (WordIDSet::iterator word_id_iter = word_id_set.begin();
       word_id_iter != word_id_set.end(); ++word_id_iter) {
    WordID word_id = *word_id_iter;
    word_id_history_map_[word_id].erase(history_id);
    if (!word_id_history_map_[word_id].empty())
      continue;  // The word is still in use.

    // The word is no longer in use. Reconcile any changes to character usage.
    base::string16 word = word_list_[word_id];
    Char16Set characters = Char16SetFromString16(word);
    for (Char16Set::iterator uni_char_iter = characters.begin();
         uni_char_iter != characters.end(); ++uni_char_iter) {
      base::char16 uni_char = *uni_char_iter;
      char_word_map_[uni_char].erase(word_id);
      if (char_word_map_[uni_char].empty())
        char_word_map_.erase(uni_char);  // No longer in use.
    }

    // Complete the removal of references to the word.
    word_id_history_map_.erase(word_id);
    word_map_.erase(word);
    word_list_[word_id] = base::string16();
    available_words_.insert(word_id);
  }
}

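// Marks every cached search term as unused; entries still unused after a
// query completes are swept out in HistoryItemsForTerms().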
void URLIndexPrivateData::ResetSearchTermCache() {
  for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
       iter != search_term_cache_.end(); ++iter)
    iter->second.used_ = false;
}

bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {
  base::TimeTicks beginning_time = base::TimeTicks::Now();
  InMemoryURLIndexCacheItem index_cache;
  SavePrivateData(&index_cache);
  std::string data;
  if (!index_cache.SerializeToString(&data)) {
    LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
    return false;
  }

  int size = data.size();
  if (base::WriteFile(file_path, data.c_str(), size) != size) {
    LOG(WARNING) << "Failed to write " << file_path.value();
    return false;
  }
  UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
                      base::TimeTicks::Now() - beginning_time);
  return true;
}

void URLIndexPrivateData::SavePrivateData(
    InMemoryURLIndexCacheItem* cache) const {
  DCHECK(cache);
  cache->set_last_rebuild_timestamp(
      last_time_rebuilt_from_history_.ToInternalValue());
  cache->set_version(saved_cache_version_);
  // history_item_count_ is no longer used, but rather than change the protobuf
  // definition we write a placeholder. This will go away with the switch to
  // SQLite.
  cache->set_history_item_count(0);
  SaveWordList(cache);
  SaveWordMap(cache);
  SaveCharWordMap(cache);
  SaveWordIDHistoryMap(cache);
  SaveHistoryInfoMap(cache);
  SaveWordStartsMap(cache);
}

void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
  if (word_list_.empty())
    return;
  WordListItem* list_item = cache->mutable_word_list();
  list_item->set_word_count(word_list_.size());
  for (String16Vector::const_iterator iter = word_list_.begin();
       iter != word_list_.end(); ++iter)
    list_item->add_word(base::UTF16ToUTF8(*iter));
}

void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
  if (word_map_.empty())
    return;
  WordMapItem* map_item = cache->mutable_word_map();
  map_item->set_item_count(word_map_.size());
  for (WordMap::const_iterator iter = word_map_.begin();
       iter != word_map_.end(); ++iter) {
    WordMapEntry* map_entry = map_item->add_word_map_entry();
    map_entry->set_word(base::UTF16ToUTF8(iter->first));
    map_entry->set_word_id(iter->second);
  }
}

void URLIndexPrivateData::SaveCharWordMap(
    InMemoryURLIndexCacheItem* cache) const {
  if (char_word_map_.empty())
    return;
  CharWordMapItem* map_item = cache->mutable_char_word_map();
  map_item->set_item_count(char_word_map_.size());
  for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
       iter != char_word_map_.end(); ++iter) {
    CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
    map_entry->set_char_16(iter->first);
    const WordIDSet& word_id_set(iter->second);
    map_entry->set_item_count(word_id_set.size());
    for (WordIDSet::const_iterator set_iter = word_id_set.begin();
         set_iter != word_id_set.end(); ++set_iter)
      map_entry->add_word_id(*set_iter);
  }
}

void URLIndexPrivateData::SaveWordIDHistoryMap(
    InMemoryURLIndexCacheItem* cache) const {
  if (word_id_history_map_.empty())
    return;
  WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
  map_item->set_item_count(word_id_history_map_.size());
  for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
       iter != word_id_history_map_.end(); ++iter) {
    WordIDHistoryMapEntry* map_entry =
        map_item->add_word_id_history_map_entry();
    map_entry->set_word_id(iter->first);
    const HistoryIDSet& history_id_set(iter->second);
    map_entry->set_item_count(history_id_set.size());
    for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
         set_iter != history_id_set.end(); ++set_iter)
      map_entry->add_history_id(*set_iter);
  }
}

void URLIndexPrivateData::SaveHistoryInfoMap(
    InMemoryURLIndexCacheItem* cache) const {
  if (history_info_map_.empty())
    return;
  HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
  map_item->set_item_count(history_info_map_.size());
  for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
       iter != history_info_map_.end(); ++iter) {
    HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
    map_entry->set_history_id(iter->first);
    const URLRow& url_row(iter->second.url_row);
    // Note: We only save information that contributes to the index so there
    // is no need to save search_term_cache_ (not persistent).
    map_entry->set_visit_count(url_row.visit_count());
    map_entry->set_typed_count(url_row.typed_count());
    map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
    map_entry->set_url(url_row.url().spec());
    map_entry->set_title(base::UTF16ToUTF8(url_row.title()));
    const VisitInfoVector& visits(iter->second.visits);
    for (VisitInfoVector::const_iterator visit_iter = visits.begin();
         visit_iter != visits.end(); ++visit_iter) {
      HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();
      visit_info->set_visit_time(visit_iter->first.ToInternalValue());
      visit_info->set_transition_type(visit_iter->second);
    }
  }
}

void URLIndexPrivateData::SaveWordStartsMap(
    InMemoryURLIndexCacheItem* cache) const {
  if (word_starts_map_.empty())
    return;
  // For unit testing: Enable saving of the cache as an earlier version to
  // allow testing of cache file upgrading in ReadFromFile().
  // TODO(mrossetti): Instead of intruding on production code with this kind of
  // test harness, save a copy of an older version cache with known results.
  // Implement this when switching the caching over to SQLite.
  if (saved_cache_version_ < 1)
    return;

  WordStartsMapItem* map_item = cache->mutable_word_starts_map();
  map_item->set_item_count(word_starts_map_.size());
  for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
       iter != word_starts_map_.end(); ++iter) {
    WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
    map_entry->set_history_id(iter->first);
    const RowWordStarts& word_starts(iter->second);
    for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
         i != word_starts.url_word_starts_.end(); ++i)
      map_entry->add_url_word_starts(*i);
    for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
         i != word_starts.title_word_starts_.end(); ++i)
      map_entry->add_title_word_starts(*i);
  }
}

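// Restores the index from |cache|. Gives up (returning false, which triggers
// a rebuild from the history database) if the cache is more than a week old,
// dated more than a day in the future, or written by an older file version;
// otherwise restores each section of the cache in turn.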
bool URLIndexPrivateData::RestorePrivateData(
    const InMemoryURLIndexCacheItem& cache,
    const std::string& languages) {
  last_time_rebuilt_from_history_ =
      base::Time::FromInternalValue(cache.last_rebuild_timestamp());
  const base::TimeDelta rebuilt_ago =
      base::Time::Now() - last_time_rebuilt_from_history_;
  if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
      (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
    // Cache is more than a week old or, somehow, from some time in the future.
    // It's probably a good time to rebuild the index from history to
    // allow synced entries to now appear, expired entries to disappear, etc.
    // Allow one day in the future to make the cache not rebuild on simple
    // system clock changes such as time zone changes.
    return false;
  }
  if (cache.has_version()) {
    if (cache.version() < kCurrentCacheFileVersion) {
      // Don't try to restore an old format cache file.  (This will cause
      // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
      // from history.)
      return false;
    }
    restored_cache_version_ = cache.version();
  }
  return RestoreWordList(cache) && RestoreWordMap(cache) &&
      RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
      RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
}

bool URLIndexPrivateData::RestoreWordList(
    const InMemoryURLIndexCacheItem& cache) {
  if (!cache.has_word_list())
    return false;
  const WordListItem& list_item(cache.word_list());
  uint32 expected_item_count = list_item.word_count();
  uint32 actual_item_count = list_item.word_size();
  if (actual_item_count == 0 || actual_item_count != expected_item_count)
    return false;
  const RepeatedPtrField<std::string>& words(list_item.word());
  for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
       iter != words.end(); ++iter)
    word_list_.push_back(base::UTF8ToUTF16(*iter));
  return true;
}

bool URLIndexPrivateData::RestoreWordMap(
    const InMemoryURLIndexCacheItem& cache) {
  if (!cache.has_word_map())
    return false;
  const WordMapItem& list_item(cache.word_map());
  uint32 expected_item_count = list_item.item_count();
  uint32 actual_item_count = list_item.word_map_entry_size();
  if (actual_item_count == 0 || actual_item_count != expected_item_count)
    return false;
  const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
  for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
       iter != entries.end(); ++iter)
    word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id();
  return true;
}

bool URLIndexPrivateData::RestoreCharWordMap(
    const InMemoryURLIndexCacheItem& cache) {
  if (!cache.has_char_word_map())
    return false;
  const CharWordMapItem& list_item(cache.char_word_map());
  uint32 expected_item_count = list_item.item_count();
  uint32 actual_item_count = list_item.char_word_map_entry_size();
  if (actual_item_count == 0 || actual_item_count != expected_item_count)
    return false;
  const RepeatedPtrField<CharWordMapEntry>&
      entries(list_item.char_word_map_entry());
  for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
       entries.begin(); iter != entries.end(); ++iter) {
    expected_item_count = iter->item_count();
    actual_item_count = iter->word_id_size();
    if (actual_item_count == 0 || actual_item_count != expected_item_count)
      return false;
    base::char16 uni_char = static_cast<base::char16>(iter->char_16());
    WordIDSet word_id_set;
    const RepeatedField<int32>& word_ids(iter->word_id());
    for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
         jiter != word_ids.end(); ++jiter)
      word_id_set.insert(*jiter);
    char_word_map_[uni_char] = word_id_set;
  }
  return true;
}

bool URLIndexPrivateData::RestoreWordIDHistoryMap(
    const InMemoryURLIndexCacheItem& cache) {
  if (!cache.has_word_id_history_map())
    return false;
  const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
  uint32 expected_item_count = list_item.item_count();
  uint32 actual_item_count = list_item.word_id_history_map_entry_size();
  if (actual_item_count == 0 || actual_item_count != expected_item_count)
    return false;
  const RepeatedPtrField<WordIDHistoryMapEntry>&
      entries(list_item.word_id_history_map_entry());
  for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
       entries.begin(); iter != entries.end(); ++iter) {
    expected_item_count = iter->item_count();
    actual_item_count = iter->history_id_size();
    if (actual_item_count == 0 || actual_item_count != expected_item_count)
      return false;
    WordID word_id = iter->word_id();
    HistoryIDSet history_id_set;
    const RepeatedField<int64>& history_ids(iter->history_id());
    for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
         jiter != history_ids.end(); ++jiter) {
      history_id_set.insert(*jiter);
      AddToHistoryIDWordMap(*jiter, word_id);
    }
    word_id_history_map_[word_id] = history_id_set;
  }
  return true;
}

bool URLIndexPrivateData::RestoreHistoryInfoMap(
    const InMemoryURLIndexCacheItem& cache) {
  if (!cache.has_history_info_map())
    return false;
  const HistoryInfoMapItem& list_item(cache.history_info_map());
  uint32 expected_item_count = list_item.item_count();
  uint32 actual_item_count = list_item.history_info_map_entry_size();
  if (actual_item_count == 0 || actual_item_count != expected_item_count)
    return false;
  const RepeatedPtrField<HistoryInfoMapEntry>&
      entries(list_item.history_info_map_entry());
  for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
       entries.begin(); iter != entries.end(); ++iter) {
    HistoryID history_id = iter->history_id();
    GURL url(iter->url());
    URLRow url_row(url, history_id);
    url_row.set_visit_count(iter->visit_count());
    url_row.set_typed_count(iter->typed_count());
    url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
    if (iter->has_title()) {
      base::string16 title(base::UTF8ToUTF16(iter->title()));
      url_row.set_title(title);
    }
    history_info_map_[history_id].url_row = url_row;

    // Restore visits list.
    VisitInfoVector visits;
    visits.reserve(iter->visits_size());
    for (int i = 0; i < iter->visits_size(); ++i) {
      visits.push_back(std::make_pair(
          base::Time::FromInternalValue(iter->visits(i).visit_time()),
          ui::PageTransitionFromInt(iter->visits(i).transition_type())));
    }
    history_info_map_[history_id].visits = visits;
  }
  return true;
}

bool URLIndexPrivateData::RestoreWordStartsMap(
    const InMemoryURLIndexCacheItem& cache,
    const std::string& languages) {
  // Note that this function must be called after RestoreHistoryInfoMap() has
  // been run, as the word starts may have to be recalculated from the URLs
  // and page titles.
  if (cache.has_word_starts_map()) {
    const WordStartsMapItem& list_item(cache.word_starts_map());
    uint32 expected_item_count = list_item.item_count();
    uint32 actual_item_count = list_item.word_starts_map_entry_size();
    if (actual_item_count == 0 || actual_item_count != expected_item_count)
      return false;
    const RepeatedPtrField<WordStartsMapEntry>&
        entries(list_item.word_starts_map_entry());
    for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
         entries.begin(); iter != entries.end(); ++iter) {
      HistoryID history_id = iter->history_id();
      RowWordStarts word_starts;
      // Restore the URL word starts.
      const RepeatedField<int32>& url_starts(iter->url_word_starts());
      for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
           jiter != url_starts.end(); ++jiter)
        word_starts.url_word_starts_.push_back(*jiter);
      // Restore the page title word starts.
      const RepeatedField<int32>& title_starts(iter->title_word_starts());
      for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
           jiter != title_starts.end(); ++jiter)
        word_starts.title_word_starts_.push_back(*jiter);
      word_starts_map_[history_id] = word_starts;
    }
  } else {
    // Since the cache did not contain any word starts we must rebuild them
    // from the URLs and page titles.
    for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
         iter != history_info_map_.end(); ++iter) {
      RowWordStarts word_starts;
      const URLRow& row(iter->second.url_row);
      const base::string16& url =
          bookmarks::CleanUpUrlForMatching(row.url(), languages, NULL);
      String16VectorFromString16(url, false, &word_starts.url_word_starts_);
      const base::string16& title =
          bookmarks::CleanUpTitleForMatching(row.title());
      String16VectorFromString16(title, false, &word_starts.title_word_starts_);
      word_starts_map_[iter->first] = word_starts;
    }
  }
  return true;
}

// static
bool URLIndexPrivateData::URLSchemeIsWhitelisted(
    const GURL& gurl,
    const std::set<std::string>& whitelist) {
  return whitelist.find(gurl.scheme()) != whitelist.end();
}


// SearchTermCacheItem --------------------------------------------------------

URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
    const WordIDSet& word_id_set,
    const HistoryIDSet& history_id_set)
    : word_id_set_(word_id_set),
      history_id_set_(history_id_set),
      used_(true) {}

URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem()
    : used_(true) {}

URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {}


// URLIndexPrivateData::AddHistoryMatch ---------------------------------------

URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
    const URLIndexPrivateData& private_data,
    const std::string& languages,
    HistoryClient* history_client,
    const base::string16& lower_string,
    const String16Vector& lower_terms,
    const base::Time now)
    : private_data_(private_data),
      languages_(languages),
      history_client_(history_client),
      lower_string_(lower_string),
      lower_terms_(lower_terms),
      now_(now) {
  // Calculate offsets for each term.  For instance, the offset for
  // ".net" should be 1, indicating that the actual word-part of the term
  // starts at offset 1.
  lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u);
  for (size_t i = 0; i < lower_terms_.size(); ++i) {
    base::i18n::BreakIterator iter(lower_terms_[i],
                                   base::i18n::BreakIterator::BREAK_WORD);
    // If the iterator doesn't work, assume an offset of 0.
    if (!iter.Init())
      continue;
    // Find the first word start.
    while (iter.Advance() && !iter.IsWord()) {}
    if (iter.IsWord())
      lower_terms_to_word_starts_offsets_[i] = iter.prev();
    // Else: the iterator didn't find a word break.  Assume an offset of 0.
  }
}

URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {}

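// Scores the history item identified by |history_id| against the search
// terms and keeps it only if its raw score is positive.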
void URLIndexPrivateData::AddHistoryMatch::operator()(
    const HistoryID history_id) {
  HistoryInfoMap::const_iterator hist_pos =
      private_data_.history_info_map_.find(history_id);
  if (hist_pos != private_data_.history_info_map_.end()) {
    const URLRow& hist_item = hist_pos->second.url_row;
    const VisitInfoVector& visits = hist_pos->second.visits;
    WordStartsMap::const_iterator starts_pos =
        private_data_.word_starts_map_.find(history_id);
    DCHECK(starts_pos != private_data_.word_starts_map_.end());
    ScoredHistoryMatch match(hist_item, visits, languages_, lower_string_,
                             lower_terms_, lower_terms_to_word_starts_offsets_,
                             starts_pos->second, now_, history_client_);
    if (match.raw_score() > 0)
      scored_matches_.push_back(match);
  }
}


// URLIndexPrivateData::HistoryItemFactorGreater ------------------------------

URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
    const HistoryInfoMap& history_info_map)
    : history_info_map_(history_info_map) {
}

URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}

bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
    const HistoryID h1,
    const HistoryID h2) {
  HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
  if (entry1 == history_info_map_.end())
    return false;
  HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
  if (entry2 == history_info_map_.end())
    return true;
  const URLRow& r1(entry1->second.url_row);
  const URLRow& r2(entry2->second.url_row);
  // First cut: typed count, visit count, recency.
  // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
  // recently visited (within the last 12/24 hours) as highly important. Get
  // input from mpearson.
  if (r1.typed_count() != r2.typed_count())
    return (r1.typed_count() > r2.typed_count());
  if (r1.visit_count() != r2.visit_count())
    return (r1.visit_count() > r2.visit_count());
  return (r1.last_visit() > r2.last_visit());
}

}  // namespace history