1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_
6#define CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_
7
8#include <set>
9#include <string>
10
11#include "base/files/file_path.h"
12#include "base/gtest_prod_util.h"
13#include "base/memory/ref_counted.h"
14#include "chrome/browser/history/history_service.h"
15#include "chrome/browser/history/in_memory_url_index_cache.pb.h"
16#include "chrome/browser/history/in_memory_url_index_types.h"
17#include "chrome/browser/history/scored_history_match.h"
18
19class HistoryQuickProviderTest;
20
21namespace in_memory_url_index {
22class InMemoryURLIndexCacheItem;
23}
24
25namespace history {
26
27namespace imui = in_memory_url_index;
28
29class HistoryClient;
30class HistoryDatabase;
31class InMemoryURLIndex;
32class RefCountedBool;
33
34// Current version of the cache file.
35static const int kCurrentCacheFileVersion = 5;
36
37// A structure private to InMemoryURLIndex describing its internal data and
38// providing for restoring, rebuilding and updating that internal data. As
39// this class is for exclusive use by the InMemoryURLIndex class there should
40// be no calls from any other class.
41//
42// All public member functions are called on the main thread unless otherwise
43// annotated.
44class URLIndexPrivateData
45    : public base::RefCountedThreadSafe<URLIndexPrivateData> {
46 public:
47  URLIndexPrivateData();
48
49  // Given a base::string16 in |term_string|, scans the history index and
50  // returns a vector with all scored, matching history items. The
51  // |term_string| is broken down into individual terms (words), each of which
52  // must occur in the candidate history item's URL or page title for the item
53  // to qualify; however, the terms do not necessarily have to be adjacent. We
54  // also allow breaking |term_string| at |cursor_position| (if
55  // set). Once we have a set of candidates, they are filtered to ensure
56  // that all |term_string| terms, as separated by whitespace and the
57  // cursor (if set), occur within the candidate's URL or page title.
58  // Scores are then calculated on no more than |kItemsToScoreLimit|
59  // candidates, as the scoring of such a large number of candidates may
60  // cause perceptible typing response delays in the omnibox. This is
61  // likely to occur for short omnibox terms such as 'h' and 'w' which
62  // will be found in nearly all history candidates. Results are sorted by
63  // descending score. The full results set (i.e. beyond the
64  // |kItemsToScoreLimit| limit) will be retained and used for subsequent calls
65  // to this function. |history_client| is used to boost a result's score if
66  // its URL is referenced by one or more of the user's bookmarks.  |languages|
67  // is used to help parse/format the URLs in the history index.  In total,
68  // |max_matches| of items will be returned in the |ScoredHistoryMatches|
69  // vector.
70  ScoredHistoryMatches HistoryItemsForTerms(base::string16 term_string,
71                                            size_t cursor_position,
72                                            size_t max_matches,
73                                            const std::string& languages,
74                                            HistoryClient* history_client);
75
76  // Adds the history item in |row| to the index if it does not already already
77  // exist and it meets the minimum 'quick' criteria. If the row already exists
78  // in the index then the index will be updated if the row still meets the
79  // criteria, otherwise the row will be removed from the index. Returns true
80  // if the index was actually updated. |languages| gives a list of language
81  // encodings by which the URLs and page titles are broken down into words and
82  // characters. |scheme_whitelist| is used to filter non-qualifying schemes.
83  // |history_service| is used to schedule an update to the recent visits
84  // component of this URL's entry in the index.
85  bool UpdateURL(HistoryService* history_service,
86                 const URLRow& row,
87                 const std::string& languages,
88                 const std::set<std::string>& scheme_whitelist,
89                 base::CancelableTaskTracker* tracker);
90
91  // Updates the entry for |url_id| in the index, replacing its
92  // recent visits information with |recent_visits|.  If |url_id|
93  // is not in the index, does nothing.
94  void UpdateRecentVisits(URLID url_id,
95                          const VisitVector& recent_visits);
96
97  // Using |history_service| schedules an update (using the historyDB
98  // thread) for the recent visits information for |url_id|.  Unless
99  // something unexpectedly goes wrong, UdpateRecentVisits() should
100  // eventually be called from a callback.
101  void ScheduleUpdateRecentVisits(HistoryService* history_service,
102                                  URLID url_id,
103                                  base::CancelableTaskTracker* tracker);
104
105  // Deletes index data for the history item with the given |url|.
106  // The item may not have actually been indexed, which is the case if it did
107  // not previously meet minimum 'quick' criteria. Returns true if the index
108  // was actually updated.
109  bool DeleteURL(const GURL& url);
110
111  // Constructs a new object by restoring its contents from the cache file
112  // at |path|. Returns the new URLIndexPrivateData which on success will
113  // contain the restored data but upon failure will be empty.  |languages|
114  // is used to break URLs and page titles into words.  This function
115  // should be run on the the file thread.
116  static scoped_refptr<URLIndexPrivateData> RestoreFromFile(
117      const base::FilePath& path,
118      const std::string& languages);
119
120  // Constructs a new object by rebuilding its contents from the history
121  // database in |history_db|. Returns the new URLIndexPrivateData which on
122  // success will contain the rebuilt data but upon failure will be empty.
123  // |languages| gives a list of language encodings by which the URLs and page
124  // titles are broken down into words and characters.
125  static scoped_refptr<URLIndexPrivateData> RebuildFromHistory(
126      HistoryDatabase* history_db,
127      const std::string& languages,
128      const std::set<std::string>& scheme_whitelist);
129
130  // Writes |private_data| as a cache file to |file_path| and returns success.
131  static bool WritePrivateDataToCacheFileTask(
132      scoped_refptr<URLIndexPrivateData> private_data,
133      const base::FilePath& file_path);
134
135  // Creates a copy of ourself.
136  scoped_refptr<URLIndexPrivateData> Duplicate() const;
137
138  // Returns true if there is no data in the index.
139  bool Empty() const;
140
141  // Initializes all index data members in preparation for restoring the index
142  // from the cache or a complete rebuild from the history database.
143  void Clear();
144
145 private:
146  friend class base::RefCountedThreadSafe<URLIndexPrivateData>;
147  ~URLIndexPrivateData();
148
149  friend class AddHistoryMatch;
150  friend class ::HistoryQuickProviderTest;
151  friend class InMemoryURLIndexTest;
152  FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore);
153  FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, HugeResultSet);
154  FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, ReadVisitsFromHistory);
155  FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, RebuildFromHistoryIfCacheOld);
156  FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring);
157  FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch);
158  FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching);
159  FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, WhitelistedURLs);
160  FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization);
161
162  // Support caching of term results so that we can optimize searches which
163  // build upon a previous search. Each entry in this map represents one
164  // search term from the most recent search. For example, if the user had
165  // typed "google blog trans" and then typed an additional 'l' (at the end,
166  // of course) then there would be four items in the cache: 'blog', 'google',
167  // 'trans', and 'transl'. All would be marked as being in use except for the
168  // 'trans' item; its cached data would have been used when optimizing the
169  // construction of the search results candidates for 'transl' but then would
170  // no longer needed.
171  //
172  // Items stored in the search term cache. If a search term exactly matches one
173  // in the cache then we can quickly supply the proper |history_id_set_| (and
174  // marking the cache item as being |used_|. If we find a prefix for a search
175  // term in the cache (which is very likely to occur as the user types each
176  // term into the omnibox) then we can short-circuit the index search for those
177  // characters in the prefix by returning the |word_id_set|. In that case we do
178  // not mark the item as being |used_|.
179  struct SearchTermCacheItem {
180    SearchTermCacheItem(const WordIDSet& word_id_set,
181                        const HistoryIDSet& history_id_set);
182    // Creates a cache item for a term which has no results.
183    SearchTermCacheItem();
184
185    ~SearchTermCacheItem();
186
187    WordIDSet word_id_set_;
188    HistoryIDSet history_id_set_;
189    bool used_;  // True if this item has been used for the current term search.
190  };
191  typedef std::map<base::string16, SearchTermCacheItem> SearchTermCacheMap;
192
193  // A helper class which performs the final filter on each candidate
194  // history URL match, inserting accepted matches into |scored_matches_|.
195  class AddHistoryMatch : public std::unary_function<HistoryID, void> {
196   public:
197    AddHistoryMatch(const URLIndexPrivateData& private_data,
198                    const std::string& languages,
199                    HistoryClient* history_client,
200                    const base::string16& lower_string,
201                    const String16Vector& lower_terms,
202                    const base::Time now);
203    ~AddHistoryMatch();
204
205    void operator()(const HistoryID history_id);
206
207    ScoredHistoryMatches ScoredMatches() const { return scored_matches_; }
208
209   private:
210    const URLIndexPrivateData& private_data_;
211    const std::string& languages_;
212    HistoryClient* history_client_;
213    ScoredHistoryMatches scored_matches_;
214    const base::string16& lower_string_;
215    const String16Vector& lower_terms_;
216    WordStarts lower_terms_to_word_starts_offsets_;
217    const base::Time now_;
218  };
219
220  // A helper predicate class used to filter excess history items when the
221  // candidate results set is too large.
222  class HistoryItemFactorGreater
223      : public std::binary_function<HistoryID, HistoryID, void> {
224   public:
225    explicit HistoryItemFactorGreater(const HistoryInfoMap& history_info_map);
226    ~HistoryItemFactorGreater();
227
228    bool operator()(const HistoryID h1, const HistoryID h2);
229
230   private:
231    const history::HistoryInfoMap& history_info_map_;
232  };
233
234  // URL History indexing support functions.
235
236  // Composes a set of history item IDs by intersecting the set for each word
237  // in |unsorted_words|.
238  HistoryIDSet HistoryIDSetFromWords(const String16Vector& unsorted_words);
239
240  // Helper function to HistoryIDSetFromWords which composes a set of history
241  // ids for the given term given in |term|.
242  HistoryIDSet HistoryIDsForTerm(const base::string16& term);
243
244  // Given a set of Char16s, finds words containing those characters.
245  WordIDSet WordIDSetForTermChars(const Char16Set& term_chars);
246
247  // Indexes one URL history item as described by |row|. Returns true if the
248  // row was actually indexed. |languages| gives a list of language encodings by
249  // which the URLs and page titles are broken down into words and characters.
250  // |scheme_whitelist| is used to filter non-qualifying schemes.  If
251  // |history_db| is not NULL then this function uses the history database
252  // synchronously to get the URL's recent visits information.  This mode should
253  // only be used on the historyDB thread.  If |history_db| is NULL, then
254  // this function uses |history_service| to schedule a task on the
255  // historyDB thread to fetch and update the recent visits
256  // information.
257  bool IndexRow(HistoryDatabase* history_db,
258                HistoryService* history_service,
259                const URLRow& row,
260                const std::string& languages,
261                const std::set<std::string>& scheme_whitelist,
262                base::CancelableTaskTracker* tracker);
263
264  // Parses and indexes the words in the URL and page title of |row| and
265  // calculate the word starts in each, saving the starts in |word_starts|.
266  // |languages| gives a list of language encodings by which the URLs and page
267  // titles are broken down into words and characters.
268  void AddRowWordsToIndex(const URLRow& row,
269                          RowWordStarts* word_starts,
270                          const std::string& languages);
271
272  // Given a single word in |uni_word|, adds a reference for the containing
273  // history item identified by |history_id| to the index.
274  void AddWordToIndex(const base::string16& uni_word, HistoryID history_id);
275
276  // Creates a new entry in the word/history map for |word_id| and add
277  // |history_id| as the initial element of the word's set.
278  void AddWordHistory(const base::string16& uni_word, HistoryID history_id);
279
280  // Updates an existing entry in the word/history index by adding the
281  // |history_id| to set for |word_id| in the word_id_history_map_.
282  void UpdateWordHistory(WordID word_id, HistoryID history_id);
283
284  // Adds |word_id| to |history_id|'s entry in the history/word map,
285  // creating a new entry if one does not already exist.
286  void AddToHistoryIDWordMap(HistoryID history_id, WordID word_id);
287
288  // Removes |row| and all associated words and characters from the index.
289  void RemoveRowFromIndex(const URLRow& row);
290
291  // Removes all words and characters associated with |row| from the index.
292  void RemoveRowWordsFromIndex(const URLRow& row);
293
294  // Clears |used_| for each item in the search term cache.
295  void ResetSearchTermCache();
296
297  // Caches the index private data and writes the cache file to the profile
298  // directory.  Called by WritePrivateDataToCacheFileTask.
299  bool SaveToFile(const base::FilePath& file_path);
300
301  // Encode a data structure into the protobuf |cache|.
302  void SavePrivateData(imui::InMemoryURLIndexCacheItem* cache) const;
303  void SaveWordList(imui::InMemoryURLIndexCacheItem* cache) const;
304  void SaveWordMap(imui::InMemoryURLIndexCacheItem* cache) const;
305  void SaveCharWordMap(imui::InMemoryURLIndexCacheItem* cache) const;
306  void SaveWordIDHistoryMap(imui::InMemoryURLIndexCacheItem* cache) const;
307  void SaveHistoryInfoMap(imui::InMemoryURLIndexCacheItem* cache) const;
308  void SaveWordStartsMap(imui::InMemoryURLIndexCacheItem* cache) const;
309
310  // Decode a data structure from the protobuf |cache|. Return false if there
311  // is any kind of failure. |languages| will be used to break URLs and page
312  // titles into words
313  bool RestorePrivateData(const imui::InMemoryURLIndexCacheItem& cache,
314                          const std::string& languages);
315  bool RestoreWordList(const imui::InMemoryURLIndexCacheItem& cache);
316  bool RestoreWordMap(const imui::InMemoryURLIndexCacheItem& cache);
317  bool RestoreCharWordMap(const imui::InMemoryURLIndexCacheItem& cache);
318  bool RestoreWordIDHistoryMap(const imui::InMemoryURLIndexCacheItem& cache);
319  bool RestoreHistoryInfoMap(const imui::InMemoryURLIndexCacheItem& cache);
320  bool RestoreWordStartsMap(const imui::InMemoryURLIndexCacheItem& cache,
321                            const std::string& languages);
322
323  // Determines if |gurl| has a whitelisted scheme and returns true if so.
324  static bool URLSchemeIsWhitelisted(const GURL& gurl,
325                                     const std::set<std::string>& whitelist);
326
327  // Cache of search terms.
328  SearchTermCacheMap search_term_cache_;
329
330  // Start of data members that are cached -------------------------------------
331
332  // The version of the cache file most recently used to restore this instance
333  // of the private data. If the private data was rebuilt from the history
334  // database this will be 0.
335  int restored_cache_version_;
336
337  // The last time the data was rebuilt from the history database.
338  base::Time last_time_rebuilt_from_history_;
339
340  // A list of all of indexed words. The index of a word in this list is the
341  // ID of the word in the word_map_. It reduces the memory overhead by
342  // replacing a potentially long and repeated string with a simple index.
343  String16Vector word_list_;
344
345  // A list of available words slots in |word_list_|. An available word slot
346  // is the index of a unused word in word_list_ vector, also referred to as
347  // a WordID. As URL visits are added or modified new words may be added to
348  // the index, in which case any available words are used, if any, and then
349  // words are added to the end of the word_list_. When URL visits are
350  // modified or deleted old words may be removed from the index, in which
351  // case the slots for those words are added to available_words_ for resuse
352  // by future URL updates.
353  WordIDSet available_words_;
354
355  // A one-to-one mapping from the a word string to its slot number (i.e.
356  // WordID) in the |word_list_|.
357  WordMap word_map_;
358
359  // A one-to-many mapping from a single character to all WordIDs of words
360  // containing that character.
361  CharWordIDMap char_word_map_;
362
363  // A one-to-many mapping from a WordID to all HistoryIDs (the row_id as
364  // used in the history database) of history items in which the word occurs.
365  WordIDHistoryMap word_id_history_map_;
366
367  // A one-to-many mapping from a HistoryID to all WordIDs of words that occur
368  // in the URL and/or page title of the history item referenced by that
369  // HistoryID.
370  HistoryIDWordMap history_id_word_map_;
371
372  // A one-to-one mapping from HistoryID to the history item data governing
373  // index inclusion and relevance scoring.
374  HistoryInfoMap history_info_map_;
375
376  // A one-to-one mapping from HistoryID to the word starts detected in each
377  // item's URL and page title.
378  WordStartsMap word_starts_map_;
379
380  // End of data members that are cached ---------------------------------------
381
382  // For unit testing only. Specifies the version of the cache file to be saved.
383  // Used only for testing upgrading of an older version of the cache upon
384  // restore.
385  int saved_cache_version_;
386
387  // Used for unit testing only. Records the number of candidate history items
388  // at three stages in the index searching process.
389  size_t pre_filter_item_count_;    // After word index is queried.
390  size_t post_filter_item_count_;   // After trimming large result set.
391  size_t post_scoring_item_count_;  // After performing final filter/scoring.
392};
393
394}  // namespace history
395
396#endif  // CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_
397