1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
6#define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
7#pragma once
8
9#include <functional>
10#include <map>
11#include <set>
12#include <string>
13#include <vector>
14
15#include "app/sql/connection.h"
16#include "base/basictypes.h"
17#include "base/file_path.h"
18#include "base/gtest_prod_util.h"
19#include "base/memory/linked_ptr.h"
20#include "base/memory/scoped_ptr.h"
21#include "base/string16.h"
22#include "chrome/browser/autocomplete/autocomplete_match.h"
23#include "chrome/browser/autocomplete/history_provider_util.h"
24#include "chrome/browser/history/history_types.h"
25#include "chrome/browser/history/in_memory_url_index_cache.pb.h"
26#include "testing/gtest/include/gtest/gtest_prod.h"
27
28class Profile;
29
30namespace base {
31class Time;
32}
33
34namespace in_memory_url_index {
35class InMemoryURLIndexCacheItem;
36}
37
38namespace history {
39
40namespace imui = in_memory_url_index;
41
42class URLDatabase;
43
44// Specifies where an omnibox term occurs within a string. Used for specifying
45// highlights in AutocompleteMatches (ACMatchClassifications) and to assist in
46// scoring a result.
47struct TermMatch {
48  TermMatch(int term_num, size_t offset, size_t length)
49      : term_num(term_num),
50        offset(offset),
51        length(length) {}
52
53  int term_num;  // The index of the term in the original search string.
54  size_t offset;  // The starting offset of the substring match.
55  size_t length;  // The length of the substring match.
56};
57typedef std::vector<TermMatch> TermMatches;
58
59// Used for intermediate history result operations.
60struct ScoredHistoryMatch : public HistoryMatch {
61  ScoredHistoryMatch();  // Required by STL.
62  explicit ScoredHistoryMatch(const URLRow& url_info);
63  ~ScoredHistoryMatch();
64
65  static bool MatchScoreGreater(const ScoredHistoryMatch& m1,
66                                const ScoredHistoryMatch& m2);
67
68  // An interim score taking into consideration location and completeness
69  // of the match.
70  int raw_score;
71  TermMatches url_matches;  // Term matches within the URL.
72  TermMatches title_matches;  // Term matches within the page title.
73  size_t prefix_adjust;  // The length of a prefix which should be ignored.
74};
75typedef std::vector<ScoredHistoryMatch> ScoredHistoryMatches;
76
77// The URL history source.
78// Holds portions of the URL database in memory in an indexed form.  Used to
79// quickly look up matching URLs for a given query string.  Used by
80// the HistoryURLProvider for inline autocomplete and to provide URL
81// matches to the omnibox.
82//
83// Note about multi-byte codepoints and the data structures in the
84// InMemoryURLIndex class: One will quickly notice that no effort is made to
85// insure that multi-byte character boundaries are detected when indexing the
86// words and characters in the URL history database except when converting
87// URL strings to lowercase. Multi-byte-edness makes no difference when
88// indexing or when searching the index as the final filtering of results
89// is dependent on the comparison of a string of bytes, not individual
90// characters. While the lookup of those bytes during a search in the
91// |char_word_map_| could serve up words in which the individual char16
92// occurs as a portion of a composite character the next filtering step
93// will eliminate such words except in the case where a single character
94// is being searched on and which character occurs as the second char16 of a
95// multi-char16 instance.
96class InMemoryURLIndex {
97 public:
98  // |history_dir| is a path to the directory containing the history database
99  // within the profile wherein the cache and transaction journals will be
100  // stored.
101  explicit InMemoryURLIndex(const FilePath& history_dir);
102  ~InMemoryURLIndex();
103
104  // Convenience types
105  typedef std::vector<string16> String16Vector;
106
107  // Opens and indexes the URL history database.
108  // |languages| gives a list of language encodings with which the history
109  // URLs and omnibox searches are interpreted, i.e. when each is broken
110  // down into words and each word is broken down into characters.
111  bool Init(URLDatabase* history_db, const std::string& languages);
112
113  // Reloads the history index. Attempts to reload from the cache unless
114  // |clear_cache| is true. If the cache is unavailable then reload the
115  // index from |history_db|.
116  bool ReloadFromHistory(URLDatabase* history_db, bool clear_cache);
117
118  // Signals that any outstanding initialization should be canceled and
119  // flushes the cache to disk.
120  void ShutDown();
121
122  // Restores the index's private data from the cache file stored in the
123  // profile directory and returns true if successful.
124  bool RestoreFromCacheFile();
125
126  // Caches the index private data and writes the cache file to the profile
127  // directory.
128  bool SaveToCacheFile();
129
130  // Given a vector containing one or more words as string16s, scans the
131  // history index and return a vector with all scored, matching history items.
132  // Each term must occur somewhere in the history item's URL or page title for
133  // the item to qualify; however, the terms do not necessarily have to be
134  // adjacent. Results are sorted with higher scoring items first. Each term
135  // from |terms| may contain punctuation but should not contain spaces.
136  // A search request which results in more than |kItemsToScoreLimit| total
137  // candidate items returns no matches (though the results set will be
138  // retained and used for subsequent calls to this function) as the scoring
139  // of such a large number of candidates may cause perceptible typing response
140  // delays in the omnibox. This is likely to occur for short omnibox terms
141  // such as 'h' and 'w' which will be found in nearly all history candidates.
142  ScoredHistoryMatches HistoryItemsForTerms(const String16Vector& terms);
143
144  // Updates or adds an history item to the index if it meets the minimum
145  // 'quick' criteria.
146  void UpdateURL(URLID row_id, const URLRow& row);
147
148  // Deletes indexing data for an history item. The item may not have actually
149  // been indexed (which is the case if it did not previously meet minimum
150  // 'quick' criteria).
151  void DeleteURL(URLID row_id);
152
153  // Breaks the |uni_string| string down into individual words and return
154  // a vector with the individual words in their original order. If
155  // |break_on_space| is false then the resulting list will contain only words
156  // containing alpha-numeric characters. If |break_on_space| is true then the
157  // resulting list will contain strings broken at whitespace.
158  //
159  // Example:
160  //   Given: |uni_string|: "http://www.google.com/ harry the rabbit."
161  //   With |break_on_space| false the returned list will contain:
162  //    "http", "www", "google", "com", "harry", "the", "rabbit"
163  //   With |break_on_space| true the returned list will contain:
164  //    "http://", "www.google.com/", "harry", "the", "rabbit."
165  static String16Vector WordVectorFromString16(const string16& uni_string,
166                                               bool break_on_space);
167
168  // Extract and return the offsets from |matches|.
169  static std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches);
170
171  // Replace the offsets in |matches| with those given in |offsets|, deleting
172  // any which are npos, and return the updated list of matches.
173  static TermMatches ReplaceOffsetsInTermMatches(
174      const TermMatches& matches,
175      const std::vector<size_t>& offsets);
176
177 private:
178  friend class AddHistoryMatch;
179  FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization);
180  FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheFilePath);
181  FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore);
182  FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Char16Utilities);
183  FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring);
184  FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, StaticFunctions);
185  FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch);
186  FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching);
187
188  // Signals that there are no previously cached results for the typed term.
189  static const size_t kNoCachedResultForTerm;
190
191  // Creating one of me without a history path is not allowed (tests excepted).
192  InMemoryURLIndex();
193
194  // Convenience types.
195  typedef std::set<string16> String16Set;
196  typedef std::set<char16> Char16Set;
197  typedef std::vector<char16> Char16Vector;
198
199  // An index into list of all of the words we have indexed.
200  typedef int WordID;
201
202  // A map allowing a WordID to be determined given a word.
203  typedef std::map<string16, WordID> WordMap;
204
205  // A map from character to word_ids.
206  typedef std::set<WordID> WordIDSet;  // An index into the WordList.
207  typedef std::map<char16, WordIDSet> CharWordIDMap;
208
209  // A map from word_id to history item.
210  // TODO(mrossetti): URLID is 64 bit: a memory bloat and performance hit.
211  // Consider using a smaller type.
212  typedef URLID HistoryID;
213  typedef std::set<HistoryID> HistoryIDSet;
214  typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap;
215
216  // Support caching of term character results so that we can optimize
217  // searches which build upon a previous search. Each entry in this vector
218  // represents a progressive reduction of the result set for each unique
219  // character found in the search term, with each character being taken as
220  // initially encountered. For example, once the results for the search
221  // term of "mextexarkana" have been fully determined, this vector will
222  // contain one entry for the characters: m, e, x, t, a, r, k, & n, in
223  // that order. The result sets will naturally shrink in size for each
224  // subsequent character as the sets intersections are taken in an
225  // incremental manner.
226  struct TermCharWordSet;
227  typedef std::vector<TermCharWordSet> TermCharWordSetVector;
228
229  // TODO(rohitrao): Probably replace this with QueryResults.
230  typedef std::vector<URLRow> URLRowVector;
231
232  // A map from history_id to the history's URL and title.
233  typedef std::map<HistoryID, URLRow> HistoryInfoMap;
234
235  // A helper class which performs the final filter on each candidate
236  // history URL match, inserting accepted matches into |scored_matches_|
237  // and trimming the maximum number of matches to 10.
238  class AddHistoryMatch : public std::unary_function<HistoryID, void> {
239   public:
240    AddHistoryMatch(const InMemoryURLIndex& index,
241                    const String16Vector& lower_terms);
242    ~AddHistoryMatch();
243
244    void operator()(const HistoryID history_id);
245
246    ScoredHistoryMatches ScoredMatches() const { return scored_matches_; }
247
248   private:
249    const InMemoryURLIndex& index_;
250    ScoredHistoryMatches scored_matches_;
251    const String16Vector& lower_terms_;
252  };
253
254  // Initializes all index data members in preparation for restoring the index
255  // from the cache or a complete rebuild from the history database.
256  void ClearPrivateData();
257
258  // Breaks a string down into individual words.
259  static String16Set WordSetFromString16(const string16& uni_string);
260
261  // Given a vector of Char16s, representing the characters the user has typed
262  // into the omnibox, finds words containing those characters. If any
263  // existing, cached set is a proper subset then starts with that cached
264  // set. Updates the previously-typed-character cache.
265  WordIDSet WordIDSetForTermChars(const Char16Vector& uni_chars);
266
267  // Given a vector of Char16s in |uni_chars|, compare those characters, in
268  // order, with the previously searched term, returning the index of the
269  // cached results in the term_char_word_set_cache_ of the set with best
270  // matching number of characters. Returns kNoCachedResultForTerm if there
271  // was no match at all (i.e. the first character of |uni-chars| is not the
272  // same as the character of the first entry in term_char_word_set_cache_).
273  size_t CachedResultsIndexForTerm(const Char16Vector& uni_chars);
274
275  // Creates a TermMatches which has an entry for each occurrence of the string
276  // |term| found in the string |string|. Mark each match with |term_num| so
277  // that the resulting TermMatches can be merged with other TermMatches for
278  // other terms.
279  static TermMatches MatchTermInString(const string16& term,
280                                       const string16& string,
281                                       int term_num);
282
283  // URL History indexing support functions.
284
285  // Indexes one URL history item.
286  bool IndexRow(const URLRow& row);
287
288  // Breaks a string down into unique, individual characters in the order
289  // in which the characters are first encountered in the |uni_word| string.
290  static Char16Vector Char16VectorFromString16(const string16& uni_word);
291
292  // Breaks the |uni_word| string down into its individual characters.
293  // Note that this is temporarily intended to work on a single word, but
294  // _will_ work on a string of words, perhaps with unexpected results.
295  // TODO(mrossetti): Lots of optimizations possible here for not restarting
296  // a search if the user is just typing along. Also, change this to uniString
297  // and properly handle substring matches, scoring and sorting the results
298  // by score. Also, provide the metrics for where the matches occur so that
299  // the UI can highlight the matched sections.
300  static Char16Set Char16SetFromString16(const string16& uni_word);
301
302  // Given a single word in |uni_word|, adds a reference for the containing
303  // history item identified by |history_id| to the index.
304  void AddWordToIndex(const string16& uni_word, HistoryID history_id);
305
306  // Updates an existing entry in the word/history index by adding the
307  // |history_id| to set for |word_id| in the word_id_history_map_.
308  void UpdateWordHistory(WordID word_id, HistoryID history_id);
309
310  // Creates a new entry in the word/history map for |word_id| and add
311  // |history_id| as the initial element of the word's set.
312  void AddWordHistory(const string16& uni_word, HistoryID history_id);
313
314  // Clears the search term cache. This cache holds on to the intermediate
315  // word results for each previously typed character to eliminate the need
316  // to re-perform set operations for previously typed characters.
317  void ResetTermCharWordSetCache();
318
319  // Composes a set of history item IDs by intersecting the set for each word
320  // in |uni_string|.
321  HistoryIDSet HistoryIDSetFromWords(const string16& uni_string);
322
323  // Helper function to HistoryIDSetFromWords which composes a set of history
324  // ids for the given term given in |uni_word|.
325  HistoryIDSet HistoryIDsForTerm(const string16& uni_word);
326
327  // Calculates a raw score for this history item by first determining
328  // if all of the terms in |terms_vector| occur in |row| and, if so,
329  // calculating a raw score based on 1) starting position of each term
330  // in the user input, 2) completeness of each term's match, 3) ordering
331  // of the occurrence of each term (i.e. they appear in order), 4) last
332  // visit time, and 5) number of visits.
333  // This raw score allows the results to be ordered and can be used
334  // to influence the final score calculated by the client of this
335  // index. Returns a ScoredHistoryMatch structure with the raw score and
336  // substring matching metrics.
337  static ScoredHistoryMatch ScoredMatchForURL(
338      const URLRow& row,
339      const String16Vector& terms_vector);
340
341  // Calculates a component score based on position, ordering and total
342  // substring match size using metrics recorded in |matches|. |max_length|
343  // is the length of the string against which the terms are being searched.
344  static int ScoreComponentForMatches(const TermMatches& matches,
345                                      size_t max_length);
346
347  // Sorts and removes overlapping substring matches from |matches| and
348  // returns the cleaned up matches.
349  static TermMatches SortAndDeoverlap(const TermMatches& matches);
350
351  // Utility functions supporting RestoreFromCache and SaveToCache.
352
353  // Construct a file path for the cache file within the same directory where
354  // the history database is kept and saves that path to |file_path|. Returns
355  // true if |file_path| can be successfully constructed. (This function
356  // provided as a hook for unit testing.)
357  bool GetCacheFilePath(FilePath* file_path);
358
359  // Encode a data structure into the protobuf |cache|.
360  void SavePrivateData(imui::InMemoryURLIndexCacheItem* cache) const;
361  void SaveWordList(imui::InMemoryURLIndexCacheItem* cache) const;
362  void SaveWordMap(imui::InMemoryURLIndexCacheItem* cache) const;
363  void SaveCharWordMap(imui::InMemoryURLIndexCacheItem* cache) const;
364  void SaveWordIDHistoryMap(imui::InMemoryURLIndexCacheItem* cache) const;
365  void SaveHistoryInfoMap(imui::InMemoryURLIndexCacheItem* cache) const;
366
367  // Decode a data structure from the protobuf |cache|. Return false if there
368  // is any kind of failure.
369  bool RestorePrivateData(const imui::InMemoryURLIndexCacheItem& cache);
370  bool RestoreWordList(const imui::InMemoryURLIndexCacheItem& cache);
371  bool RestoreWordMap(const imui::InMemoryURLIndexCacheItem& cache);
372  bool RestoreCharWordMap(const imui::InMemoryURLIndexCacheItem& cache);
373  bool RestoreWordIDHistoryMap(const imui::InMemoryURLIndexCacheItem& cache);
374  bool RestoreHistoryInfoMap(const imui::InMemoryURLIndexCacheItem& cache);
375
376  // Directory where cache file resides. This is, except when unit testing,
377  // the same directory in which the profile's history database is found. It
378  // should never be empty.
379  FilePath history_dir_;
380
381  // The timestamp of when the cache was last saved. This is used to validate
382  // the transaction journal's applicability to the cache. The timestamp is
383  // initialized to the NULL time, indicating that the cache was not used with
384  // the InMemoryURLIndex was last populated.
385  base::Time last_saved_;
386
387  // A list of all of indexed words. The index of a word in this list is the
388  // ID of the word in the word_map_. It reduces the memory overhead by
389  // replacing a potentially long and repeated string with a simple index.
390  // NOTE: A word will _never_ be removed from this vector thus insuring
391  // the immutability of the word_id throughout the session, reducing
392  // maintenance complexity.
393  // TODO(mrossetti): Profile the vector allocation and determine if judicious
394  // 'reserve' calls are called for.
395  String16Vector word_list_;
396
397  int history_item_count_;
398  WordMap word_map_;
399  CharWordIDMap char_word_map_;
400  WordIDHistoryMap word_id_history_map_;
401  TermCharWordSetVector term_char_word_set_cache_;
402  HistoryInfoMap history_info_map_;
403  std::string languages_;
404
405  DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex);
406};
407
408}  // namespace history
409
410#endif  // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
411