text_database.cc revision c407dc5cd9bdc5668497f21b26b09d988ab439de
1// Copyright (c) 2009 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include <limits>
6#include <set>
7#include <string>
8
9#include "chrome/browser/history/text_database.h"
10
11#include "app/sql/connection.h"
12#include "app/sql/statement.h"
13#include "app/sql/transaction.h"
14#include "base/file_util.h"
15#include "base/histogram.h"
16#include "base/logging.h"
17#include "base/string_util.h"
18#include "base/utf_string_conversions.h"
19#include "chrome/browser/diagnostics/sqlite_diagnostics.h"
20
21// There are two tables in each database, one full-text search (FTS) table which
22// indexes the contents and title of the pages. The other is a regular SQLITE
23// table which contains non-indexed information about the page. All columns of
24// a FTS table are indexed using the text search algorithm, which isn't what we
25// want for things like times. If this were in the FTS table, there would be
26// different words in the index for each time number.
27//
28// "pages" FTS table:
29//   url    URL of the page so searches will match the URL.
30//   title  Title of the page.
31//   body   Body of the page.
32//
33// "info" regular table:
34//   time     Time the corresponding FTS entry was visited.
35//
36// We do joins across these two tables by using their internal rowids, which we
37// keep in sync between the two tables. The internal rowid is the only part of
38// an FTS table that is indexed like a normal table, and the index over it is
39// free since sqlite always indexes the internal rowid.
40
41namespace history {
42
43namespace {
44
45// Version 1 uses FTS2 for index files.
46// Version 2 uses FTS3.
47static const int kCurrentVersionNumber = 2;
48static const int kCompatibleVersionNumber = 2;
49
50// Snippet computation relies on the index of the columns in the original
51// create statement. These are the 0-based indices (as strings) of the
52// corresponding columns.
53const char kTitleColumnIndex[] = "1";
54const char kBodyColumnIndex[] = "2";
55
56// The string prepended to the database identifier to generate the filename.
57const FilePath::CharType kFilePrefix[] = FILE_PATH_LITERAL("History Index ");
58
59}  // namespace
60
61TextDatabase::TextDatabase(const FilePath& path,
62                           DBIdent id,
63                           bool allow_create)
64    : path_(path),
65      ident_(id),
66      allow_create_(allow_create) {
67  // Compute the file name.
68  file_name_ = path_.Append(IDToFileName(ident_));
69}
70
71TextDatabase::~TextDatabase() {
72}
73
74// static
75const FilePath::CharType* TextDatabase::file_base() {
76  return kFilePrefix;
77}
78
79// static
80FilePath TextDatabase::IDToFileName(DBIdent id) {
81  // Identifiers are intended to be a combination of the year and month, for
82  // example, 200801 for January 2008. We convert this to
83  // "History Index 2008-01". However, we don't make assumptions about this
84  // scheme: the caller should assign IDs as it feels fit with the knowledge
85  // that they will apppear on disk in this form.
86  FilePath::StringType filename(file_base());
87  StringAppendF(&filename, FILE_PATH_LITERAL("%d-%02d"),
88                id / 100, id % 100);
89  return FilePath(filename);
90}
91
92// static
93TextDatabase::DBIdent TextDatabase::FileNameToID(const FilePath& file_path) {
94  FilePath::StringType file_name = file_path.BaseName().value();
95
96  // We don't actually check the prefix here. Since the file system could
97  // be case insensitive in ways we can't predict (NTFS), checking could
98  // potentially be the wrong thing to do. Instead, we just look for a suffix.
99  static const size_t kIDStringLength = 7;  // Room for "xxxx-xx".
100  if (file_name.length() < kIDStringLength)
101    return 0;
102  const FilePath::StringType suffix(
103      &file_name[file_name.length() - kIDStringLength]);
104
105  if (suffix.length() != kIDStringLength ||
106      suffix[4] != FILE_PATH_LITERAL('-')) {
107    return 0;
108  }
109
110  int year = StringToInt(suffix.substr(0, 4));
111  int month = StringToInt(suffix.substr(5, 2));
112
113  return year * 100 + month;
114}
115
116bool TextDatabase::Init() {
117  // Make sure, if we're not allowed to create the file, that it exists.
118  if (!allow_create_) {
119    if (!file_util::PathExists(file_name_))
120      return false;
121  }
122
123  // Set the exceptional sqlite error handler.
124  db_.set_error_delegate(GetErrorHandlerForTextDb());
125
126  // Set the database page size to something a little larger to give us
127  // better performance (we're typically seek rather than bandwidth limited).
128  // This only has an effect before any tables have been created, otherwise
129  // this is a NOP. Must be a power of 2 and a max of 8192.
130  db_.set_page_size(2096);
131
132  // The default cache size is 2000 which give >8MB of data. Since we will often
133  // have 2-3 of these objects, each with their own 8MB, this adds up very fast.
134  // We therefore reduce the size so when there are multiple objects, we're not
135  // too big.
136  db_.set_cache_size(512);
137
138  // Run the database in exclusive mode. Nobody else should be accessing the
139  // database while we're running, and this will give somewhat improved perf.
140  db_.set_exclusive_locking();
141
142  // Attach the database to our index file.
143  if (!db_.Open(file_name_))
144    return false;
145
146  // Meta table tracking version information.
147  if (!meta_table_.Init(&db_, kCurrentVersionNumber, kCompatibleVersionNumber))
148    return false;
149  if (meta_table_.GetCompatibleVersionNumber() > kCurrentVersionNumber) {
150    // This version is too new. We don't bother notifying the user on this
151    // error, and just fail to use the file. Normally if they have version skew,
152    // they will get it for the main history file and it won't be necessary
153    // here. If that's not the case, since this is only indexed data, it's
154    // probably better to just not give FTS results than strange errors when
155    // everything else is working OK.
156    LOG(WARNING) << "Text database is too new.";
157    return false;
158  }
159
160  return CreateTables();
161}
162
163void TextDatabase::BeginTransaction() {
164  db_.BeginTransaction();
165}
166
167void TextDatabase::CommitTransaction() {
168  db_.CommitTransaction();
169}
170
171bool TextDatabase::CreateTables() {
172  // FTS table of page contents.
173  if (!db_.DoesTableExist("pages")) {
174    if (!db_.Execute("CREATE VIRTUAL TABLE pages USING fts3("
175                     "TOKENIZE icu,"
176                     "url LONGVARCHAR,"
177                     "title LONGVARCHAR,"
178                     "body LONGVARCHAR)"))
179      return false;
180  }
181
182  // Non-FTS table containing URLs and times so we can efficiently find them
183  // using a regular index (all FTS columns are special and are treated as
184  // full-text-search, which is not what we want when retrieving this data).
185  if (!db_.DoesTableExist("info")) {
186    // Note that there is no point in creating an index over time. Since
187    // we must always query the entire FTS table (it can not efficiently do
188    // subsets), we will always end up doing that first, and joining the info
189    // table off of that.
190    if (!db_.Execute("CREATE TABLE info(time INTEGER NOT NULL)"))
191      return false;
192  }
193
194  // Create the index. This will fail when the index already exists, so we just
195  // ignore the error.
196  db_.Execute("CREATE INDEX info_time ON info(time)");
197  return true;
198}
199
200bool TextDatabase::AddPageData(base::Time time,
201                               const std::string& url,
202                               const std::string& title,
203                               const std::string& contents) {
204  sql::Transaction committer(&db_);
205  if (!committer.Begin())
206    return false;
207
208  // Add to the pages table.
209  sql::Statement add_to_pages(db_.GetCachedStatement(SQL_FROM_HERE,
210      "INSERT INTO pages (url, title, body) VALUES (?,?,?)"));
211  if (!add_to_pages) {
212    NOTREACHED() << db_.GetErrorMessage();
213    return false;
214  }
215  add_to_pages.BindString(0, url);
216  add_to_pages.BindString(1, title);
217  add_to_pages.BindString(2, contents);
218  if (!add_to_pages.Run()) {
219    NOTREACHED() << db_.GetErrorMessage();
220    return false;
221  }
222
223  int64 rowid = db_.GetLastInsertRowId();
224
225  // Add to the info table with the same rowid.
226  sql::Statement add_to_info(db_.GetCachedStatement(SQL_FROM_HERE,
227      "INSERT INTO info (rowid, time) VALUES (?,?)"));
228  if (!add_to_info) {
229    NOTREACHED() << db_.GetErrorMessage();
230    return false;
231  }
232  add_to_info.BindInt64(0, rowid);
233  add_to_info.BindInt64(1, time.ToInternalValue());
234  if (!add_to_info.Run()) {
235    NOTREACHED() << db_.GetErrorMessage();
236    return false;
237  }
238
239  return committer.Commit();
240}
241
242void TextDatabase::DeletePageData(base::Time time, const std::string& url) {
243  // First get all rows that match. Selecing on time (which has an index) allows
244  // us to avoid brute-force searches on the full-text-index table (there will
245  // generally be only one match per time).
246  sql::Statement select_ids(db_.GetCachedStatement(SQL_FROM_HERE,
247      "SELECT info.rowid "
248      "FROM info JOIN pages ON info.rowid = pages.rowid "
249      "WHERE info.time=? AND pages.url=?"));
250  if (!select_ids)
251    return;
252  select_ids.BindInt64(0, time.ToInternalValue());
253  select_ids.BindString(1, url);
254  std::set<int64> rows_to_delete;
255  while (select_ids.Step())
256    rows_to_delete.insert(select_ids.ColumnInt64(0));
257
258  // Delete from the pages table.
259  sql::Statement delete_page(db_.GetCachedStatement(SQL_FROM_HERE,
260      "DELETE FROM pages WHERE rowid=?"));
261  if (!delete_page)
262    return;
263  for (std::set<int64>::const_iterator i = rows_to_delete.begin();
264       i != rows_to_delete.end(); ++i) {
265    delete_page.BindInt64(0, *i);
266    if (!delete_page.Run()) {
267      NOTREACHED();
268      return;
269    }
270    delete_page.Reset();
271  }
272
273  // Delete from the info table.
274  sql::Statement delete_info(db_.GetCachedStatement(SQL_FROM_HERE,
275      "DELETE FROM info WHERE rowid=?"));
276  if (!delete_info)
277    return;
278  for (std::set<int64>::const_iterator i = rows_to_delete.begin();
279       i != rows_to_delete.end(); ++i) {
280    delete_info.BindInt64(0, *i);
281    if (!delete_info.Run()) {
282      NOTREACHED();
283      return;
284    }
285    delete_info.Reset();
286  }
287}
288
289void TextDatabase::Optimize() {
290  sql::Statement statement(db_.GetCachedStatement(SQL_FROM_HERE,
291      "SELECT OPTIMIZE(pages) FROM pages LIMIT 1"));
292  if (!statement)
293    return;
294  statement.Run();
295}
296
297void TextDatabase::GetTextMatches(const std::string& query,
298                                  const QueryOptions& options,
299                                  std::vector<Match>* results,
300                                  URLSet* found_urls,
301                                  base::Time* first_time_searched) {
302  *first_time_searched = options.begin_time;
303
304  sql::Statement statement(db_.GetCachedStatement(SQL_FROM_HERE,
305    "SELECT url, title, time, offsets(pages), body "
306        "FROM pages LEFT OUTER JOIN info ON pages.rowid = info.rowid "
307        "WHERE pages MATCH ? AND time >= ? AND time < ? "
308        "ORDER BY time DESC "
309        "LIMIT ?"));
310  if (!statement)
311    return;
312
313  // When their values indicate "unspecified", saturate the numbers to the max
314  // or min to get the correct result.
315  int64 effective_begin_time = options.begin_time.is_null() ?
316      0 : options.begin_time.ToInternalValue();
317  int64 effective_end_time = options.end_time.is_null() ?
318      std::numeric_limits<int64>::max() : options.end_time.ToInternalValue();
319  int effective_max_count = options.max_count ?
320      options.max_count : std::numeric_limits<int>::max();
321
322  statement.BindString(0, query);
323  statement.BindInt64(1, effective_begin_time);
324  statement.BindInt64(2, effective_end_time);
325  statement.BindInt(3, effective_max_count);
326
327  while (statement.Step()) {
328    // TODO(brettw) allow canceling the query in the middle.
329    // if (canceled_or_something)
330    //   break;
331
332    GURL url(statement.ColumnString(0));
333    URLSet::const_iterator found_url = found_urls->find(url);
334    if (found_url != found_urls->end())
335      continue;  // Don't add this duplicate.
336
337    // Fill the results into the vector (avoid copying the URL with Swap()).
338    results->resize(results->size() + 1);
339    Match& match = results->at(results->size() - 1);
340    match.url.Swap(&url);
341
342    match.title = statement.ColumnString16(1);
343    match.time = base::Time::FromInternalValue(statement.ColumnInt64(2));
344
345    // Extract any matches in the title.
346    std::string offsets_str = statement.ColumnString(3);
347    Snippet::ExtractMatchPositions(offsets_str, kTitleColumnIndex,
348                                   &match.title_match_positions);
349    Snippet::ConvertMatchPositionsToWide(statement.ColumnString(1),
350                                         &match.title_match_positions);
351
352    // Extract the matches in the body.
353    Snippet::MatchPositions match_positions;
354    Snippet::ExtractMatchPositions(offsets_str, kBodyColumnIndex,
355                                   &match_positions);
356
357    // Compute the snippet based on those matches.
358    std::string body = statement.ColumnString(4);
359    match.snippet.ComputeSnippet(match_positions, body);
360  }
361
362  // When we have returned all the results possible (or determined that there
363  // are none), then we have searched all the time requested, so we can
364  // set the first_time_searched to that value.
365  if (results->size() == 0 ||
366      options.max_count == 0 ||  // Special case for wanting all the results.
367      static_cast<int>(results->size()) < options.max_count) {
368    *first_time_searched = options.begin_time;
369  } else {
370    // Since we got the results in order, we know the last item is the last
371    // time we considered.
372    *first_time_searched = results->back().time;
373  }
374
375  statement.Reset();
376}
377
378}  // namespace history
379