text_database.cc revision c407dc5cd9bdc5668497f21b26b09d988ab439de
1// Copyright (c) 2009 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include <limits> 6#include <set> 7#include <string> 8 9#include "chrome/browser/history/text_database.h" 10 11#include "app/sql/connection.h" 12#include "app/sql/statement.h" 13#include "app/sql/transaction.h" 14#include "base/file_util.h" 15#include "base/histogram.h" 16#include "base/logging.h" 17#include "base/string_util.h" 18#include "base/utf_string_conversions.h" 19#include "chrome/browser/diagnostics/sqlite_diagnostics.h" 20 21// There are two tables in each database, one full-text search (FTS) table which 22// indexes the contents and title of the pages. The other is a regular SQLITE 23// table which contains non-indexed information about the page. All columns of 24// a FTS table are indexed using the text search algorithm, which isn't what we 25// want for things like times. If this were in the FTS table, there would be 26// different words in the index for each time number. 27// 28// "pages" FTS table: 29// url URL of the page so searches will match the URL. 30// title Title of the page. 31// body Body of the page. 32// 33// "info" regular table: 34// time Time the corresponding FTS entry was visited. 35// 36// We do joins across these two tables by using their internal rowids, which we 37// keep in sync between the two tables. The internal rowid is the only part of 38// an FTS table that is indexed like a normal table, and the index over it is 39// free since sqlite always indexes the internal rowid. 40 41namespace history { 42 43namespace { 44 45// Version 1 uses FTS2 for index files. 46// Version 2 uses FTS3. 47static const int kCurrentVersionNumber = 2; 48static const int kCompatibleVersionNumber = 2; 49 50// Snippet computation relies on the index of the columns in the original 51// create statement. These are the 0-based indices (as strings) of the 52// corresponding columns. 53const char kTitleColumnIndex[] = "1"; 54const char kBodyColumnIndex[] = "2"; 55 56// The string prepended to the database identifier to generate the filename. 57const FilePath::CharType kFilePrefix[] = FILE_PATH_LITERAL("History Index "); 58 59} // namespace 60 61TextDatabase::TextDatabase(const FilePath& path, 62 DBIdent id, 63 bool allow_create) 64 : path_(path), 65 ident_(id), 66 allow_create_(allow_create) { 67 // Compute the file name. 68 file_name_ = path_.Append(IDToFileName(ident_)); 69} 70 71TextDatabase::~TextDatabase() { 72} 73 74// static 75const FilePath::CharType* TextDatabase::file_base() { 76 return kFilePrefix; 77} 78 79// static 80FilePath TextDatabase::IDToFileName(DBIdent id) { 81 // Identifiers are intended to be a combination of the year and month, for 82 // example, 200801 for January 2008. We convert this to 83 // "History Index 2008-01". However, we don't make assumptions about this 84 // scheme: the caller should assign IDs as it feels fit with the knowledge 85 // that they will apppear on disk in this form. 86 FilePath::StringType filename(file_base()); 87 StringAppendF(&filename, FILE_PATH_LITERAL("%d-%02d"), 88 id / 100, id % 100); 89 return FilePath(filename); 90} 91 92// static 93TextDatabase::DBIdent TextDatabase::FileNameToID(const FilePath& file_path) { 94 FilePath::StringType file_name = file_path.BaseName().value(); 95 96 // We don't actually check the prefix here. Since the file system could 97 // be case insensitive in ways we can't predict (NTFS), checking could 98 // potentially be the wrong thing to do. Instead, we just look for a suffix. 99 static const size_t kIDStringLength = 7; // Room for "xxxx-xx". 100 if (file_name.length() < kIDStringLength) 101 return 0; 102 const FilePath::StringType suffix( 103 &file_name[file_name.length() - kIDStringLength]); 104 105 if (suffix.length() != kIDStringLength || 106 suffix[4] != FILE_PATH_LITERAL('-')) { 107 return 0; 108 } 109 110 int year = StringToInt(suffix.substr(0, 4)); 111 int month = StringToInt(suffix.substr(5, 2)); 112 113 return year * 100 + month; 114} 115 116bool TextDatabase::Init() { 117 // Make sure, if we're not allowed to create the file, that it exists. 118 if (!allow_create_) { 119 if (!file_util::PathExists(file_name_)) 120 return false; 121 } 122 123 // Set the exceptional sqlite error handler. 124 db_.set_error_delegate(GetErrorHandlerForTextDb()); 125 126 // Set the database page size to something a little larger to give us 127 // better performance (we're typically seek rather than bandwidth limited). 128 // This only has an effect before any tables have been created, otherwise 129 // this is a NOP. Must be a power of 2 and a max of 8192. 130 db_.set_page_size(2096); 131 132 // The default cache size is 2000 which give >8MB of data. Since we will often 133 // have 2-3 of these objects, each with their own 8MB, this adds up very fast. 134 // We therefore reduce the size so when there are multiple objects, we're not 135 // too big. 136 db_.set_cache_size(512); 137 138 // Run the database in exclusive mode. Nobody else should be accessing the 139 // database while we're running, and this will give somewhat improved perf. 140 db_.set_exclusive_locking(); 141 142 // Attach the database to our index file. 143 if (!db_.Open(file_name_)) 144 return false; 145 146 // Meta table tracking version information. 147 if (!meta_table_.Init(&db_, kCurrentVersionNumber, kCompatibleVersionNumber)) 148 return false; 149 if (meta_table_.GetCompatibleVersionNumber() > kCurrentVersionNumber) { 150 // This version is too new. We don't bother notifying the user on this 151 // error, and just fail to use the file. Normally if they have version skew, 152 // they will get it for the main history file and it won't be necessary 153 // here. If that's not the case, since this is only indexed data, it's 154 // probably better to just not give FTS results than strange errors when 155 // everything else is working OK. 156 LOG(WARNING) << "Text database is too new."; 157 return false; 158 } 159 160 return CreateTables(); 161} 162 163void TextDatabase::BeginTransaction() { 164 db_.BeginTransaction(); 165} 166 167void TextDatabase::CommitTransaction() { 168 db_.CommitTransaction(); 169} 170 171bool TextDatabase::CreateTables() { 172 // FTS table of page contents. 173 if (!db_.DoesTableExist("pages")) { 174 if (!db_.Execute("CREATE VIRTUAL TABLE pages USING fts3(" 175 "TOKENIZE icu," 176 "url LONGVARCHAR," 177 "title LONGVARCHAR," 178 "body LONGVARCHAR)")) 179 return false; 180 } 181 182 // Non-FTS table containing URLs and times so we can efficiently find them 183 // using a regular index (all FTS columns are special and are treated as 184 // full-text-search, which is not what we want when retrieving this data). 185 if (!db_.DoesTableExist("info")) { 186 // Note that there is no point in creating an index over time. Since 187 // we must always query the entire FTS table (it can not efficiently do 188 // subsets), we will always end up doing that first, and joining the info 189 // table off of that. 190 if (!db_.Execute("CREATE TABLE info(time INTEGER NOT NULL)")) 191 return false; 192 } 193 194 // Create the index. This will fail when the index already exists, so we just 195 // ignore the error. 196 db_.Execute("CREATE INDEX info_time ON info(time)"); 197 return true; 198} 199 200bool TextDatabase::AddPageData(base::Time time, 201 const std::string& url, 202 const std::string& title, 203 const std::string& contents) { 204 sql::Transaction committer(&db_); 205 if (!committer.Begin()) 206 return false; 207 208 // Add to the pages table. 209 sql::Statement add_to_pages(db_.GetCachedStatement(SQL_FROM_HERE, 210 "INSERT INTO pages (url, title, body) VALUES (?,?,?)")); 211 if (!add_to_pages) { 212 NOTREACHED() << db_.GetErrorMessage(); 213 return false; 214 } 215 add_to_pages.BindString(0, url); 216 add_to_pages.BindString(1, title); 217 add_to_pages.BindString(2, contents); 218 if (!add_to_pages.Run()) { 219 NOTREACHED() << db_.GetErrorMessage(); 220 return false; 221 } 222 223 int64 rowid = db_.GetLastInsertRowId(); 224 225 // Add to the info table with the same rowid. 226 sql::Statement add_to_info(db_.GetCachedStatement(SQL_FROM_HERE, 227 "INSERT INTO info (rowid, time) VALUES (?,?)")); 228 if (!add_to_info) { 229 NOTREACHED() << db_.GetErrorMessage(); 230 return false; 231 } 232 add_to_info.BindInt64(0, rowid); 233 add_to_info.BindInt64(1, time.ToInternalValue()); 234 if (!add_to_info.Run()) { 235 NOTREACHED() << db_.GetErrorMessage(); 236 return false; 237 } 238 239 return committer.Commit(); 240} 241 242void TextDatabase::DeletePageData(base::Time time, const std::string& url) { 243 // First get all rows that match. Selecing on time (which has an index) allows 244 // us to avoid brute-force searches on the full-text-index table (there will 245 // generally be only one match per time). 246 sql::Statement select_ids(db_.GetCachedStatement(SQL_FROM_HERE, 247 "SELECT info.rowid " 248 "FROM info JOIN pages ON info.rowid = pages.rowid " 249 "WHERE info.time=? AND pages.url=?")); 250 if (!select_ids) 251 return; 252 select_ids.BindInt64(0, time.ToInternalValue()); 253 select_ids.BindString(1, url); 254 std::set<int64> rows_to_delete; 255 while (select_ids.Step()) 256 rows_to_delete.insert(select_ids.ColumnInt64(0)); 257 258 // Delete from the pages table. 259 sql::Statement delete_page(db_.GetCachedStatement(SQL_FROM_HERE, 260 "DELETE FROM pages WHERE rowid=?")); 261 if (!delete_page) 262 return; 263 for (std::set<int64>::const_iterator i = rows_to_delete.begin(); 264 i != rows_to_delete.end(); ++i) { 265 delete_page.BindInt64(0, *i); 266 if (!delete_page.Run()) { 267 NOTREACHED(); 268 return; 269 } 270 delete_page.Reset(); 271 } 272 273 // Delete from the info table. 274 sql::Statement delete_info(db_.GetCachedStatement(SQL_FROM_HERE, 275 "DELETE FROM info WHERE rowid=?")); 276 if (!delete_info) 277 return; 278 for (std::set<int64>::const_iterator i = rows_to_delete.begin(); 279 i != rows_to_delete.end(); ++i) { 280 delete_info.BindInt64(0, *i); 281 if (!delete_info.Run()) { 282 NOTREACHED(); 283 return; 284 } 285 delete_info.Reset(); 286 } 287} 288 289void TextDatabase::Optimize() { 290 sql::Statement statement(db_.GetCachedStatement(SQL_FROM_HERE, 291 "SELECT OPTIMIZE(pages) FROM pages LIMIT 1")); 292 if (!statement) 293 return; 294 statement.Run(); 295} 296 297void TextDatabase::GetTextMatches(const std::string& query, 298 const QueryOptions& options, 299 std::vector<Match>* results, 300 URLSet* found_urls, 301 base::Time* first_time_searched) { 302 *first_time_searched = options.begin_time; 303 304 sql::Statement statement(db_.GetCachedStatement(SQL_FROM_HERE, 305 "SELECT url, title, time, offsets(pages), body " 306 "FROM pages LEFT OUTER JOIN info ON pages.rowid = info.rowid " 307 "WHERE pages MATCH ? AND time >= ? AND time < ? " 308 "ORDER BY time DESC " 309 "LIMIT ?")); 310 if (!statement) 311 return; 312 313 // When their values indicate "unspecified", saturate the numbers to the max 314 // or min to get the correct result. 315 int64 effective_begin_time = options.begin_time.is_null() ? 316 0 : options.begin_time.ToInternalValue(); 317 int64 effective_end_time = options.end_time.is_null() ? 318 std::numeric_limits<int64>::max() : options.end_time.ToInternalValue(); 319 int effective_max_count = options.max_count ? 320 options.max_count : std::numeric_limits<int>::max(); 321 322 statement.BindString(0, query); 323 statement.BindInt64(1, effective_begin_time); 324 statement.BindInt64(2, effective_end_time); 325 statement.BindInt(3, effective_max_count); 326 327 while (statement.Step()) { 328 // TODO(brettw) allow canceling the query in the middle. 329 // if (canceled_or_something) 330 // break; 331 332 GURL url(statement.ColumnString(0)); 333 URLSet::const_iterator found_url = found_urls->find(url); 334 if (found_url != found_urls->end()) 335 continue; // Don't add this duplicate. 336 337 // Fill the results into the vector (avoid copying the URL with Swap()). 338 results->resize(results->size() + 1); 339 Match& match = results->at(results->size() - 1); 340 match.url.Swap(&url); 341 342 match.title = statement.ColumnString16(1); 343 match.time = base::Time::FromInternalValue(statement.ColumnInt64(2)); 344 345 // Extract any matches in the title. 346 std::string offsets_str = statement.ColumnString(3); 347 Snippet::ExtractMatchPositions(offsets_str, kTitleColumnIndex, 348 &match.title_match_positions); 349 Snippet::ConvertMatchPositionsToWide(statement.ColumnString(1), 350 &match.title_match_positions); 351 352 // Extract the matches in the body. 353 Snippet::MatchPositions match_positions; 354 Snippet::ExtractMatchPositions(offsets_str, kBodyColumnIndex, 355 &match_positions); 356 357 // Compute the snippet based on those matches. 358 std::string body = statement.ColumnString(4); 359 match.snippet.ComputeSnippet(match_positions, body); 360 } 361 362 // When we have returned all the results possible (or determined that there 363 // are none), then we have searched all the time requested, so we can 364 // set the first_time_searched to that value. 365 if (results->size() == 0 || 366 options.max_count == 0 || // Special case for wanting all the results. 367 static_cast<int>(results->size()) < options.max_count) { 368 *first_time_searched = options.begin_time; 369 } else { 370 // Since we got the results in order, we know the last item is the last 371 // time we considered. 372 *first_time_searched = results->back().time; 373 } 374 375 statement.Reset(); 376} 377 378} // namespace history 379