1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/history/text_database_manager.h"
6
7#include "base/compiler_specific.h"
8#include "base/file_util.h"
9#include "base/metrics/histogram.h"
10#include "base/logging.h"
11#include "base/message_loop.h"
12#include "base/string_util.h"
13#include "base/utf_string_conversions.h"
14#include "chrome/browser/history/history_publisher.h"
15#include "chrome/browser/history/visit_database.h"
16#include "content/common/mru_cache.h"
17
18using base::Time;
19using base::TimeDelta;
20using base::TimeTicks;
21
22namespace history {
23
24namespace {
25
26// The number of database files we will be attached to at once.
27const int kCacheDBSize = 5;
28
29std::string ConvertStringForIndexer(const string16& input) {
30  // TODO(evanm): other transformations here?
31  return UTF16ToUTF8(CollapseWhitespace(input, false));
32}
33
34// Data older than this will be committed to the full text index even if we
35// haven't gotten a title and/or body.
36const int kExpirationSec = 20;
37
38}  // namespace
39
40// TextDatabaseManager::ChangeSet ----------------------------------------------
41
42TextDatabaseManager::ChangeSet::ChangeSet() {}
43
44TextDatabaseManager::ChangeSet::~ChangeSet() {}
45
46// TextDatabaseManager::PageInfo -----------------------------------------------
47
48TextDatabaseManager::PageInfo::PageInfo(URLID url_id,
49                                        VisitID visit_id,
50                                        Time visit_time)
51    : url_id_(url_id),
52      visit_id_(visit_id),
53      visit_time_(visit_time) {
54  added_time_ = TimeTicks::Now();
55}
56
57TextDatabaseManager::PageInfo::~PageInfo() {}
58
59void TextDatabaseManager::PageInfo::set_title(const string16& ttl) {
60  if (ttl.empty())  // Make the title nonempty when we set it for EverybodySet.
61    title_ = ASCIIToUTF16(" ");
62  else
63    title_ = ttl;
64}
65
66void TextDatabaseManager::PageInfo::set_body(const string16& bdy) {
67  if (bdy.empty())  // Make the body nonempty when we set it for EverybodySet.
68    body_ = ASCIIToUTF16(" ");
69  else
70    body_ = bdy;
71}
72
73bool TextDatabaseManager::PageInfo::Expired(TimeTicks now) const {
74  return now - added_time_ > TimeDelta::FromSeconds(kExpirationSec);
75}
76
77// TextDatabaseManager ---------------------------------------------------------
78
79TextDatabaseManager::TextDatabaseManager(const FilePath& dir,
80                                         URLDatabase* url_database,
81                                         VisitDatabase* visit_database)
82    : dir_(dir),
83      url_database_(url_database),
84      visit_database_(visit_database),
85      recent_changes_(RecentChangeList::NO_AUTO_EVICT),
86      transaction_nesting_(0),
87      db_cache_(DBCache::NO_AUTO_EVICT),
88      present_databases_loaded_(false),
89      ALLOW_THIS_IN_INITIALIZER_LIST(factory_(this)),
90      history_publisher_(NULL) {
91}
92
93TextDatabaseManager::~TextDatabaseManager() {
94  if (transaction_nesting_)
95    CommitTransaction();
96}
97
98// static
99TextDatabase::DBIdent TextDatabaseManager::TimeToID(Time time) {
100  Time::Exploded exploded;
101  time.UTCExplode(&exploded);
102
103  // We combine the month and year into a 6-digit number (200801 for
104  // January, 2008). The month is 1-based.
105  return exploded.year * 100 + exploded.month;
106}
107
108// static
109Time TextDatabaseManager::IDToTime(TextDatabase::DBIdent id) {
110  Time::Exploded exploded;
111  memset(&exploded, 0, sizeof(Time::Exploded));
112  exploded.year = id / 100;
113  exploded.month = id % 100;
114  return Time::FromUTCExploded(exploded);
115}
116
117bool TextDatabaseManager::Init(const HistoryPublisher* history_publisher) {
118  history_publisher_ = history_publisher;
119
120  // Start checking recent changes and committing them.
121  ScheduleFlushOldChanges();
122  return true;
123}
124
125void TextDatabaseManager::BeginTransaction() {
126  transaction_nesting_++;
127}
128
129void TextDatabaseManager::CommitTransaction() {
130  DCHECK(transaction_nesting_);
131  transaction_nesting_--;
132  if (transaction_nesting_)
133    return;  // Still more nesting of transactions before committing.
134
135  // Commit all databases with open transactions on them.
136  for (DBIdentSet::const_iterator i = open_transactions_.begin();
137       i != open_transactions_.end(); ++i) {
138    DBCache::iterator iter = db_cache_.Get(*i);
139    if (iter == db_cache_.end()) {
140      NOTREACHED() << "All open transactions should be cached.";
141      continue;
142    }
143    iter->second->CommitTransaction();
144  }
145  open_transactions_.clear();
146
147  // Now that the transaction is over, we can expire old connections.
148  db_cache_.ShrinkToSize(kCacheDBSize);
149}
150
151void TextDatabaseManager::InitDBList() {
152  if (present_databases_loaded_)
153    return;
154
155  present_databases_loaded_ = true;
156
157  // Find files on disk matching our pattern so we can quickly test for them.
158  FilePath::StringType filepattern(TextDatabase::file_base());
159  filepattern.append(FILE_PATH_LITERAL("*"));
160  file_util::FileEnumerator enumerator(
161      dir_, false, file_util::FileEnumerator::FILES, filepattern);
162  FilePath cur_file;
163  while (!(cur_file = enumerator.Next()).empty()) {
164    // Convert to the number representing this file.
165    TextDatabase::DBIdent id = TextDatabase::FileNameToID(cur_file);
166    if (id)  // Will be 0 on error.
167      present_databases_.insert(id);
168  }
169}
170
171void TextDatabaseManager::AddPageURL(const GURL& url,
172                                     URLID url_id,
173                                     VisitID visit_id,
174                                     Time time) {
175  // Delete any existing page info.
176  RecentChangeList::iterator found = recent_changes_.Peek(url);
177  if (found != recent_changes_.end())
178    recent_changes_.Erase(found);
179
180  // Just save this info for later. We will save it when it expires or when all
181  // the data is complete.
182  recent_changes_.Put(url, PageInfo(url_id, visit_id, time));
183}
184
185void TextDatabaseManager::AddPageTitle(const GURL& url,
186                                       const string16& title) {
187  RecentChangeList::iterator found = recent_changes_.Peek(url);
188  if (found == recent_changes_.end()) {
189    // This page is not in our cache of recent pages. This is very much an edge
190    // case as normally a title will come in <20 seconds after the page commits,
191    // and TabContents will avoid spamming us with >1 title per page. However,
192    // it could come up if your connection is unhappy, and we don't want to
193    // miss anything.
194    //
195    // To solve this problem, we'll just associate the most recent visit with
196    // the new title and index that using the regular code path.
197    URLRow url_row;
198    if (!url_database_->GetRowForURL(url, &url_row))
199      return;  // URL is unknown, give up.
200    VisitRow visit;
201    if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
202      return;  // No recent visit, give up.
203
204    if (visit.is_indexed) {
205      // If this page was already indexed, we could have a body that came in
206      // first and we don't want to overwrite it. We could go query for the
207      // current body, or have a special setter for only the title, but this is
208      // not worth it for this edge case.
209      //
210      // It will be almost impossible for the title to take longer than
211      // kExpirationSec yet we got a body in less than that time, since the
212      // title should always come in first.
213      return;
214    }
215
216    AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
217                title, string16());
218    return;  // We don't know about this page, give up.
219  }
220
221  PageInfo& info = found->second;
222  if (info.has_body()) {
223    // This info is complete, write to the database.
224    AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
225                title, info.body());
226    recent_changes_.Erase(found);
227    return;
228  }
229
230  info.set_title(title);
231}
232
233void TextDatabaseManager::AddPageContents(const GURL& url,
234                                          const string16& body) {
235  RecentChangeList::iterator found = recent_changes_.Peek(url);
236  if (found == recent_changes_.end()) {
237    // This page is not in our cache of recent pages. This means that the page
238    // took more than kExpirationSec to load. Often, this will be the result of
239    // a very slow iframe or other resource on the page that makes us think its
240    // still loading.
241    //
242    // As a fallback, set the most recent visit's contents using the input, and
243    // use the last set title in the URL table as the title to index.
244    URLRow url_row;
245    if (!url_database_->GetRowForURL(url, &url_row))
246      return;  // URL is unknown, give up.
247    VisitRow visit;
248    if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
249      return;  // No recent visit, give up.
250
251    // Use the title from the URL row as the title for the indexing.
252    AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
253                url_row.title(), body);
254    return;
255  }
256
257  PageInfo& info = found->second;
258  if (info.has_title()) {
259    // This info is complete, write to the database.
260    AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
261                info.title(), body);
262    recent_changes_.Erase(found);
263    return;
264  }
265
266  info.set_body(body);
267}
268
269bool TextDatabaseManager::AddPageData(const GURL& url,
270                                      URLID url_id,
271                                      VisitID visit_id,
272                                      Time visit_time,
273                                      const string16& title,
274                                      const string16& body) {
275  TextDatabase* db = GetDBForTime(visit_time, true);
276  if (!db)
277    return false;
278
279  TimeTicks beginning_time = TimeTicks::Now();
280
281  // First delete any recently-indexed data for this page. This will delete
282  // anything in the main database, but we don't bother looking through the
283  // archived database.
284  VisitVector visits;
285  visit_database_->GetVisitsForURL(url_id, &visits);
286  size_t our_visit_row_index = visits.size();
287  for (size_t i = 0; i < visits.size(); i++) {
288    // While we're going trough all the visits, also find our row so we can
289    // avoid another DB query.
290    if (visits[i].visit_id == visit_id) {
291      our_visit_row_index = i;
292    } else if (visits[i].is_indexed) {
293      visits[i].is_indexed = false;
294      visit_database_->UpdateVisitRow(visits[i]);
295      DeletePageData(visits[i].visit_time, url, NULL);
296    }
297  }
298
299  if (visit_id) {
300    // We're supposed to update the visit database.
301    if (our_visit_row_index >= visits.size()) {
302      NOTREACHED() << "We should always have found a visit when given an ID.";
303      return false;
304    }
305
306    DCHECK(visit_time == visits[our_visit_row_index].visit_time);
307
308    // Update the visit database to reference our addition.
309    visits[our_visit_row_index].is_indexed = true;
310    if (!visit_database_->UpdateVisitRow(visits[our_visit_row_index]))
311      return false;
312  }
313
314  // Now index the data.
315  std::string url_str = URLDatabase::GURLToDatabaseURL(url);
316  bool success = db->AddPageData(visit_time, url_str,
317                                 ConvertStringForIndexer(title),
318                                 ConvertStringForIndexer(body));
319
320  UMA_HISTOGRAM_TIMES("History.AddFTSData",
321                      TimeTicks::Now() - beginning_time);
322
323  if (history_publisher_)
324    history_publisher_->PublishPageContent(visit_time, url, title, body);
325
326  return success;
327}
328
329void TextDatabaseManager::DeletePageData(Time time, const GURL& url,
330                                         ChangeSet* change_set) {
331  TextDatabase::DBIdent db_ident = TimeToID(time);
332
333  // We want to open the database for writing, but only if it exists. To
334  // achieve this, we check whether it exists by saying we're not going to
335  // write to it (avoiding the autocreation code normally called when writing)
336  // and then access it for writing only if it succeeds.
337  TextDatabase* db = GetDB(db_ident, false);
338  if (!db)
339    return;
340  db = GetDB(db_ident, true);
341
342  if (change_set)
343    change_set->Add(db_ident);
344
345  db->DeletePageData(time, URLDatabase::GURLToDatabaseURL(url));
346}
347
348void TextDatabaseManager::DeleteFromUncommitted(
349    const std::set<GURL>& restrict_urls, Time begin, Time end) {
350  // First find the beginning of the range to delete. Recall that the list
351  // has the most recent item at the beginning. There won't normally be very
352  // many items, so a brute-force search is fine.
353  RecentChangeList::iterator cur = recent_changes_.begin();
354  if (!end.is_null()) {
355    // Walk from the beginning of the list backwards in time to find the newest
356    // entry that should be deleted.
357    while (cur != recent_changes_.end() && cur->second.visit_time() >= end)
358      ++cur;
359  }
360
361  // Now delete all visits up to the oldest one we were supposed to delete.
362  // Note that if begin is_null, it will be less than or equal to any other
363  // time.
364  if (restrict_urls.empty()) {
365    while (cur != recent_changes_.end() && cur->second.visit_time() >= begin)
366      cur = recent_changes_.Erase(cur);
367  } else {
368    while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) {
369      if (restrict_urls.find(cur->first) != restrict_urls.end())
370        cur = recent_changes_.Erase(cur);
371      else
372        ++cur;
373    }
374  }
375}
376
377void TextDatabaseManager::DeleteAll() {
378  DCHECK_EQ(0, transaction_nesting_) << "Calling deleteAll in a transaction.";
379
380  InitDBList();
381
382  // Close all open databases.
383  db_cache_.Clear();
384
385  // Now go through and delete all the files.
386  for (DBIdentSet::iterator i = present_databases_.begin();
387       i != present_databases_.end(); ++i) {
388    FilePath file_name = dir_.Append(TextDatabase::IDToFileName(*i));
389    file_util::Delete(file_name, false);
390  }
391}
392
393void TextDatabaseManager::OptimizeChangedDatabases(
394    const ChangeSet& change_set) {
395  for (ChangeSet::DBSet::const_iterator i =
396           change_set.changed_databases_.begin();
397       i != change_set.changed_databases_.end(); ++i) {
398    // We want to open the database for writing, but only if it exists. To
399    // achieve this, we check whether it exists by saying we're not going to
400    // write to it (avoiding the autocreation code normally called when writing)
401    // and then access it for writing only if it succeeds.
402    TextDatabase* db = GetDB(*i, false);
403    if (!db)
404      continue;
405    db = GetDB(*i, true);
406    if (!db)
407      continue;  // The file may have changed or something.
408    db->Optimize();
409  }
410}
411
412void TextDatabaseManager::GetTextMatches(
413    const string16& query,
414    const QueryOptions& options,
415    std::vector<TextDatabase::Match>* results,
416    Time* first_time_searched) {
417  results->clear();
418
419  InitDBList();
420  if (present_databases_.empty()) {
421    // Nothing to search.
422    *first_time_searched = options.begin_time;
423    return;
424  }
425
426  // Get the query into the proper format for the individual DBs.
427  string16 fts_query16;
428  query_parser_.ParseQuery(query, &fts_query16);
429  std::string fts_query = UTF16ToUTF8(fts_query16);
430
431  // Need a copy of the options so we can modify the max count for each call
432  // to the individual databases.
433  QueryOptions cur_options(options);
434
435  // Compute the minimum and maximum values for the identifiers that could
436  // encompass the input time range.
437  TextDatabase::DBIdent min_ident = options.begin_time.is_null() ?
438      *present_databases_.begin() :
439      TimeToID(options.begin_time);
440  TextDatabase::DBIdent max_ident = options.end_time.is_null() ?
441      *present_databases_.rbegin() :
442      TimeToID(options.end_time);
443
444  // Iterate over the databases from the most recent backwards.
445  bool checked_one = false;
446  TextDatabase::URLSet found_urls;
447  for (DBIdentSet::reverse_iterator i = present_databases_.rbegin();
448       i != present_databases_.rend();
449       ++i) {
450    // TODO(brettw) allow canceling the query in the middle.
451    // if (canceled_or_something)
452    //   break;
453
454    // This code is stupid, we just loop until we find the correct starting
455    // time range rather than search in an intelligent way. Users will have a
456    // few dozen files at most, so this should not be an issue.
457    if (*i > max_ident)
458      continue;  // Haven't gotten to the time range yet.
459    if (*i < min_ident)
460      break;  // Covered all the time range.
461
462    TextDatabase* cur_db = GetDB(*i, false);
463    if (!cur_db)
464      continue;
465
466    // Adjust the max count according to how many results we've already got.
467    if (options.max_count) {
468      cur_options.max_count = options.max_count -
469          static_cast<int>(results->size());
470    }
471
472    // Since we are going backwards in time, it is always OK to pass the
473    // current first_time_searched, since it will always be smaller than
474    // any previous set.
475    cur_db->GetTextMatches(fts_query, cur_options,
476                           results, &found_urls, first_time_searched);
477    checked_one = true;
478
479    DCHECK(options.max_count == 0 ||
480           static_cast<int>(results->size()) <= options.max_count);
481    if (options.max_count &&
482        static_cast<int>(results->size()) >= options.max_count)
483      break;  // Got the max number of results.
484  }
485
486  // When there were no databases in the range, we need to fix up the min time.
487  if (!checked_one)
488    *first_time_searched = options.begin_time;
489}
490
491TextDatabase* TextDatabaseManager::GetDB(TextDatabase::DBIdent id,
492                                         bool for_writing) {
493  DBCache::iterator found_db = db_cache_.Get(id);
494  if (found_db != db_cache_.end()) {
495    if (transaction_nesting_ && for_writing &&
496        open_transactions_.find(id) == open_transactions_.end()) {
497      // If we currently have an open transaction, that database is not yet
498      // part of the transaction, and the database will be written to, it needs
499      // to be part of our transaction.
500      found_db->second->BeginTransaction();
501      open_transactions_.insert(id);
502    }
503    return found_db->second;
504  }
505
506  // Need to make the database.
507  TextDatabase* new_db = new TextDatabase(dir_, id, for_writing);
508  if (!new_db->Init()) {
509    delete new_db;
510    return NULL;
511  }
512  db_cache_.Put(id, new_db);
513  present_databases_.insert(id);
514
515  if (transaction_nesting_ && for_writing) {
516    // If we currently have an open transaction and the new database will be
517    // written to, it needs to be part of our transaction.
518    new_db->BeginTransaction();
519    open_transactions_.insert(id);
520  }
521
522  // When no transaction is open, allow this new one to kick out an old one.
523  if (!transaction_nesting_)
524    db_cache_.ShrinkToSize(kCacheDBSize);
525
526  return new_db;
527}
528
529TextDatabase* TextDatabaseManager::GetDBForTime(Time time,
530                                                bool create_if_necessary) {
531  return GetDB(TimeToID(time), create_if_necessary);
532}
533
534void TextDatabaseManager::ScheduleFlushOldChanges() {
535  factory_.RevokeAll();
536  MessageLoop::current()->PostDelayedTask(FROM_HERE, factory_.NewRunnableMethod(
537          &TextDatabaseManager::FlushOldChanges),
538      kExpirationSec * Time::kMillisecondsPerSecond);
539}
540
541void TextDatabaseManager::FlushOldChanges() {
542  FlushOldChangesForTime(TimeTicks::Now());
543}
544
545void TextDatabaseManager::FlushOldChangesForTime(TimeTicks now) {
546  // The end of the list is the oldest, so we just start from there committing
547  // things until we get something too new.
548  RecentChangeList::reverse_iterator i = recent_changes_.rbegin();
549  while (i != recent_changes_.rend() && i->second.Expired(now)) {
550    AddPageData(i->first, i->second.url_id(), i->second.visit_id(),
551                i->second.visit_time(), i->second.title(), i->second.body());
552    i = recent_changes_.Erase(i);
553  }
554
555  ScheduleFlushOldChanges();
556}
557
558}  // namespace history
559