1// Copyright 2013 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "chrome/browser/devtools/devtools_file_system_indexer.h" 6 7#include <iterator> 8 9#include "base/bind.h" 10#include "base/callback.h" 11#include "base/file_util.h" 12#include "base/files/file_enumerator.h" 13#include "base/files/file_util_proxy.h" 14#include "base/lazy_instance.h" 15#include "base/logging.h" 16#include "base/stl_util.h" 17#include "base/strings/utf_string_conversions.h" 18#include "content/public/browser/browser_thread.h" 19 20using base::Bind; 21using base::Callback; 22using base::FileEnumerator; 23using base::FilePath; 24using base::Time; 25using base::TimeDelta; 26using base::TimeTicks; 27using content::BrowserThread; 28using std::map; 29using std::set; 30using std::string; 31using std::vector; 32 33namespace { 34 35typedef int32 Trigram; 36typedef char TrigramChar; 37typedef uint16 FileId; 38 39const int kMinTimeoutBetweenWorkedNitification = 200; 40// Trigram characters include all ASCII printable characters (32-126) except for 41// the capital letters, because the index is case insensitive. 42const size_t kTrigramCharacterCount = 126 - 'Z' - 1 + 'A' - ' ' + 1; 43const size_t kTrigramCount = 44 kTrigramCharacterCount * kTrigramCharacterCount * kTrigramCharacterCount; 45const int kMaxReadLength = 10 * 1024; 46const TrigramChar kUndefinedTrigramChar = -1; 47const Trigram kUndefinedTrigram = -1; 48 49base::LazyInstance<vector<bool> >::Leaky g_is_binary_char = 50 LAZY_INSTANCE_INITIALIZER; 51 52base::LazyInstance<vector<TrigramChar> >::Leaky g_trigram_chars = 53 LAZY_INSTANCE_INITIALIZER; 54 55class Index { 56 public: 57 Index(); 58 Time LastModifiedTimeForFile(const FilePath& file_path); 59 void SetTrigramsForFile(const FilePath& file_path, 60 const vector<Trigram>& index, 61 const Time& time); 62 vector<FilePath> Search(string query); 63 void PrintStats(); 64 void NormalizeVectors(); 65 66 private: 67 ~Index(); 68 69 FileId GetFileId(const FilePath& file_path); 70 71 typedef map<FilePath, FileId> FileIdsMap; 72 FileIdsMap file_ids_; 73 FileId last_file_id_; 74 // The index in this vector is the trigram id. 75 vector<vector<FileId> > index_; 76 typedef map<FilePath, Time> IndexedFilesMap; 77 IndexedFilesMap index_times_; 78 vector<bool> is_normalized_; 79 80 DISALLOW_COPY_AND_ASSIGN(Index); 81}; 82 83base::LazyInstance<Index>::Leaky g_trigram_index = LAZY_INSTANCE_INITIALIZER; 84 85void InitIsBinaryCharMap() { 86 for (size_t i = 0; i < 256; ++i) { 87 unsigned char ch = i; 88 bool is_binary_char = ch < 9 || (ch >= 14 && ch < 32) || ch == 127; 89 g_is_binary_char.Get().push_back(is_binary_char); 90 } 91} 92 93bool IsBinaryChar(char c) { 94 unsigned char uc = static_cast<unsigned char>(c); 95 return g_is_binary_char.Get()[uc]; 96} 97 98void InitTrigramCharsMap() { 99 for (size_t i = 0; i < 256; ++i) { 100 if (i > 127) { 101 g_trigram_chars.Get().push_back(kUndefinedTrigramChar); 102 continue; 103 } 104 char ch = i; 105 if (ch == '\t') 106 ch = ' '; 107 if (ch >= 'A' && ch <= 'Z') 108 ch = ch - 'A' + 'a'; 109 if ((IsBinaryChar(ch)) || (ch < ' ')) { 110 g_trigram_chars.Get().push_back(kUndefinedTrigramChar); 111 continue; 112 } 113 114 if (ch >= 'Z') 115 ch = ch - 'Z' - 1 + 'A'; 116 ch -= ' '; 117 char signed_trigram_char_count = static_cast<char>(kTrigramCharacterCount); 118 CHECK(ch >= 0 && ch < signed_trigram_char_count); 119 g_trigram_chars.Get().push_back(ch); 120 } 121} 122 123TrigramChar TrigramCharForChar(char c) { 124 unsigned char uc = static_cast<unsigned char>(c); 125 return g_trigram_chars.Get()[uc]; 126} 127 128Trigram TrigramAtIndex(const vector<TrigramChar>& trigram_chars, size_t index) { 129 static int kTrigramCharacterCountSquared = 130 kTrigramCharacterCount * kTrigramCharacterCount; 131 if (trigram_chars[index] == kUndefinedTrigramChar || 132 trigram_chars[index + 1] == kUndefinedTrigramChar || 133 trigram_chars[index + 2] == kUndefinedTrigramChar) 134 return kUndefinedTrigram; 135 Trigram trigram = kTrigramCharacterCountSquared * trigram_chars[index] + 136 kTrigramCharacterCount * trigram_chars[index + 1] + 137 trigram_chars[index + 2]; 138 return trigram; 139} 140 141Index::Index() : last_file_id_(0) { 142 index_.resize(kTrigramCount); 143 is_normalized_.resize(kTrigramCount); 144 std::fill(is_normalized_.begin(), is_normalized_.end(), true); 145} 146 147Index::~Index() {} 148 149Time Index::LastModifiedTimeForFile(const FilePath& file_path) { 150 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 151 Time last_modified_time; 152 if (index_times_.find(file_path) != index_times_.end()) 153 last_modified_time = index_times_[file_path]; 154 return last_modified_time; 155} 156 157void Index::SetTrigramsForFile(const FilePath& file_path, 158 const vector<Trigram>& index, 159 const Time& time) { 160 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 161 FileId file_id = GetFileId(file_path); 162 vector<Trigram>::const_iterator it = index.begin(); 163 for (; it != index.end(); ++it) { 164 Trigram trigram = *it; 165 index_[trigram].push_back(file_id); 166 is_normalized_[trigram] = false; 167 } 168 index_times_[file_path] = time; 169} 170 171vector<FilePath> Index::Search(string query) { 172 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 173 const char* data = query.c_str(); 174 vector<TrigramChar> trigram_chars; 175 trigram_chars.reserve(query.size()); 176 for (size_t i = 0; i < query.size(); ++i) 177 trigram_chars.push_back(TrigramCharForChar(data[i])); 178 vector<Trigram> trigrams; 179 for (size_t i = 0; i + 2 < query.size(); ++i) { 180 Trigram trigram = TrigramAtIndex(trigram_chars, i); 181 if (trigram != kUndefinedTrigram) 182 trigrams.push_back(trigram); 183 } 184 set<FileId> file_ids; 185 bool first = true; 186 vector<Trigram>::const_iterator it = trigrams.begin(); 187 for (; it != trigrams.end(); ++it) { 188 Trigram trigram = *it; 189 if (first) { 190 std::copy(index_[trigram].begin(), 191 index_[trigram].end(), 192 std::inserter(file_ids, file_ids.begin())); 193 first = false; 194 continue; 195 } 196 set<FileId> intersection = base::STLSetIntersection<set<FileId> >( 197 file_ids, index_[trigram]); 198 file_ids.swap(intersection); 199 } 200 vector<FilePath> result; 201 FileIdsMap::const_iterator ids_it = file_ids_.begin(); 202 for (; ids_it != file_ids_.end(); ++ids_it) { 203 if (trigrams.size() == 0 || 204 file_ids.find(ids_it->second) != file_ids.end()) { 205 result.push_back(ids_it->first); 206 } 207 } 208 return result; 209} 210 211FileId Index::GetFileId(const FilePath& file_path) { 212 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 213 string file_path_str = file_path.AsUTF8Unsafe(); 214 if (file_ids_.find(file_path) != file_ids_.end()) 215 return file_ids_[file_path]; 216 file_ids_[file_path] = ++last_file_id_; 217 return last_file_id_; 218} 219 220void Index::NormalizeVectors() { 221 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 222 for (size_t i = 0; i < kTrigramCount; ++i) { 223 if (!is_normalized_[i]) { 224 std::sort(index_[i].begin(), index_[i].end()); 225 if (index_[i].capacity() > index_[i].size()) 226 vector<FileId>(index_[i]).swap(index_[i]); 227 is_normalized_[i] = true; 228 } 229 } 230} 231 232void Index::PrintStats() { 233 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 234 LOG(ERROR) << "Index stats:"; 235 size_t size = 0; 236 size_t maxSize = 0; 237 size_t capacity = 0; 238 for (size_t i = 0; i < kTrigramCount; ++i) { 239 if (index_[i].size() > maxSize) 240 maxSize = index_[i].size(); 241 size += index_[i].size(); 242 capacity += index_[i].capacity(); 243 } 244 LOG(ERROR) << " - total trigram count: " << size; 245 LOG(ERROR) << " - max file count per trigram: " << maxSize; 246 LOG(ERROR) << " - total vectors capacity " << capacity; 247 size_t total_index_size = 248 capacity * sizeof(FileId) + sizeof(vector<FileId>) * kTrigramCount; 249 LOG(ERROR) << " - estimated total index size " << total_index_size; 250} 251 252typedef Callback<void(bool, const vector<bool>&)> IndexerCallback; 253 254} // namespace 255 256DevToolsFileSystemIndexer::FileSystemIndexingJob::FileSystemIndexingJob( 257 const FilePath& file_system_path, 258 const TotalWorkCallback& total_work_callback, 259 const WorkedCallback& worked_callback, 260 const DoneCallback& done_callback) 261 : file_system_path_(file_system_path), 262 total_work_callback_(total_work_callback), 263 worked_callback_(worked_callback), 264 done_callback_(done_callback), 265 current_file_(BrowserThread::GetMessageLoopProxyForThread( 266 BrowserThread::FILE).get()), 267 files_indexed_(0), 268 stopped_(false) { 269 current_trigrams_set_.resize(kTrigramCount); 270 current_trigrams_.reserve(kTrigramCount); 271} 272 273DevToolsFileSystemIndexer::FileSystemIndexingJob::~FileSystemIndexingJob() {} 274 275void DevToolsFileSystemIndexer::FileSystemIndexingJob::Start() { 276 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 277 BrowserThread::PostTask( 278 BrowserThread::FILE, 279 FROM_HERE, 280 Bind(&FileSystemIndexingJob::CollectFilesToIndex, this)); 281} 282 283void DevToolsFileSystemIndexer::FileSystemIndexingJob::Stop() { 284 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 285 BrowserThread::PostTask(BrowserThread::FILE, 286 FROM_HERE, 287 Bind(&FileSystemIndexingJob::StopOnFileThread, this)); 288} 289 290void DevToolsFileSystemIndexer::FileSystemIndexingJob::StopOnFileThread() { 291 stopped_ = true; 292} 293 294void DevToolsFileSystemIndexer::FileSystemIndexingJob::CollectFilesToIndex() { 295 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 296 if (stopped_) 297 return; 298 if (!file_enumerator_) { 299 file_enumerator_.reset( 300 new FileEnumerator(file_system_path_, true, FileEnumerator::FILES)); 301 } 302 FilePath file_path = file_enumerator_->Next(); 303 if (file_path.empty()) { 304 BrowserThread::PostTask( 305 BrowserThread::UI, 306 FROM_HERE, 307 Bind(total_work_callback_, file_path_times_.size())); 308 indexing_it_ = file_path_times_.begin(); 309 IndexFiles(); 310 return; 311 } 312 Time saved_last_modified_time = 313 g_trigram_index.Get().LastModifiedTimeForFile(file_path); 314 FileEnumerator::FileInfo file_info = file_enumerator_->GetInfo(); 315 Time current_last_modified_time = file_info.GetLastModifiedTime(); 316 if (current_last_modified_time > saved_last_modified_time) { 317 file_path_times_[file_path] = current_last_modified_time; 318 } 319 BrowserThread::PostTask( 320 BrowserThread::FILE, 321 FROM_HERE, 322 Bind(&FileSystemIndexingJob::CollectFilesToIndex, this)); 323} 324 325void DevToolsFileSystemIndexer::FileSystemIndexingJob::IndexFiles() { 326 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 327 if (stopped_) 328 return; 329 if (indexing_it_ == file_path_times_.end()) { 330 g_trigram_index.Get().NormalizeVectors(); 331 BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, done_callback_); 332 return; 333 } 334 FilePath file_path = indexing_it_->first; 335 current_file_.CreateOrOpen( 336 file_path, 337 base::File::FLAG_OPEN | base::File::FLAG_READ, 338 Bind(&FileSystemIndexingJob::StartFileIndexing, this)); 339} 340 341void DevToolsFileSystemIndexer::FileSystemIndexingJob::StartFileIndexing( 342 base::File::Error error) { 343 if (!current_file_.IsValid()) { 344 FinishFileIndexing(false); 345 return; 346 } 347 current_file_offset_ = 0; 348 current_trigrams_.clear(); 349 std::fill(current_trigrams_set_.begin(), current_trigrams_set_.end(), false); 350 ReadFromFile(); 351} 352 353void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReadFromFile() { 354 if (stopped_) { 355 CloseFile(); 356 return; 357 } 358 current_file_.Read(current_file_offset_, kMaxReadLength, 359 Bind(&FileSystemIndexingJob::OnRead, this)); 360} 361 362void DevToolsFileSystemIndexer::FileSystemIndexingJob::OnRead( 363 base::File::Error error, 364 const char* data, 365 int bytes_read) { 366 if (error != base::File::FILE_OK) { 367 FinishFileIndexing(false); 368 return; 369 } 370 371 if (!bytes_read || bytes_read < 3) { 372 FinishFileIndexing(true); 373 return; 374 } 375 376 size_t size = static_cast<size_t>(bytes_read); 377 vector<TrigramChar> trigram_chars; 378 trigram_chars.reserve(size); 379 for (size_t i = 0; i < size; ++i) { 380 if (IsBinaryChar(data[i])) { 381 current_trigrams_.clear(); 382 FinishFileIndexing(true); 383 return; 384 } 385 trigram_chars.push_back(TrigramCharForChar(data[i])); 386 } 387 388 for (size_t i = 0; i + 2 < size; ++i) { 389 Trigram trigram = TrigramAtIndex(trigram_chars, i); 390 if ((trigram != kUndefinedTrigram) && !current_trigrams_set_[trigram]) { 391 current_trigrams_set_[trigram] = true; 392 current_trigrams_.push_back(trigram); 393 } 394 } 395 current_file_offset_ += bytes_read - 2; 396 ReadFromFile(); 397} 398 399void DevToolsFileSystemIndexer::FileSystemIndexingJob::FinishFileIndexing( 400 bool success) { 401 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 402 CloseFile(); 403 if (success) { 404 FilePath file_path = indexing_it_->first; 405 g_trigram_index.Get().SetTrigramsForFile( 406 file_path, current_trigrams_, file_path_times_[file_path]); 407 } 408 ReportWorked(); 409 ++indexing_it_; 410 IndexFiles(); 411} 412 413void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseFile() { 414 if (current_file_.IsValid()) 415 current_file_.Close(Bind(&FileSystemIndexingJob::CloseCallback, this)); 416} 417 418void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseCallback( 419 base::File::Error error) {} 420 421void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReportWorked() { 422 TimeTicks current_time = TimeTicks::Now(); 423 bool should_send_worked_nitification = true; 424 if (!last_worked_notification_time_.is_null()) { 425 TimeDelta delta = current_time - last_worked_notification_time_; 426 if (delta.InMilliseconds() < kMinTimeoutBetweenWorkedNitification) 427 should_send_worked_nitification = false; 428 } 429 ++files_indexed_; 430 if (should_send_worked_nitification) { 431 last_worked_notification_time_ = current_time; 432 BrowserThread::PostTask( 433 BrowserThread::UI, FROM_HERE, Bind(worked_callback_, files_indexed_)); 434 files_indexed_ = 0; 435 } 436} 437 438DevToolsFileSystemIndexer::DevToolsFileSystemIndexer() { 439 static bool maps_initialized = false; 440 if (!maps_initialized) { 441 InitIsBinaryCharMap(); 442 InitTrigramCharsMap(); 443 maps_initialized = true; 444 } 445} 446 447DevToolsFileSystemIndexer::~DevToolsFileSystemIndexer() {} 448 449scoped_refptr<DevToolsFileSystemIndexer::FileSystemIndexingJob> 450DevToolsFileSystemIndexer::IndexPath( 451 const string& file_system_path, 452 const TotalWorkCallback& total_work_callback, 453 const WorkedCallback& worked_callback, 454 const DoneCallback& done_callback) { 455 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 456 scoped_refptr<FileSystemIndexingJob> indexing_job = 457 new FileSystemIndexingJob(FilePath::FromUTF8Unsafe(file_system_path), 458 total_work_callback, 459 worked_callback, 460 done_callback); 461 indexing_job->Start(); 462 return indexing_job; 463} 464 465void DevToolsFileSystemIndexer::SearchInPath(const string& file_system_path, 466 const string& query, 467 const SearchCallback& callback) { 468 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 469 BrowserThread::PostTask( 470 BrowserThread::FILE, 471 FROM_HERE, 472 Bind(&DevToolsFileSystemIndexer::SearchInPathOnFileThread, 473 this, 474 file_system_path, 475 query, 476 callback)); 477} 478 479void DevToolsFileSystemIndexer::SearchInPathOnFileThread( 480 const string& file_system_path, 481 const string& query, 482 const SearchCallback& callback) { 483 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 484 vector<FilePath> file_paths = g_trigram_index.Get().Search(query); 485 vector<string> result; 486 FilePath path = FilePath::FromUTF8Unsafe(file_system_path); 487 vector<FilePath>::const_iterator it = file_paths.begin(); 488 for (; it != file_paths.end(); ++it) { 489 if (path.IsParent(*it)) 490 result.push_back(it->AsUTF8Unsafe()); 491 } 492 BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, Bind(callback, result)); 493} 494