1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/devtools/devtools_file_system_indexer.h"
6
7#include <iterator>
8
9#include "base/bind.h"
10#include "base/callback.h"
11#include "base/file_util.h"
12#include "base/files/file_enumerator.h"
13#include "base/files/file_util_proxy.h"
14#include "base/lazy_instance.h"
15#include "base/logging.h"
16#include "base/stl_util.h"
17#include "base/strings/utf_string_conversions.h"
18#include "content/public/browser/browser_thread.h"
19
20using base::Bind;
21using base::Callback;
22using base::FileEnumerator;
23using base::FilePath;
24using base::Time;
25using base::TimeDelta;
26using base::TimeTicks;
27using content::BrowserThread;
28using std::map;
29using std::set;
30using std::string;
31using std::vector;
32
33namespace {
34
35typedef int32 Trigram;
36typedef char TrigramChar;
37typedef uint16 FileId;
38
39const int kMinTimeoutBetweenWorkedNitification = 200;
40// Trigram characters include all ASCII printable characters (32-126) except for
41// the capital letters, because the index is case insensitive.
42const size_t kTrigramCharacterCount = 126 - 'Z' - 1 + 'A' - ' ' + 1;
43const size_t kTrigramCount =
44    kTrigramCharacterCount * kTrigramCharacterCount * kTrigramCharacterCount;
45const int kMaxReadLength = 10 * 1024;
46const TrigramChar kUndefinedTrigramChar = -1;
47const Trigram kUndefinedTrigram = -1;
48
49base::LazyInstance<vector<bool> >::Leaky g_is_binary_char =
50    LAZY_INSTANCE_INITIALIZER;
51
52base::LazyInstance<vector<TrigramChar> >::Leaky g_trigram_chars =
53    LAZY_INSTANCE_INITIALIZER;
54
55class Index {
56 public:
57  Index();
58  Time LastModifiedTimeForFile(const FilePath& file_path);
59  void SetTrigramsForFile(const FilePath& file_path,
60                          const vector<Trigram>& index,
61                          const Time& time);
62  vector<FilePath> Search(string query);
63  void PrintStats();
64  void NormalizeVectors();
65
66 private:
67  ~Index();
68
69  FileId GetFileId(const FilePath& file_path);
70
71  typedef map<FilePath, FileId> FileIdsMap;
72  FileIdsMap file_ids_;
73  FileId last_file_id_;
74  // The index in this vector is the trigram id.
75  vector<vector<FileId> > index_;
76  typedef map<FilePath, Time> IndexedFilesMap;
77  IndexedFilesMap index_times_;
78  vector<bool> is_normalized_;
79
80  DISALLOW_COPY_AND_ASSIGN(Index);
81};
82
83base::LazyInstance<Index>::Leaky g_trigram_index = LAZY_INSTANCE_INITIALIZER;
84
85void InitIsBinaryCharMap() {
86  for (size_t i = 0; i < 256; ++i) {
87    unsigned char ch = i;
88    bool is_binary_char = ch < 9 || (ch >= 14 && ch < 32) || ch == 127;
89    g_is_binary_char.Get().push_back(is_binary_char);
90  }
91}
92
93bool IsBinaryChar(char c) {
94  unsigned char uc = static_cast<unsigned char>(c);
95  return g_is_binary_char.Get()[uc];
96}
97
98void InitTrigramCharsMap() {
99  for (size_t i = 0; i < 256; ++i) {
100    if (i > 127) {
101      g_trigram_chars.Get().push_back(kUndefinedTrigramChar);
102      continue;
103    }
104    char ch = i;
105    if (ch == '\t')
106      ch = ' ';
107    if (ch >= 'A' && ch <= 'Z')
108      ch = ch - 'A' + 'a';
109    if ((IsBinaryChar(ch)) || (ch < ' ')) {
110      g_trigram_chars.Get().push_back(kUndefinedTrigramChar);
111      continue;
112    }
113
114    if (ch >= 'Z')
115      ch = ch - 'Z' - 1 + 'A';
116    ch -= ' ';
117    char signed_trigram_char_count = static_cast<char>(kTrigramCharacterCount);
118    CHECK(ch >= 0 && ch < signed_trigram_char_count);
119    g_trigram_chars.Get().push_back(ch);
120  }
121}
122
123TrigramChar TrigramCharForChar(char c) {
124  unsigned char uc = static_cast<unsigned char>(c);
125  return g_trigram_chars.Get()[uc];
126}
127
128Trigram TrigramAtIndex(const vector<TrigramChar>& trigram_chars, size_t index) {
129  static int kTrigramCharacterCountSquared =
130      kTrigramCharacterCount * kTrigramCharacterCount;
131  if (trigram_chars[index] == kUndefinedTrigramChar ||
132      trigram_chars[index + 1] == kUndefinedTrigramChar ||
133      trigram_chars[index + 2] == kUndefinedTrigramChar)
134    return kUndefinedTrigram;
135  Trigram trigram = kTrigramCharacterCountSquared * trigram_chars[index] +
136                    kTrigramCharacterCount * trigram_chars[index + 1] +
137                    trigram_chars[index + 2];
138  return trigram;
139}
140
141Index::Index() : last_file_id_(0) {
142  index_.resize(kTrigramCount);
143  is_normalized_.resize(kTrigramCount);
144  std::fill(is_normalized_.begin(), is_normalized_.end(), true);
145}
146
147Index::~Index() {}
148
149Time Index::LastModifiedTimeForFile(const FilePath& file_path) {
150  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
151  Time last_modified_time;
152  if (index_times_.find(file_path) != index_times_.end())
153    last_modified_time = index_times_[file_path];
154  return last_modified_time;
155}
156
157void Index::SetTrigramsForFile(const FilePath& file_path,
158                               const vector<Trigram>& index,
159                               const Time& time) {
160  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
161  FileId file_id = GetFileId(file_path);
162  vector<Trigram>::const_iterator it = index.begin();
163  for (; it != index.end(); ++it) {
164    Trigram trigram = *it;
165    index_[trigram].push_back(file_id);
166    is_normalized_[trigram] = false;
167  }
168  index_times_[file_path] = time;
169}
170
171vector<FilePath> Index::Search(string query) {
172  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
173  const char* data = query.c_str();
174  vector<TrigramChar> trigram_chars;
175  trigram_chars.reserve(query.size());
176  for (size_t i = 0; i < query.size(); ++i)
177      trigram_chars.push_back(TrigramCharForChar(data[i]));
178  vector<Trigram> trigrams;
179  for (size_t i = 0; i + 2 < query.size(); ++i) {
180    Trigram trigram = TrigramAtIndex(trigram_chars, i);
181    if (trigram != kUndefinedTrigram)
182      trigrams.push_back(trigram);
183  }
184  set<FileId> file_ids;
185  bool first = true;
186  vector<Trigram>::const_iterator it = trigrams.begin();
187  for (; it != trigrams.end(); ++it) {
188    Trigram trigram = *it;
189    if (first) {
190      std::copy(index_[trigram].begin(),
191                index_[trigram].end(),
192                std::inserter(file_ids, file_ids.begin()));
193      first = false;
194      continue;
195    }
196    set<FileId> intersection = base::STLSetIntersection<set<FileId> >(
197        file_ids, index_[trigram]);
198    file_ids.swap(intersection);
199  }
200  vector<FilePath> result;
201  FileIdsMap::const_iterator ids_it = file_ids_.begin();
202  for (; ids_it != file_ids_.end(); ++ids_it) {
203    if (trigrams.size() == 0 ||
204        file_ids.find(ids_it->second) != file_ids.end()) {
205      result.push_back(ids_it->first);
206    }
207  }
208  return result;
209}
210
211FileId Index::GetFileId(const FilePath& file_path) {
212  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
213  string file_path_str = file_path.AsUTF8Unsafe();
214  if (file_ids_.find(file_path) != file_ids_.end())
215    return file_ids_[file_path];
216  file_ids_[file_path] = ++last_file_id_;
217  return last_file_id_;
218}
219
220void Index::NormalizeVectors() {
221  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
222  for (size_t i = 0; i < kTrigramCount; ++i) {
223    if (!is_normalized_[i]) {
224      std::sort(index_[i].begin(), index_[i].end());
225      if (index_[i].capacity() > index_[i].size())
226        vector<FileId>(index_[i]).swap(index_[i]);
227      is_normalized_[i] = true;
228    }
229  }
230}
231
232void Index::PrintStats() {
233  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
234  LOG(ERROR) << "Index stats:";
235  size_t size = 0;
236  size_t maxSize = 0;
237  size_t capacity = 0;
238  for (size_t i = 0; i < kTrigramCount; ++i) {
239    if (index_[i].size() > maxSize)
240      maxSize = index_[i].size();
241    size += index_[i].size();
242    capacity += index_[i].capacity();
243  }
244  LOG(ERROR) << "  - total trigram count: " << size;
245  LOG(ERROR) << "  - max file count per trigram: " << maxSize;
246  LOG(ERROR) << "  - total vectors capacity " << capacity;
247  size_t total_index_size =
248      capacity * sizeof(FileId) + sizeof(vector<FileId>) * kTrigramCount;
249  LOG(ERROR) << "  - estimated total index size " << total_index_size;
250}
251
252typedef Callback<void(bool, const vector<bool>&)> IndexerCallback;
253
254}  // namespace
255
256DevToolsFileSystemIndexer::FileSystemIndexingJob::FileSystemIndexingJob(
257    const FilePath& file_system_path,
258    const TotalWorkCallback& total_work_callback,
259    const WorkedCallback& worked_callback,
260    const DoneCallback& done_callback)
261    : file_system_path_(file_system_path),
262      total_work_callback_(total_work_callback),
263      worked_callback_(worked_callback),
264      done_callback_(done_callback),
265      current_file_(BrowserThread::GetMessageLoopProxyForThread(
266                        BrowserThread::FILE).get()),
267      files_indexed_(0),
268      stopped_(false) {
269  current_trigrams_set_.resize(kTrigramCount);
270  current_trigrams_.reserve(kTrigramCount);
271}
272
273DevToolsFileSystemIndexer::FileSystemIndexingJob::~FileSystemIndexingJob() {}
274
275void DevToolsFileSystemIndexer::FileSystemIndexingJob::Start() {
276  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
277  BrowserThread::PostTask(
278      BrowserThread::FILE,
279      FROM_HERE,
280      Bind(&FileSystemIndexingJob::CollectFilesToIndex, this));
281}
282
283void DevToolsFileSystemIndexer::FileSystemIndexingJob::Stop() {
284  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
285  BrowserThread::PostTask(BrowserThread::FILE,
286                          FROM_HERE,
287                          Bind(&FileSystemIndexingJob::StopOnFileThread, this));
288}
289
290void DevToolsFileSystemIndexer::FileSystemIndexingJob::StopOnFileThread() {
291  stopped_ = true;
292}
293
294void DevToolsFileSystemIndexer::FileSystemIndexingJob::CollectFilesToIndex() {
295  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
296  if (stopped_)
297    return;
298  if (!file_enumerator_) {
299    file_enumerator_.reset(
300        new FileEnumerator(file_system_path_, true, FileEnumerator::FILES));
301  }
302  FilePath file_path = file_enumerator_->Next();
303  if (file_path.empty()) {
304    BrowserThread::PostTask(
305        BrowserThread::UI,
306        FROM_HERE,
307        Bind(total_work_callback_, file_path_times_.size()));
308    indexing_it_ = file_path_times_.begin();
309    IndexFiles();
310    return;
311  }
312  Time saved_last_modified_time =
313      g_trigram_index.Get().LastModifiedTimeForFile(file_path);
314  FileEnumerator::FileInfo file_info = file_enumerator_->GetInfo();
315  Time current_last_modified_time = file_info.GetLastModifiedTime();
316  if (current_last_modified_time > saved_last_modified_time) {
317    file_path_times_[file_path] = current_last_modified_time;
318  }
319  BrowserThread::PostTask(
320      BrowserThread::FILE,
321      FROM_HERE,
322      Bind(&FileSystemIndexingJob::CollectFilesToIndex, this));
323}
324
325void DevToolsFileSystemIndexer::FileSystemIndexingJob::IndexFiles() {
326  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
327  if (stopped_)
328    return;
329  if (indexing_it_ == file_path_times_.end()) {
330    g_trigram_index.Get().NormalizeVectors();
331    BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, done_callback_);
332    return;
333  }
334  FilePath file_path = indexing_it_->first;
335  current_file_.CreateOrOpen(
336        file_path,
337        base::File::FLAG_OPEN | base::File::FLAG_READ,
338        Bind(&FileSystemIndexingJob::StartFileIndexing, this));
339}
340
341void DevToolsFileSystemIndexer::FileSystemIndexingJob::StartFileIndexing(
342    base::File::Error error) {
343  if (!current_file_.IsValid()) {
344    FinishFileIndexing(false);
345    return;
346  }
347  current_file_offset_ = 0;
348  current_trigrams_.clear();
349  std::fill(current_trigrams_set_.begin(), current_trigrams_set_.end(), false);
350  ReadFromFile();
351}
352
353void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReadFromFile() {
354  if (stopped_) {
355    CloseFile();
356    return;
357  }
358  current_file_.Read(current_file_offset_, kMaxReadLength,
359                     Bind(&FileSystemIndexingJob::OnRead, this));
360}
361
362void DevToolsFileSystemIndexer::FileSystemIndexingJob::OnRead(
363    base::File::Error error,
364    const char* data,
365    int bytes_read) {
366  if (error != base::File::FILE_OK) {
367    FinishFileIndexing(false);
368    return;
369  }
370
371  if (!bytes_read || bytes_read < 3) {
372    FinishFileIndexing(true);
373    return;
374  }
375
376  size_t size = static_cast<size_t>(bytes_read);
377  vector<TrigramChar> trigram_chars;
378  trigram_chars.reserve(size);
379  for (size_t i = 0; i < size; ++i) {
380    if (IsBinaryChar(data[i])) {
381      current_trigrams_.clear();
382      FinishFileIndexing(true);
383      return;
384    }
385    trigram_chars.push_back(TrigramCharForChar(data[i]));
386  }
387
388  for (size_t i = 0; i + 2 < size; ++i) {
389    Trigram trigram = TrigramAtIndex(trigram_chars, i);
390    if ((trigram != kUndefinedTrigram) && !current_trigrams_set_[trigram]) {
391      current_trigrams_set_[trigram] = true;
392      current_trigrams_.push_back(trigram);
393    }
394  }
395  current_file_offset_ += bytes_read - 2;
396  ReadFromFile();
397}
398
399void DevToolsFileSystemIndexer::FileSystemIndexingJob::FinishFileIndexing(
400    bool success) {
401  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
402  CloseFile();
403  if (success) {
404    FilePath file_path = indexing_it_->first;
405    g_trigram_index.Get().SetTrigramsForFile(
406        file_path, current_trigrams_, file_path_times_[file_path]);
407  }
408  ReportWorked();
409  ++indexing_it_;
410  IndexFiles();
411}
412
413void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseFile() {
414  if (current_file_.IsValid())
415    current_file_.Close(Bind(&FileSystemIndexingJob::CloseCallback, this));
416}
417
418void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseCallback(
419    base::File::Error error) {}
420
421void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReportWorked() {
422  TimeTicks current_time = TimeTicks::Now();
423  bool should_send_worked_nitification = true;
424  if (!last_worked_notification_time_.is_null()) {
425    TimeDelta delta = current_time - last_worked_notification_time_;
426    if (delta.InMilliseconds() < kMinTimeoutBetweenWorkedNitification)
427      should_send_worked_nitification = false;
428  }
429  ++files_indexed_;
430  if (should_send_worked_nitification) {
431    last_worked_notification_time_ = current_time;
432    BrowserThread::PostTask(
433        BrowserThread::UI, FROM_HERE, Bind(worked_callback_, files_indexed_));
434    files_indexed_ = 0;
435  }
436}
437
438DevToolsFileSystemIndexer::DevToolsFileSystemIndexer() {
439  static bool maps_initialized = false;
440  if (!maps_initialized) {
441    InitIsBinaryCharMap();
442    InitTrigramCharsMap();
443    maps_initialized = true;
444  }
445}
446
447DevToolsFileSystemIndexer::~DevToolsFileSystemIndexer() {}
448
449scoped_refptr<DevToolsFileSystemIndexer::FileSystemIndexingJob>
450DevToolsFileSystemIndexer::IndexPath(
451    const string& file_system_path,
452    const TotalWorkCallback& total_work_callback,
453    const WorkedCallback& worked_callback,
454    const DoneCallback& done_callback) {
455  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
456  scoped_refptr<FileSystemIndexingJob> indexing_job =
457      new FileSystemIndexingJob(FilePath::FromUTF8Unsafe(file_system_path),
458                                total_work_callback,
459                                worked_callback,
460                                done_callback);
461  indexing_job->Start();
462  return indexing_job;
463}
464
465void DevToolsFileSystemIndexer::SearchInPath(const string& file_system_path,
466                                             const string& query,
467                                             const SearchCallback& callback) {
468  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
469  BrowserThread::PostTask(
470      BrowserThread::FILE,
471      FROM_HERE,
472      Bind(&DevToolsFileSystemIndexer::SearchInPathOnFileThread,
473           this,
474           file_system_path,
475           query,
476           callback));
477}
478
479void DevToolsFileSystemIndexer::SearchInPathOnFileThread(
480    const string& file_system_path,
481    const string& query,
482    const SearchCallback& callback) {
483  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
484  vector<FilePath> file_paths = g_trigram_index.Get().Search(query);
485  vector<string> result;
486  FilePath path = FilePath::FromUTF8Unsafe(file_system_path);
487  vector<FilePath>::const_iterator it = file_paths.begin();
488  for (; it != file_paths.end(); ++it) {
489    if (path.IsParent(*it))
490      result.push_back(it->AsUTF8Unsafe());
491  }
492  BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, Bind(callback, result));
493}
494