// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/safe_browsing/safe_browsing_store_file.h"

#include "base/files/file_util.h"
#include "base/files/scoped_file.h"
#include "base/md5.h"
#include "base/metrics/histogram.h"
#include "base/metrics/sparse_histogram.h"

namespace {

// NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
// that byte-order changes force corruption.
const int32 kFileMagic = 0x600D71FE;

// Version history:
// Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite)
// Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28
// Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27
const int32 kFileVersion = 8;

// ReadAndVerifyHeader() returns this in case of error.
const int32 kInvalidVersion = -1;

// Starting with version 8, the storage is sorted and can be sharded to allow
// updates to be done with lower memory requirements.  Newly written files will
// be sharded to need less than this amount of memory during update.  Larger
// values are preferred to minimize looping overhead during processing.
const int64 kUpdateStorageBytes = 100 * 1024;
// Prevent excessive sharding by setting a lower limit on the shard stride.
// Smaller values should work fine, but very small values will probably lead
// to poor performance.  Shard stride is indirectly related to
// |kUpdateStorageBytes|; setting that very small will bump against this.
const uint32 kMinShardStride = 1 << 24;

// Strides over the entire SBPrefix space.
const uint64 kMaxShardStride = 1ULL << 32;

// Maximum SBPrefix value.
const SBPrefix kMaxSBPrefix = 0xFFFFFFFF;

// Header at the front of the main database file.
struct FileHeader {
  int32 magic, version;
  uint32 add_chunk_count, sub_chunk_count;
  uint32 shard_stride;
  // TODO(shess): Is this where 64-bit will bite me?  Perhaps write a
  // specialized read/write?
};
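
// On-disk layout of a version-8 store, as written by WriteHeader() and
// DoUpdate() below (a sketch inferred from the code; the code is
// authoritative):
//   FileHeader
//   add chunk ids  (int32 * add_chunk_count)
//   sub chunk ids  (int32 * sub_chunk_count)
//   MD5 digest of all preceding bytes (header checksum)
//   repeated shards, each:
//     ShardHeader
//     add prefixes, sub prefixes, add full hashes, sub full hashes
//   MD5 digest of all preceding bytes (whole-file checksum)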

// Header for each chunk in the chunk-accumulation file.
struct ChunkHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Header for each shard of data in the main database file.
struct ShardHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Enumerate different format-change events for histogramming
// purposes.  DO NOT CHANGE THE ORDERING OF THESE VALUES.
enum FormatEventType {
  // Corruption detected, broken down by file format.
  FORMAT_EVENT_FILE_CORRUPT,
  FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete

  // The type of format found in the file.  The expected case (new
  // file format) is intentionally not covered.
  FORMAT_EVENT_FOUND_SQLITE,  // Obsolete
  FORMAT_EVENT_FOUND_UNKNOWN,  // magic does not match.

  // The number of SQLite-format files deleted should be the same as
  // FORMAT_EVENT_FOUND_SQLITE.  It can differ if the delete fails,
  // or if a failure prevents the update from succeeding.
  FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
  FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete

  // Found and deleted (or failed to delete) the ancient "Safe
  // Browsing" file.
  FORMAT_EVENT_DELETED_ORIGINAL,  // Obsolete
  FORMAT_EVENT_DELETED_ORIGINAL_FAILED,  // Obsolete

  // The checksum did not check out in CheckValidity() or in
  // FinishUpdate().  This most likely indicates that the machine
  // crashed before the file was fully sync'ed to disk.
  FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
  FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,

  // The header checksum was incorrect in ReadAndVerifyHeader().  Likely
  // indicates that the system crashed while writing an update.
  FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,

  FORMAT_EVENT_FOUND_DEPRECATED,  // version too old.

  // Memory space for histograms is determined by the max.  ALWAYS
  // ADD NEW VALUES BEFORE THIS ONE.
  FORMAT_EVENT_MAX
};

void RecordFormatEvent(FormatEventType event_type) {
  UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
}

// Rewind the file.  Using fseek(2) because rewind(3) errors are
// weird.
bool FileRewind(FILE* fp) {
  int rv = fseek(fp, 0, SEEK_SET);
  DCHECK_EQ(rv, 0);
  return rv == 0;
}

// Read from |fp| into |item|, and fold the input data into the
// checksum in |context|, if non-NULL.  Return true on success.
template <class T>
bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fread(item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<char*>(item),
                                      sizeof(T)));
  }
  return true;
}

// Write |item| to |fp|, and fold the output data into the checksum in
// |context|, if non-NULL.  Return true on success.
template <class T>
bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fwrite(&item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<const char*>(&item),
                                      sizeof(T)));
  }

  return true;
}
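
// Example (illustrative only): write a value while folding it into a
// running checksum; ReadItem() mirrors this on the read side.
//   base::MD5Context context;
//   base::MD5Init(&context);
//   const uint32 value = 42;
//   if (!WriteItem(value, fp, &context))
//     return false;  // Short write.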

// Read |count| items into |values| from |fp|, and fold them into the
// checksum in |context|.  Returns true on success.
template <typename CT>
bool ReadToContainer(CT* values, size_t count, FILE* fp,
                     base::MD5Context* context) {
  if (!count)
    return true;

  for (size_t i = 0; i < count; ++i) {
    typename CT::value_type value;
    if (!ReadItem(&value, fp, context))
      return false;

    // push_back() is more obvious, but coded this way std::set can
    // also be read.
    values->insert(values->end(), value);
  }

  return true;
}
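
// Example (illustrative only): the insert-at-end form above lets the same
// helper fill either a std::vector or a std::set.
//   std::set<int32> add_chunks;
//   if (!ReadToContainer(&add_chunks, header.add_chunk_count, fp, &context))
//     return false;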

// Write values between |beg| and |end| to |fp|, and fold the data into the
// checksum in |context|, if non-NULL.  Returns true if all items were
// written successfully.
template <typename CTI>
bool WriteRange(const CTI& beg, const CTI& end,
                FILE* fp, base::MD5Context* context) {
  for (CTI iter = beg; iter != end; ++iter) {
    if (!WriteItem(*iter, fp, context))
      return false;
  }
  return true;
}

// Write all of |values| to |fp|, and fold the data into the checksum
// in |context|, if non-NULL.  Returns true if all items were written
// successfully.
template <typename CT>
bool WriteContainer(const CT& values, FILE* fp,
                    base::MD5Context* context) {
  return WriteRange(values.begin(), values.end(), fp, context);
}

// Delete the chunks in |deleted| from |chunks|.
void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
                         std::set<int32>* chunks) {
  for (std::set<int32>::iterator iter = chunks->begin();
       iter != chunks->end();) {
    std::set<int32>::iterator prev = iter++;
    if (deleted.count(*prev) > 0)
      chunks->erase(prev);
  }
}
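
// Example (illustrative only): if |chunks| holds {1, 2, 3} and |deleted|
// contains 2, DeleteChunksFromSet(deleted, &chunks) leaves {1, 3}.  The
// post-increment above keeps the iterator valid across std::set::erase().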

bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
  base::MD5Digest calculated_digest;
  base::MD5IntermediateFinal(&calculated_digest, context);

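  // Note that the digest read below is itself folded into |context|, so a
  // later whole-file checksum also covers the stored header checksum.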
  base::MD5Digest file_digest;
  if (!ReadItem(&file_digest, fp, context))
    return false;

  return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
}

// Helper function to read the file header and chunk TOC.  Rewinds |fp| and
// initializes |context|.  The header is left in |header|.  Returns the
// version read, or kInvalidVersion on a sanity-check or checksum failure.
int ReadAndVerifyHeader(const base::FilePath& filename,
                        FileHeader* header,
                        std::set<int32>* add_chunks,
                        std::set<int32>* sub_chunks,
                        FILE* fp,
                        base::MD5Context* context) {
  DCHECK(header);
  DCHECK(add_chunks);
  DCHECK(sub_chunks);
  DCHECK(fp);
  DCHECK(context);

  base::MD5Init(context);
  if (!FileRewind(fp))
    return kInvalidVersion;
  if (!ReadItem(header, fp, context))
    return kInvalidVersion;
  if (header->magic != kFileMagic)
    return kInvalidVersion;

  // Track version read to inform removal of support for older versions.
  UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->version);

  if (header->version != kFileVersion)
    return kInvalidVersion;

  if (!ReadToContainer(add_chunks, header->add_chunk_count, fp, context) ||
      !ReadToContainer(sub_chunks, header->sub_chunk_count, fp, context)) {
    return kInvalidVersion;
  }

  // Verify that the data read thus far is valid.
  if (!ReadAndVerifyChecksum(fp, context)) {
    RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
    return kInvalidVersion;
  }

  return kFileVersion;
}

// Helper function to write out the initial header and chunks-contained data.
// Rewinds |fp|, initializes |context|, then writes a file header and
// |add_chunks| and |sub_chunks|.
bool WriteHeader(uint32 out_stride,
                 const std::set<int32>& add_chunks,
                 const std::set<int32>& sub_chunks,
                 FILE* fp,
                 base::MD5Context* context) {
  if (!FileRewind(fp))
    return false;

  base::MD5Init(context);
  FileHeader header;
  header.magic = kFileMagic;
  header.version = kFileVersion;
  header.add_chunk_count = add_chunks.size();
  header.sub_chunk_count = sub_chunks.size();
  header.shard_stride = out_stride;
  if (!WriteItem(header, fp, context))
    return false;

  if (!WriteContainer(add_chunks, fp, context) ||
      !WriteContainer(sub_chunks, fp, context))
    return false;

  // Write out the header digest.
  base::MD5Digest header_digest;
  base::MD5IntermediateFinal(&header_digest, context);
  if (!WriteItem(header_digest, fp, context))
    return false;

  return true;
}

// Return |true| if the range is sorted by the given comparator.
template <typename CTI, typename LESS>
bool sorted(CTI beg, CTI end, LESS less) {
  while ((end - beg) > 1) {
    CTI n = beg++;
    DCHECK(!less(*beg, *n));
    if (less(*beg, *n))
      return false;
  }
  return true;
}

// Merge |beg|..|end| into |container|.  Both should be sorted by the given
// comparator, and the range iterators should not be derived from |container|.
// Differs from std::inplace_merge() in that additional memory is not required
// for linear performance.
template <typename CT, typename CTI, typename COMP>
void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
  DCHECK(sorted(container->begin(), container->end(), less));
  DCHECK(sorted(beg, end, less));

  // Size the container to fit the results.
  const size_t c_size = container->size();
  container->resize(c_size + (end - beg));

  // |c_end| points to the original endpoint, while |c_out| points to the
  // endpoint that will scan from end to beginning while merging.
  typename CT::iterator c_end = container->begin() + c_size;
  typename CT::iterator c_out = container->end();

  // While both inputs have data, move the greater to |c_out|.
  while (c_end != container->begin() && end != beg) {
    if (less(*(c_end - 1), *(end - 1))) {
      *(--c_out) = *(--end);
    } else {
      *(--c_out) = *(--c_end);
    }
  }

  // Copy any data remaining in the new range.
  if (end != beg) {
    // The original container data has been fully shifted.
    DCHECK(c_end == container->begin());

    // There is exactly the correct amount of space left.
    DCHECK_EQ(c_out - c_end, end - beg);

    std::copy(beg, end, container->begin());
  }

  DCHECK(sorted(container->begin(), container->end(), less));
}
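
// Example (illustrative only): merging two sorted vectors of int.
//   std::vector<int> dst;  // Holds {1, 3, 5}, sorted.
//   std::vector<int> src;  // Holds {2, 4}, sorted.
//   container_merge(&dst, src.begin(), src.end(), std::less<int>());
//   // dst is now {1, 2, 3, 4, 5}, merged in place from the back.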

// Collection of iterators used while stepping through StateInternal (see
// below).
class StateInternalPos {
 public:
  StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
                   SBSubPrefixes::iterator sub_prefixes_iter,
                   std::vector<SBAddFullHash>::iterator add_hashes_iter,
                   std::vector<SBSubFullHash>::iterator sub_hashes_iter)
      : add_prefixes_iter_(add_prefixes_iter),
        sub_prefixes_iter_(sub_prefixes_iter),
        add_hashes_iter_(add_hashes_iter),
        sub_hashes_iter_(sub_hashes_iter) {
  }

  SBAddPrefixes::iterator add_prefixes_iter_;
  SBSubPrefixes::iterator sub_prefixes_iter_;
  std::vector<SBAddFullHash>::iterator add_hashes_iter_;
  std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
};

// Helper to find the next shard boundary.
template <class T>
bool prefix_bounder(SBPrefix val, const T& elt) {
  return val < elt.GetAddPrefix();
}
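
// Example (illustrative only): find the first element past the shard whose
// maximum prefix is |shard_max| in a sorted SBAddPrefixes container.
//   SBAddPrefixes::iterator shard_end =
//       std::upper_bound(prefixes.begin(), prefixes.end(),
//                        shard_max, prefix_bounder<SBAddPrefix>);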

// Container for partial database state.  Includes add/sub prefixes/hashes, plus
// aggregate operations on same.
class StateInternal {
 public:
  // Append indicated amount of data from |fp|.
  bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
                  size_t add_hash_count, size_t sub_hash_count,
                  FILE* fp, base::MD5Context* context) {
    return
        ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
        ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
        ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
        ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
  }

  void ClearData() {
    add_prefixes_.clear();
    sub_prefixes_.clear();
    add_full_hashes_.clear();
    sub_full_hashes_.clear();
  }

  // Merge data from |beg|..|end| into receiver's state, then process the state.
  // The current state and the range given should correspond to the same sorted
  // shard of data from different sources.  |add_del_cache| and |sub_del_cache|
  // indicate the chunk ids which should be deleted during processing (see
  // SBProcessSubs).
  void MergeDataAndProcess(const StateInternalPos& beg,
                           const StateInternalPos& end,
                           const base::hash_set<int32>& add_del_cache,
                           const base::hash_set<int32>& sub_del_cache) {
    container_merge(&add_prefixes_,
                    beg.add_prefixes_iter_,
                    end.add_prefixes_iter_,
                    SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);

    container_merge(&sub_prefixes_,
                    beg.sub_prefixes_iter_,
                    end.sub_prefixes_iter_,
                    SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);

    container_merge(&add_full_hashes_,
                    beg.add_hashes_iter_,
                    end.add_hashes_iter_,
                    SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);

    container_merge(&sub_full_hashes_,
                    beg.sub_hashes_iter_,
                    end.sub_hashes_iter_,
                    SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);

    SBProcessSubs(&add_prefixes_, &sub_prefixes_,
                  &add_full_hashes_, &sub_full_hashes_,
                  add_del_cache, sub_del_cache);
  }

  // Sort the data appropriately for the sharding, merging, and processing
  // operations.
  void SortData() {
    std::sort(add_prefixes_.begin(), add_prefixes_.end(),
              SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
    std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
              SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
    std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
              SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
    std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
              SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);
  }

  // Iterator from the beginning of the state's data.
  StateInternalPos StateBegin() {
    return StateInternalPos(add_prefixes_.begin(),
                            sub_prefixes_.begin(),
                            add_full_hashes_.begin(),
                            sub_full_hashes_.begin());
  }

  // An iterator pointing just after the last possible element of the shard
  // indicated by |shard_max|.  Used to step through the state by shard.
  // TODO(shess): Verify whether binary search really improves over linear.
  // Merging or writing will immediately touch all of these elements.
  StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
    return StateInternalPos(
        std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
                         shard_max, prefix_bounder<SBAddPrefix>),
        std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
                         shard_max, prefix_bounder<SBSubPrefix>),
        std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
                         shard_max, prefix_bounder<SBAddFullHash>),
        std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
                         shard_max, prefix_bounder<SBSubFullHash>));
  }

  // Write a shard header and data for the shard starting at |beg| and ending at
  // the element before |end|.
  bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
                  FILE* fp, base::MD5Context* context) {
    ShardHeader shard_header;
    shard_header.add_prefix_count =
        end.add_prefixes_iter_ - beg.add_prefixes_iter_;
    shard_header.sub_prefix_count =
        end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
    shard_header.add_hash_count =
        end.add_hashes_iter_ - beg.add_hashes_iter_;
    shard_header.sub_hash_count =
        end.sub_hashes_iter_ - beg.sub_hashes_iter_;

    return
        WriteItem(shard_header, fp, context) &&
        WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
                   fp, context);
  }

  SBAddPrefixes add_prefixes_;
  SBSubPrefixes sub_prefixes_;
  std::vector<SBAddFullHash> add_full_hashes_;
  std::vector<SBSubFullHash> sub_full_hashes_;
};

// True if |val| is an even power of two.
template <typename T>
bool IsPowerOfTwo(const T& val) {
  return val && (val & (val - 1)) == 0;
}
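
// Example: IsPowerOfTwo(1u << 24) is true; IsPowerOfTwo(0u) and
// IsPowerOfTwo(12u) are false.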

// Helper to read the entire database state, used by GetAddPrefixes() and
// GetAddFullHashes().  Those functions are generally used only for smaller
// files.  Returns false in case of errors reading the data.
bool ReadDbStateHelper(const base::FilePath& filename,
                       StateInternal* db_state) {
  base::ScopedFILE file(base::OpenFile(filename, "rb"));
  if (file.get() == NULL)
    return false;

  std::set<int32> add_chunks;
  std::set<int32> sub_chunks;

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
                          file.get(), &context);
  if (version == kInvalidVersion)
    return false;

  uint64 in_min = 0;
  uint64 in_stride = header.shard_stride;
  if (!in_stride)
    in_stride = kMaxShardStride;
  if (!IsPowerOfTwo(in_stride))
    return false;

  do {
    ShardHeader shard_header;
    if (!ReadItem(&shard_header, file.get(), &context))
      return false;

    if (!db_state->AppendData(shard_header.add_prefix_count,
                              shard_header.sub_prefix_count,
                              shard_header.add_hash_count,
                              shard_header.sub_hash_count,
                              file.get(), &context)) {
      return false;
    }

    in_min += in_stride;
  } while (in_min <= kMaxSBPrefix);

  if (!ReadAndVerifyChecksum(file.get(), &context))
    return false;

  int64 size = 0;
  if (!base::GetFileSize(filename, &size))
    return false;

  return static_cast<int64>(ftell(file.get())) == size;
}

}  // namespace

SafeBrowsingStoreFile::SafeBrowsingStoreFile()
    : chunks_written_(0), empty_(false), corruption_seen_(false) {}

SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
  Close();
}

bool SafeBrowsingStoreFile::Delete() {
  // The database should not be open at this point.  But, just in
  // case, close everything before deleting.
  if (!Close()) {
    NOTREACHED();
    return false;
  }

  return DeleteStore(filename_);
}

bool SafeBrowsingStoreFile::CheckValidity() {
  // The file was either empty or never opened.  The empty case is
  // presumed not to be invalid.  The never-opened case can happen if
  // BeginUpdate() fails for any databases, and should already have
  // caused the corruption callback to fire.
  if (!file_.get())
    return true;

  if (!FileRewind(file_.get()))
    return OnCorruptDatabase();

  int64 size = 0;
  if (!base::GetFileSize(filename_, &size))
    return OnCorruptDatabase();

  base::MD5Context context;
  base::MD5Init(&context);

  // Read everything except the final digest.
  size_t bytes_left = static_cast<size_t>(size);
  CHECK(size == static_cast<int64>(bytes_left));
  if (bytes_left < sizeof(base::MD5Digest))
    return OnCorruptDatabase();
  bytes_left -= sizeof(base::MD5Digest);

  // Fold the contents of the file into the checksum.
  while (bytes_left > 0) {
    char buf[4096];
    const size_t c = std::min(sizeof(buf), bytes_left);
    const size_t ret = fread(buf, 1, c, file_.get());

    // The file's size changed while reading; give up.
    if (ret != c)
      return OnCorruptDatabase();
    base::MD5Update(&context, base::StringPiece(buf, c));
    bytes_left -= c;
  }

  if (!ReadAndVerifyChecksum(file_.get(), &context)) {
    RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
    return OnCorruptDatabase();
  }

  return true;
}

void SafeBrowsingStoreFile::Init(const base::FilePath& filename,
                                 const base::Closure& corruption_callback) {
  filename_ = filename;
  corruption_callback_ = corruption_callback;
}

bool SafeBrowsingStoreFile::BeginChunk() {
  return ClearChunkBuffers();
}

bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
  add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
  add_prefixes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_prefixes->swap(db_state.add_prefixes_);
  return true;
}

bool SafeBrowsingStoreFile::GetAddFullHashes(
    std::vector<SBAddFullHash>* add_full_hashes) {
  add_full_hashes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_full_hashes->swap(db_state.add_full_hashes_);
  return true;
}

bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
                                         const SBFullHash& full_hash) {
  add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
                                           int32 add_chunk_id,
                                           SBPrefix prefix) {
  sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
                                         const SBFullHash& full_hash) {
  sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::OnCorruptDatabase() {
  if (!corruption_seen_)
    RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
  corruption_seen_ = true;

  corruption_callback_.Run();

  // Return false as a convenience to callers.
  return false;
}

bool SafeBrowsingStoreFile::Close() {
  ClearUpdateBuffers();

  // Make sure the files are closed.
  file_.reset();
  new_file_.reset();
  return true;
}

bool SafeBrowsingStoreFile::BeginUpdate() {
  DCHECK(!file_.get() && !new_file_.get());

  // Structures should all be clear unless something bad happened.
  DCHECK(add_chunks_cache_.empty());
  DCHECK(sub_chunks_cache_.empty());
  DCHECK(add_del_cache_.empty());
  DCHECK(sub_del_cache_.empty());
  DCHECK(add_prefixes_.empty());
  DCHECK(sub_prefixes_.empty());
  DCHECK(add_hashes_.empty());
  DCHECK(sub_hashes_.empty());
  DCHECK_EQ(chunks_written_, 0);

  corruption_seen_ = false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
  if (new_file.get() == NULL)
    return false;

  base::ScopedFILE file(base::OpenFile(filename_, "rb"));
  empty_ = (file.get() == NULL);
  if (empty_) {
    // If the file exists but cannot be opened, report corruption rather
    // than deleting it directly, so the bloom filter is deleted with it.
    if (base::PathExists(filename_))
      return OnCorruptDatabase();

    new_file_.swap(new_file);
    return true;
  }

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename_, &header,
                          &add_chunks_cache_, &sub_chunks_cache_,
                          file.get(), &context);
  if (version == kInvalidVersion) {
    FileHeader retry_header;
    if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL)) {
      if (retry_header.magic == kFileMagic &&
          retry_header.version < kFileVersion) {
        RecordFormatEvent(FORMAT_EVENT_FOUND_DEPRECATED);
      } else {
        RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
      }
    }

    // Close the file so that it can be deleted.
    file.reset();

    return OnCorruptDatabase();
  }

  file_.swap(file);
  new_file_.swap(new_file);
  return true;
}

bool SafeBrowsingStoreFile::FinishChunk() {
  if (!add_prefixes_.size() && !sub_prefixes_.size() &&
      !add_hashes_.size() && !sub_hashes_.size())
    return true;

  ChunkHeader header;
  header.add_prefix_count = add_prefixes_.size();
  header.sub_prefix_count = sub_prefixes_.size();
  header.add_hash_count = add_hashes_.size();
  header.sub_hash_count = sub_hashes_.size();
  if (!WriteItem(header, new_file_.get(), NULL))
    return false;

  if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_hashes_, new_file_.get(), NULL))
    return false;

  ++chunks_written_;

  // Clear everything to save memory.
  return ClearChunkBuffers();
}

bool SafeBrowsingStoreFile::DoUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(file_.get() || empty_);
  DCHECK(new_file_.get());
  CHECK(builder);
  CHECK(add_full_hashes_result);

  // Rewind the temporary storage.
  if (!FileRewind(new_file_.get()))
    return false;

  // Get chunk file's size for validating counts.
  int64 update_size = 0;
  if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
    return OnCorruptDatabase();

  // Track update size to answer questions at http://crbug.com/72216.
  // Log small updates as 1k so that the 0 (underflow) bucket can be
  // used for "empty" in SafeBrowsingDatabase.
  UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
                       std::max(static_cast<int>(update_size / 1024), 1));

  // Chunk updates to integrate.
  StateInternal new_state;

  // Read update chunks.
  for (int i = 0; i < chunks_written_; ++i) {
    ChunkHeader header;

    int64 ofs = ftell(new_file_.get());
    if (ofs == -1)
      return false;

    if (!ReadItem(&header, new_file_.get(), NULL))
      return false;

    // As a safety measure, make sure that the header describes a sane
    // chunk, given the remaining file size.
    int64 expected_size = ofs + sizeof(ChunkHeader);
    expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
    expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
    expected_size += header.add_hash_count * sizeof(SBAddFullHash);
    expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
    if (expected_size > update_size)
      return false;

    if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
                              header.add_hash_count, header.sub_hash_count,
                              new_file_.get(), NULL)) {
      return false;
    }
  }

  // The state was accumulated by chunk; sort it by prefix.
  new_state.SortData();

  // These strides control how much data is loaded into memory per pass.
  // Strides must be an even power of two.  |in_stride| will be derived from the
  // input file.  |out_stride| will be derived from an estimate of the resulting
  // file's size.  |process_stride| will be the max of both.
  uint64 in_stride = kMaxShardStride;
  uint64 out_stride = kMaxShardStride;
  uint64 process_stride = 0;

  // Used to verify the input's checksum if |!empty_|.
  base::MD5Context in_context;

  if (!empty_) {
    DCHECK(file_.get());

    FileHeader header = {0};
    int version = ReadAndVerifyHeader(filename_, &header,
                                      &add_chunks_cache_, &sub_chunks_cache_,
                                      file_.get(), &in_context);
    if (version == kInvalidVersion)
      return OnCorruptDatabase();

    if (header.shard_stride)
      in_stride = header.shard_stride;

    // The header checksum should have prevented this case, but the code will be
    // broken if this is not correct.
    if (!IsPowerOfTwo(in_stride))
      return OnCorruptDatabase();
  }

  // We no longer need to track deleted chunks.
  DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
  DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);

  // Calculate |out_stride| to break the file down into reasonable shards.
  {
    int64 original_size = 0;
    if (!empty_ && !base::GetFileSize(filename_, &original_size))
      return OnCorruptDatabase();

    // Approximate the final size as everything.  Subs and deletes will reduce
    // the size, but modest over-sharding won't hurt much.
    int64 shard_size = original_size + update_size;

    // Keep splitting until a single stride of data fits the target.
    size_t shifts = 0;
    while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
      out_stride >>= 1;
      shard_size >>= 1;
      ++shifts;
    }
    UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);

    DCHECK(IsPowerOfTwo(out_stride));
  }
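
  // Example (illustrative only): with |original_size| + |update_size| around
  // 800KB, the loop above shifts three times (800KB -> 400KB -> 200KB ->
  // 100KB, which is not above kUpdateStorageBytes), so out_stride = 1 << 29.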

  // Outer loop strides by the max of the input stride (to read integral shards)
  // and the output stride (to write integral shards).
  process_stride = std::max(in_stride, out_stride);
  DCHECK(IsPowerOfTwo(process_stride));
  DCHECK_EQ(0u, process_stride % in_stride);
  DCHECK_EQ(0u, process_stride % out_stride);

  // Start writing the new data to |new_file_|.
  base::MD5Context out_context;
  if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
                   new_file_.get(), &out_context)) {
    return false;
  }

  // Start at the beginning of the SBPrefix space.
  uint64 in_min = 0;
  uint64 out_min = 0;
  uint64 process_min = 0;

  // Start at the beginning of the updates.
  StateInternalPos new_pos = new_state.StateBegin();

  // Re-usable container for shard processing.
  StateInternal db_state;

  // Track aggregate counts for histograms.
  size_t add_prefix_count = 0;
  size_t sub_prefix_count = 0;

  do {
    // Maximum element in the current shard.
    SBPrefix process_max =
        static_cast<SBPrefix>(process_min + process_stride - 1);
    DCHECK_GT(process_max, process_min);

    // Drop the data from previous pass.
    db_state.ClearData();

    // Fill the processing shard with one or more input shards.
    if (!empty_) {
      do {
        ShardHeader shard_header;
        if (!ReadItem(&shard_header, file_.get(), &in_context))
          return OnCorruptDatabase();

        if (!db_state.AppendData(shard_header.add_prefix_count,
                                 shard_header.sub_prefix_count,
                                 shard_header.add_hash_count,
                                 shard_header.sub_hash_count,
                                 file_.get(), &in_context))
          return OnCorruptDatabase();

        in_min += in_stride;
      } while (in_min <= kMaxSBPrefix && in_min < process_max);
    }

    // Shard the update data to match the database data, then merge the update
    // data and process the results.
    {
      StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
      db_state.MergeDataAndProcess(new_pos, new_end,
                                   add_del_cache_, sub_del_cache_);
      new_pos = new_end;
    }

    // Collect the processed data for return to caller.
    for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
      builder->AddPrefix(db_state.add_prefixes_[i].prefix);
    }
    add_full_hashes_result->insert(add_full_hashes_result->end(),
                                   db_state.add_full_hashes_.begin(),
                                   db_state.add_full_hashes_.end());
    add_prefix_count += db_state.add_prefixes_.size();
    sub_prefix_count += db_state.sub_prefixes_.size();

    // Write one or more shards of processed output.
    StateInternalPos out_pos = db_state.StateBegin();
    do {
      SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
      DCHECK_GT(out_max, out_min);

      StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
      if (!db_state.WriteShard(out_pos, out_end, new_file_.get(), &out_context))
        return false;
      out_pos = out_end;

      out_min += out_stride;
    } while (out_min == static_cast<SBPrefix>(out_min) &&
             out_min < process_max);

    process_min += process_stride;
  } while (process_min <= kMaxSBPrefix);

  // Verify the overall checksum.
  if (!empty_) {
    if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
      RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
      return OnCorruptDatabase();
    }

    // TODO(shess): Verify EOF?

    // Close the input file so the new file can be renamed over it.
    file_.reset();
  }
  DCHECK(!file_.get());

  // Write the overall checksum.
  base::MD5Digest out_digest;
  base::MD5Final(&out_digest, &out_context);
  if (!WriteItem(out_digest, new_file_.get(), NULL))
    return false;

  // Trim any excess left over from the temporary chunk data.
  if (!base::TruncateFile(new_file_.get()))
    return false;

  // Close the file handle and swizzle the file into place.
  new_file_.reset();
  if (!base::DeleteFile(filename_, false) &&
      base::PathExists(filename_))
    return false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  if (!base::Move(new_filename, filename_))
    return false;

  // Record counts before returning to the caller.
  UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
  UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);

  return true;
}

bool SafeBrowsingStoreFile::FinishUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(builder);
  DCHECK(add_full_hashes_result);

  if (!DoUpdate(builder, add_full_hashes_result)) {
    CancelUpdate();
    return false;
  }

  DCHECK(!new_file_.get());
  DCHECK(!file_.get());

  return Close();
}

bool SafeBrowsingStoreFile::CancelUpdate() {
  bool ret = Close();

  // Delete stale staging file.
  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  base::DeleteFile(new_filename, false);

  return ret;
}

void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
  add_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
  return add_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
  out->clear();
  out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
}

void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
  sub_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
  return sub_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
  out->clear();
  out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
}

void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
  add_del_cache_.insert(chunk_id);
}

void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
  sub_del_cache_.insert(chunk_id);
}

// static
bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
  if (!base::DeleteFile(basename, false) &&
      base::PathExists(basename)) {
    NOTREACHED();
    return false;
  }

  const base::FilePath new_filename = TemporaryFileForFilename(basename);
  if (!base::DeleteFile(new_filename, false) &&
      base::PathExists(new_filename)) {
    NOTREACHED();
    return false;
  }

  // With SQLite support gone, one way to get to this code is if the
  // existing file is a SQLite file.  Make sure the journal file is
  // also removed.
  const base::FilePath journal_filename(
      basename.value() + FILE_PATH_LITERAL("-journal"));
  if (base::PathExists(journal_filename))
    base::DeleteFile(journal_filename, false);

  return true;
}