15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2009 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// We want to measure the similarity of two sequences of bytes as a surrogate
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// for measuring how well a second sequence will compress differentially to the
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// first sequence.
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "courgette/difference_estimator.h"
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
117d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/containers/hash_tables.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace courgette {
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Our difference measure is the number of k-tuples that occur in Subject that
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// don't occur in Base.
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const int kTupleSize = 4;
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace {
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)COMPILE_ASSERT(kTupleSize >= 4 && kTupleSize <= 8, kTupleSize_between_4_and_8);
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)size_t HashTuple(const uint8* source) {
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  size_t hash1 = *reinterpret_cast<const uint32*>(source);
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  size_t hash2 = *reinterpret_cast<const uint32*>(source + kTupleSize - 4);
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23);
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return hash;
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool RegionsEqual(const Region& a, const Region& b) {
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (a.length() != b.length())
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return memcmp(a.start(), b.start(), a.length()) == 0;
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // anonymous namepace
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class DifferenceEstimator::Base {
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  explicit Base(const Region& region) : region_(region) { }
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void Init() {
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const uint8* start = region_.start();
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const uint8* end = region_.end() - (kTupleSize - 1);
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    for (const uint8* p = start;  p < end;  ++p) {
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      size_t hash = HashTuple(p);
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      hashes_.insert(hash);
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const Region& region() const { return region_; }
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Region region_;
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  base::hash_set<size_t> hashes_;
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  friend class DifferenceEstimator;
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(Base);
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class DifferenceEstimator::Subject {
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  explicit Subject(const Region& region) : region_(region) {}
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const Region& region() const { return region_; }
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Region region_;
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(Subject);
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DifferenceEstimator::DifferenceEstimator() {}
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DifferenceEstimator::~DifferenceEstimator() {
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (size_t i = 0;  i < owned_bases_.size();  ++i)
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    delete owned_bases_[i];
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (size_t i = 0;  i < owned_subjects_.size();  ++i)
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    delete owned_subjects_[i];
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DifferenceEstimator::Base* DifferenceEstimator::MakeBase(const Region& region) {
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Base* base = new Base(region);
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  base->Init();
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  owned_bases_.push_back(base);
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return base;
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject(
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const Region& region) {
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Subject* subject = new Subject(region);
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  owned_subjects_.push_back(subject);
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return subject;
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)size_t DifferenceEstimator::Measure(Base* base, Subject* subject) {
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  size_t mismatches = 0;
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const uint8* start = subject->region().start();
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const uint8* end = subject->region().end() - (kTupleSize - 1);
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const uint8* p = start;
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (p < end) {
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    size_t hash = HashTuple(p);
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (base->hashes_.find(hash) == base->hashes_.end()) {
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ++mismatches;
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    p += 1;
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (mismatches == 0) {
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (RegionsEqual(base->region(), subject->region()))
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return 0;
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ++mismatches;  // Guarantee not zero.
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return mismatches;
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace
119