1// Copyright (c) 2009 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5// We want to measure the similarity of two sequences of bytes as a surrogate 6// for measuring how well a second sequence will compress differentially to the 7// first sequence. 8 9#include "courgette/difference_estimator.h" 10 11#include "base/containers/hash_tables.h" 12 13namespace courgette { 14 15// Our difference measure is the number of k-tuples that occur in Subject that 16// don't occur in Base. 17const int kTupleSize = 4; 18 19namespace { 20 21COMPILE_ASSERT(kTupleSize >= 4 && kTupleSize <= 8, kTupleSize_between_4_and_8); 22 23size_t HashTuple(const uint8* source) { 24 size_t hash1 = *reinterpret_cast<const uint32*>(source); 25 size_t hash2 = *reinterpret_cast<const uint32*>(source + kTupleSize - 4); 26 size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23); 27 return hash; 28} 29 30bool RegionsEqual(const Region& a, const Region& b) { 31 if (a.length() != b.length()) 32 return false; 33 return memcmp(a.start(), b.start(), a.length()) == 0; 34} 35 36} // anonymous namepace 37 38class DifferenceEstimator::Base { 39 public: 40 explicit Base(const Region& region) : region_(region) { } 41 42 void Init() { 43 const uint8* start = region_.start(); 44 const uint8* end = region_.end() - (kTupleSize - 1); 45 for (const uint8* p = start; p < end; ++p) { 46 size_t hash = HashTuple(p); 47 hashes_.insert(hash); 48 } 49 } 50 51 const Region& region() const { return region_; } 52 53 private: 54 Region region_; 55 base::hash_set<size_t> hashes_; 56 57 friend class DifferenceEstimator; 58 DISALLOW_COPY_AND_ASSIGN(Base); 59}; 60 61class DifferenceEstimator::Subject { 62 public: 63 explicit Subject(const Region& region) : region_(region) {} 64 65 const Region& region() const { return region_; } 66 67 private: 68 Region region_; 69 70 DISALLOW_COPY_AND_ASSIGN(Subject); 71}; 72 73DifferenceEstimator::DifferenceEstimator() {} 74 75DifferenceEstimator::~DifferenceEstimator() { 76 for (size_t i = 0; i < owned_bases_.size(); ++i) 77 delete owned_bases_[i]; 78 for (size_t i = 0; i < owned_subjects_.size(); ++i) 79 delete owned_subjects_[i]; 80} 81 82DifferenceEstimator::Base* DifferenceEstimator::MakeBase(const Region& region) { 83 Base* base = new Base(region); 84 base->Init(); 85 owned_bases_.push_back(base); 86 return base; 87} 88 89DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject( 90 const Region& region) { 91 Subject* subject = new Subject(region); 92 owned_subjects_.push_back(subject); 93 return subject; 94} 95 96size_t DifferenceEstimator::Measure(Base* base, Subject* subject) { 97 size_t mismatches = 0; 98 const uint8* start = subject->region().start(); 99 const uint8* end = subject->region().end() - (kTupleSize - 1); 100 101 const uint8* p = start; 102 while (p < end) { 103 size_t hash = HashTuple(p); 104 if (base->hashes_.find(hash) == base->hashes_.end()) { 105 ++mismatches; 106 } 107 p += 1; 108 } 109 110 if (mismatches == 0) { 111 if (RegionsEqual(base->region(), subject->region())) 112 return 0; 113 } 114 ++mismatches; // Guarantee not zero. 115 return mismatches; 116} 117 118} // namespace 119