difference_estimator.cc revision 5821806d5e7f356e8fa4b058a389a808ea183019
1// Copyright (c) 2009 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// We want to measure the similarity of two sequences of bytes as a surrogate
6// for measuring how well a second sequence will compress differentially to the
7// first sequence.
8
9#include "courgette/difference_estimator.h"
10
11#include "base/hash_tables.h"
12
13namespace courgette {
14
15// Our difference measure is the number of k-tuples that occur in Subject that
16// don't occur in Base.
17const int kTupleSize = 4;
18
19namespace {
20
21COMPILE_ASSERT(kTupleSize >= 4 && kTupleSize <= 8, kTupleSize_between_4_and_8);
22
23size_t HashTuple(const uint8* source) {
24  size_t hash1 = *reinterpret_cast<const uint32*>(source);
25  size_t hash2 = *reinterpret_cast<const uint32*>(source + kTupleSize - 4);
26  size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23);
27  return hash;
28}
29
30bool RegionsEqual(const Region& a, const Region& b) {
31  if (a.length() != b.length())
32    return false;
33  return memcmp(a.start(), b.start(), a.length()) == 0;
34}
35
36}  // anonymous namepace
37
38class DifferenceEstimator::Base {
39 public:
40  explicit Base(const Region& region) : region_(region) { }
41
42  void Init() {
43    const uint8* start = region_.start();
44    const uint8* end = region_.end() - (kTupleSize - 1);
45    for (const uint8* p = start;  p < end;  ++p) {
46      size_t hash = HashTuple(p);
47      hashes_.insert(hash);
48    }
49  }
50
51  const Region& region() const { return region_; }
52
53 private:
54  Region region_;
55  base::hash_set<size_t> hashes_;
56
57  friend class DifferenceEstimator;
58  DISALLOW_COPY_AND_ASSIGN(Base);
59};
60
61class DifferenceEstimator::Subject {
62 public:
63  explicit Subject(const Region& region) : region_(region) {}
64
65  const Region& region() const { return region_; }
66
67 private:
68  Region region_;
69
70  DISALLOW_COPY_AND_ASSIGN(Subject);
71};
72
73DifferenceEstimator::DifferenceEstimator() {}
74
75DifferenceEstimator::~DifferenceEstimator() {
76  for (size_t i = 0;  i < owned_bases_.size();  ++i)
77    delete owned_bases_[i];
78  for (size_t i = 0;  i < owned_subjects_.size();  ++i)
79    delete owned_subjects_[i];
80}
81
82DifferenceEstimator::Base* DifferenceEstimator::MakeBase(const Region& region) {
83  Base* base = new Base(region);
84  base->Init();
85  owned_bases_.push_back(base);
86  return base;
87}
88
89DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject(
90    const Region& region) {
91  Subject* subject = new Subject(region);
92  owned_subjects_.push_back(subject);
93  return subject;
94}
95
96size_t DifferenceEstimator::Measure(Base* base, Subject* subject) {
97  size_t mismatches = 0;
98  const uint8* start = subject->region().start();
99  const uint8* end = subject->region().end() - (kTupleSize - 1);
100
101  const uint8* p = start;
102  while (p < end) {
103    size_t hash = HashTuple(p);
104    if (base->hashes_.find(hash) == base->hashes_.end()) {
105      ++mismatches;
106    }
107    p += 1;
108  }
109
110  if (mismatches == 0) {
111    if (RegionsEqual(base->region(), subject->region()))
112      return 0;
113  }
114  ++mismatches;  // Guarantee not zero.
115  return mismatches;
116}
117
118}  // namespace
119