15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2009 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// We want to measure the similarity of two sequences of bytes as a surrogate 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// for measuring how well a second sequence will compress differentially to the 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// first sequence. 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "courgette/difference_estimator.h" 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 117d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/containers/hash_tables.h" 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace courgette { 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Our difference measure is the number of k-tuples that occur in Subject that 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// don't occur in Base. 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const int kTupleSize = 4; 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace { 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)COMPILE_ASSERT(kTupleSize >= 4 && kTupleSize <= 8, kTupleSize_between_4_and_8); 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)size_t HashTuple(const uint8* source) { 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t hash1 = *reinterpret_cast<const uint32*>(source); 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t hash2 = *reinterpret_cast<const uint32*>(source + kTupleSize - 4); 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23); 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return hash; 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool RegionsEqual(const Region& a, const Region& b) { 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (a.length() != b.length()) 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return memcmp(a.start(), b.start(), a.length()) == 0; 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // anonymous namepace 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class DifferenceEstimator::Base { 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) explicit Base(const Region& region) : region_(region) { } 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void Init() { 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* start = region_.start(); 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* end = region_.end() - (kTupleSize - 1); 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (const uint8* p = start; p < end; ++p) { 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t hash = HashTuple(p); 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) hashes_.insert(hash); 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const Region& region() const { return region_; } 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Region region_; 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::hash_set<size_t> hashes_; 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) friend class DifferenceEstimator; 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_COPY_AND_ASSIGN(Base); 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class DifferenceEstimator::Subject { 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) explicit Subject(const Region& region) : region_(region) {} 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const Region& region() const { return region_; } 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Region region_; 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_COPY_AND_ASSIGN(Subject); 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DifferenceEstimator::DifferenceEstimator() {} 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DifferenceEstimator::~DifferenceEstimator() { 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (size_t i = 0; i < owned_bases_.size(); ++i) 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete owned_bases_[i]; 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (size_t i = 0; i < owned_subjects_.size(); ++i) 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete owned_subjects_[i]; 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DifferenceEstimator::Base* DifferenceEstimator::MakeBase(const Region& region) { 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Base* base = new Base(region); 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base->Init(); 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) owned_bases_.push_back(base); 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return base; 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject( 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const Region& region) { 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Subject* subject = new Subject(region); 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) owned_subjects_.push_back(subject); 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return subject; 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)size_t DifferenceEstimator::Measure(Base* base, Subject* subject) { 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t mismatches = 0; 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* start = subject->region().start(); 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* end = subject->region().end() - (kTupleSize - 1); 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* p = start; 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (p < end) { 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t hash = HashTuple(p); 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (base->hashes_.find(hash) == base->hashes_.end()) { 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++mismatches; 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) p += 1; 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (mismatches == 0) { 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (RegionsEqual(base->region(), subject->region())) 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return 0; 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++mismatches; // Guarantee not zero. 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return mismatches; 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace 119