195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// Copyright 2006 Google Inc. 295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// Authors: Sanjay Ghemawat, Jeff Dean, Chandra Chereddi, Lincoln Smith 395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// 495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// Licensed under the Apache License, Version 2.0 (the "License"); 56ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach// you may not use this file except in compliance with the License. 66ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach// You may obtain a copy of the License at 76ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach// 895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// http://www.apache.org/licenses/LICENSE-2.0 96ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach// 1095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// Unless required by applicable law or agreed to in writing, software 116ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach// distributed under the License is distributed on an "AS IS" BASIS, 126ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 136ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach// See the License for the specific language governing permissions and 146ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach// limitations under the License. 156ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach// 1695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// Implementation of the Bentley/McIlroy algorithm for finding differences. 176ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach// Bentley, McIlroy. DCC 1999. Data Compression Using Long Common Strings. 18254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren// http://citeseer.ist.psu.edu/555557.html 196ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach 20254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren#ifndef OPEN_VCDIFF_BLOCKHASH_H_ 21254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren#define OPEN_VCDIFF_BLOCKHASH_H_ 22254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren 23254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren#include <config.h> 24254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren#include <stddef.h> // size_t 256ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach#include <stdint.h> // uint32_t 2695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy#include <vector> 2795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 2895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamynamespace open_vcdiff { 2995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 3095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// A generic hash table which will be used to keep track of byte runs 3195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// of size kBlockSize in both the incrementally processed target data 3295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// and the preprocessed source dictionary. 3395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// 34254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren// A custom hash table implementation is used instead of the standard 35254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren// hash_map template because we know that there will be exactly one 366ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach// entry in the BlockHash corresponding to each kBlockSize bytes 37254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren// in the source data, which makes certain optimizations possible: 386ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach// * The memory for the hash table and for all hash entries can be allocated 3995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// in one step rather than incrementally for each insert operation. 4095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// * A single integer can be used to represent both 4195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// the index of the next hash entry in the chain 4295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// and the position of the entry within the source data 436ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach// (== kBlockSize * block_number). This greatly reduces the size 446ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach// of a hash entry. 4595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy// 4695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamyclass BlockHash { 4795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy public: 4895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // Block size as per Bentley/McIlroy; must be a power of two. 4995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 5095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // Using (for example) kBlockSize = 4 guarantees that no match smaller 5195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // than size 4 will be identified, that some matches having sizes 5295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 4, 5, or 6 may be identified, and that all matches 536ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach // having size 7 or greater will be identified (because any string of 5495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 7 bytes must contain a complete aligned block of 4 bytes.) 5595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 5695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // Increasing kBlockSize by a factor of two will halve the amount of 5795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // memory needed for the next block table, and will halve the setup time 586ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach // for a new BlockHash. However, it also doubles the minimum 5995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // match length that is guaranteed to be found in FindBestMatch(), 606ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach // so that function will be less effective in finding matches. 6195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 6295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // Computational effort in FindBestMatch (which is the inner loop of 6395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // the encoding algorithm) will be proportional to the number of 6495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // matches found, and a low value of kBlockSize will waste time 6595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // tracking down small matches. On the other hand, if this value 6695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // is set too high, no matches will be found at all. 6795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 6895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // It is suggested that different values of kBlockSize be tried against 6995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // a representative data set to find the best tradeoff between 7095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // memory/CPU and the effectiveness of FindBestMatch(). 7195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 7295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // If you change kBlockSize to a smaller value, please increase 7395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // kMaxMatchesToCheck accordingly. 7495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy static const int kBlockSize = 16; 7595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 7695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // This class is used to store the best match found by FindBestMatch() 7795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // and return it to the caller. 7895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy class Match { 7995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy public: 8095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy Match() : size_(0), source_offset_(-1), target_offset_(-1) { } 8195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 8295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy void ReplaceIfBetterMatch(size_t candidate_size, 8395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy int candidate_source_offset, 8495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy int candidate_target_offset) { 8595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy if (candidate_size > size_) { 8695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy size_ = candidate_size; 8795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy source_offset_ = candidate_source_offset; 8895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy target_offset_ = candidate_target_offset; 8995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy } 9095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy } 9195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 9295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy size_t size() const { return size_; } 9395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy int source_offset() const { return source_offset_; } 9495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy int target_offset() const { return target_offset_; } 9595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 966ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach private: 976ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach // The size of the best (longest) match passed to ReplaceIfBetterMatch(). 986ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach size_t size_; 9995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 10095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // The source offset of the match, including the starting_offset_ 10195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // of the BlockHash for which the match was found. 10295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy int source_offset_; 10395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 10495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // The target offset of the match. An offset of 0 corresponds to the 10595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // data at target_start, which is an argument of FindBestMatch(). 10695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy int target_offset_; 10795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 10895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // Making these private avoids implicit copy constructor 10995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // & assignment operator 11095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy Match(const Match&); // NOLINT 11195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy void operator=(const Match&); 11295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy }; 11395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 11495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // A BlockHash is created using a buffer of source data. The hash table 11595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // will contain one entry for each kBlockSize-byte block in the 11695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // source data. 11795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 11895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // See the comments for starting_offset_, below, for a description of 11995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // the starting_offset argument. For a hash of source (dictionary) data, 12095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // starting_offset_ will be zero; for a hash of previously encoded 12195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // target data, starting_offset_ will be equal to the dictionary size. 12295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 12395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy BlockHash(const char* source_data, size_t source_size, int starting_offset); 12495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 12595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy ~BlockHash(); 12695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 12795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // Initializes the object before use. 12895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // This method must be called after constructing a BlockHash object, 12995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // and before any other method may be called. This is because 13095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // Init() dynamically allocates hash_table_ and next_block_table_. 13195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // Returns true if initialization succeeded, or false if an error occurred, 13295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // in which case no other method except the destructor may then be used 13395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // on the object. 1346ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach // 13595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // If populate_hash_table is true, then AddAllBlocks() will be called 13695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // to populate the hash table. If populate_hash_table is false, then 13795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // classes that inherit from BlockHash are expected to call AddBlock() 13895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // to incrementally populate individual blocks of data. 13995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 14095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy bool Init(bool populate_hash_table); 14195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 14295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // In the context of the open-vcdiff encoder, BlockHash is used for two 14395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // purposes: to hash the source (dictionary) data, and to hash 14495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // the previously encoded target data. The main differences between 14595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // a dictionary BlockHash and a target BlockHash are as follows: 14695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 14795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 1. The best_match->source_offset() returned from FindBestMatch() 14895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // for a target BlockHash is computed in the following manner: 14995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // the starting offset of the first byte in the target data 15095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // is equal to the dictionary size. FindBestMatch() will add 15195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // starting_offset_ to any best_match->source_offset() value it returns, 15295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // in order to produce the correct offset value for a target BlockHash. 15395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 2. For a dictionary BlockHash, the entire data set is hashed at once 15495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // when Init() is called with the parameter populate_hash_table = true. 15595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // For a target BlockHash, because the previously encoded target data 15695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // includes only the data seen up to the current encoding position, 15795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // the data blocks are hashed incrementally as the encoding position 15895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // advances, using AddOneIndexHash() and AddAllBlocksThroughIndex(). 15995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 16095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // The following two factory functions can be used to create BlockHash 16195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // objects for each of these two purposes. Each factory function calls 16295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // the object constructor and also calls Init(). If an error occurs, 16395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // NULL is returned; otherwise a valid BlockHash object is returned. 16495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // Since a dictionary BlockHash is not expected to be modified after 16595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // initialization, a const object is returned. 16695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // The caller is responsible for deleting the returned object 16795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // (using the C++ delete operator) once it is no longer needed. 16895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy static const BlockHash* CreateDictionaryHash(const char* dictionary_data, 16995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy size_t dictionary_size); 17095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy static BlockHash* CreateTargetHash(const char* target_data, 17195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy size_t target_size, 17295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy size_t dictionary_size); 17395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 17495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // This function will be called to add blocks incrementally to the target hash 17595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // as the encoding position advances through the target data. It will be 17695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // called for every kBlockSize-byte block in the target data, regardless 17795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // of whether the block is aligned evenly on a block boundary. The 17895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // BlockHash will only store hash entries for the evenly-aligned blocks. 17995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 18095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy void AddOneIndexHash(int index, uint32_t hash_value) { 18195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy if (index == NextIndexToAdd()) { 18295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy AddBlock(hash_value); 18395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy } 18495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy } 18595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 18695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // Calls AddBlock() for each kBlockSize-byte block in the range 18795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // (last_block_added_ * kBlockSize, end_index), exclusive of the endpoints. 18895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // If end_index <= the last index added (last_block_added_ * kBlockSize), 18995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // this function does nothing. 19095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 19195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // A partial block beginning anywhere up to (end_index - 1) is also added, 19295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // unless it extends outside the end of the source data. Like AddAllBlocks(), 19395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // this function computes the hash value for each of the blocks in question 19495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // from scratch, so it is not a good option if the hash values have already 19595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // been computed for some other purpose. 19695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 19795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // Example: assume kBlockSize = 4, last_block_added_ = 1, and there are 19895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 14 bytes of source data. 19995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // If AddAllBlocksThroughIndex(9) is invoked, then it will call AddBlock() 20095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // only for block number 2 (at index 8). 20195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // If, after that, AddAllBlocksThroughIndex(14) is invoked, it will not call 20295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // AddBlock() at all, because block 3 (beginning at index 12) would 20395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // fall outside the range of source data. 20495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 20595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // VCDiffEngine::Encode (in vcdiffengine.cc) uses this function to 20695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // add a whole range of data to a target hash when a COPY instruction 20795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // is generated. 20895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy void AddAllBlocksThroughIndex(int end_index); 20995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy 21095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // FindBestMatch takes a position within the unencoded target data 21195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // (target_candidate_start) and the hash value of the kBlockSize bytes 21295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // beginning at that position (hash_value). It attempts to find a matching 21395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // set of bytes within the source (== dictionary) data, expanding 21495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // the match both below and above the target block. It cannot expand 21595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // the match outside the bounds of the source data, or below 21695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // target_start within the target data, or past 21795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // the end limit of (target_start + target_length). 21895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 21995fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // target_candidate_start is the start of the candidate block within the 22095fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // target data for which a match will be sought, while 22195fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // target_start (which is <= target_candidate_start) 22295fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // is the start of the target data that has yet to be encoded. 22395fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // 22495fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // If a match is found whose size is greater than the size 22595fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // of best_match, this function populates *best_match with the 22695fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // size, source_offset, and target_offset of the match found. 22795fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // best_match->source_offset() will contain the index of the start of the 2286ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach // matching source data, plus starting_offset_ 2296ef101187774e30ddba6b46bbedef549a42196adAndre Eisenbach // (see description of starting_offset_ for details); 230254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren // best_match->target_offset() will contain the offset of the match 231254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren // beginning with target_start = offset 0, such that 232254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren // 0 <= best_match->target_offset() 233254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren // <= (target_candidate_start - target_start); 234254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren // and best_match->size() will contain the size of the match. 235254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren // If no such match is found, this function leaves *best_match unmodified. 2362f9c0a7966a08936e8ae7a03ab8fbf1de3b22e0eMattias Agren // 237254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren // On calling FindBestMatch(), best_match must 238254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren // point to a valid Match object, and cannot be NULL. 239254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren // The same Match object can be passed 240254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren // when calling FindBestMatch() on a different BlockHash object 241254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren // for the same candidate data block, in order to find 242254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren // the best match possible across both objects. For example: 243254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren // 244254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren // open_vcdiff::BlockHash::Match best_match; 245254588bfe6c3e70625b0f725b908598f30f476c8Mattias Agren // uint32_t hash_value = 24602f8bc6eeb93809da4a7dee62fd092095a11d885Matthew Xie // RollingHash<BlockHash::kBlockSize>::Hash(target_candidate_start); 247b930299a8ad32a8227b00e0d9016b62682b577f3Matthew Xie // bh1.FindBestMatch(hash_value, 24895fa11b3b2f19a382c7e3a744a6afb452fad86dfKausik Sinnaswamy // target_candidate_start, 249 // target_start, 250 // target_length, 251 // &best_match); 252 // bh2.FindBestMatch(hash_value, 253 // target_candidate_start, 254 // target_start, 255 // target_length, 256 // &best_match); 257 // if (best_size >= 0) { 258 // // a match was found; its size, source offset, and target offset 259 // // can be found in best_match 260 // } 261 // 262 // hash_value is passed as a separate parameter from target_candidate_start, 263 // (rather than calculated within FindBestMatch) in order to take 264 // advantage of the rolling hash, which quickly calculates the hash value 265 // of the block starting at target_candidate_start based on 266 // the known hash value of the block starting at (target_candidate_start - 1). 267 // See vcdiffengine.cc for more details. 268 // 269 // Example: 270 // kBlockSize: 4 271 // target text: "ANDREW LLOYD WEBBER" 272 // 1^ 5^2^ 3^ 273 // dictionary: "INSURANCE : LLOYDS OF LONDON" 274 // 4^ 275 // hashed dictionary blocks: 276 // "INSU", "RANC", "E : ", "LLOY", "DS O", "F LON" 277 // 278 // 1: target_start (beginning of unencoded data) 279 // 2: target_candidate_start (for the block "LLOY") 280 // 3: target_length (points one byte beyond the last byte of data.) 281 // 4: best_match->source_offset() (after calling FindBestMatch) 282 // 5: best_match->target_offset() (after calling FindBestMatch) 283 // 284 // Under these conditions, FindBestMatch will find a matching 285 // hashed dictionary block for "LLOY", and will extend the beginning of 286 // this match backwards by one byte, and the end of the match forwards 287 // by one byte, finding that the best match is " LLOYD" 288 // with best_match->source_offset() = 10 289 // (offset of " LLOYD" in the source string), 290 // best_match->target_offset() = 6 291 // (offset of " LLOYD" in the target string), 292 // and best_match->size() = 6. 293 // 294 void FindBestMatch(uint32_t hash_value, 295 const char* target_candidate_start, 296 const char* target_start, 297 size_t target_size, 298 Match* best_match) const; 299 300 protected: 301 // FindBestMatch() will not process more than this number 302 // of matching hash entries. 303 // 304 // It is necessary to have a limit on the maximum number of matches 305 // that will be checked in order to avoid the worst-case performance 306 // possible if, for example, all the blocks in the dictionary have 307 // the same hash value. See the unit test SearchStringFindsTooManyMatches 308 // for an example of such a case. The encoder uses a loop in 309 // VCDiffEngine::Encode over each target byte, containing a loop in 310 // BlockHash::FindBestMatch over the number of matches (up to a maximum 311 // of the number of source blocks), containing two loops that extend 312 // the match forwards and backwards up to the number of source bytes. 313 // Total complexity in the worst case is 314 // O([target size] * source_size_ * source_size_) 315 // Placing a limit on the possible number of matches checked changes this to 316 // O([target size] * source_size_ * kMaxMatchesToCheck) 317 // 318 // In empirical testing on real HTML text, using a block size of 4, 319 // the number of true matches per call to FindBestMatch() did not exceed 78; 320 // with a block size of 32, the number of matches did not exceed 3. 321 // 322 // The expected number of true matches scales super-linearly 323 // with the inverse of kBlockSize, but here a linear scale is used 324 // for block sizes smaller than 32. 325 static const int kMaxMatchesToCheck = (kBlockSize >= 32) ? 32 : 326 (32 * (32 / kBlockSize)); 327 328 // Do not skip more than this number of non-matching hash collisions 329 // to find the next matching entry in the hash chain. 330 static const int kMaxProbes = 16; 331 332 // Internal routine which calculates a hash table size based on kBlockSize and 333 // the dictionary_size. Will return a power of two if successful, or 0 if an 334 // internal error occurs. Some calculations (such as GetHashTableIndex()) 335 // depend on the table size being a power of two. 336 static size_t CalcTableSize(const size_t dictionary_size); 337 338 size_t GetNumberOfBlocks() const { 339 return source_size_ / kBlockSize; 340 } 341 342 // Use the lowest-order bits of the hash value 343 // as the index into the hash table. 344 uint32_t GetHashTableIndex(uint32_t hash_value) const { 345 return hash_value & hash_table_mask_; 346 } 347 348 // The index within source_data_ of the next block 349 // for which AddBlock() should be called. 350 int NextIndexToAdd() const { 351 return (last_block_added_ + 1) * kBlockSize; 352 } 353 354 static inline bool TooManyMatches(int* match_counter); 355 356 const char* source_data() { return source_data_; } 357 size_t source_size() { return source_size_; } 358 359 // Adds an entry to the hash table for one block of source data of length 360 // kBlockSize, starting at source_data_[block_number * kBlockSize], 361 // where block_number is always (last_block_added_ + 1). That is, 362 // AddBlock() must be called once for each block in source_data_ 363 // in increasing order. 364 void AddBlock(uint32_t hash_value); 365 366 // Calls AddBlock() for each complete kBlockSize-byte block between 367 // source_data_ and (source_data_ + source_size_). It is equivalent 368 // to calling AddAllBlocksThroughIndex(source_data + source_size). 369 // This function is called when Init(true) is invoked. 370 void AddAllBlocks(); 371 372 // Returns true if the contents of the kBlockSize-byte block 373 // beginning at block1 are identical to the contents of 374 // the block beginning at block2; false otherwise. 375 static bool BlockContentsMatch(const char* block1, const char* block2); 376 377 // Compares each machine word of the two (possibly unaligned) blocks, rather 378 // than each byte, thus reducing the number of test-and-branch instructions 379 // executed. Returns a boolean (do the blocks match?) rather than 380 // the signed byte difference returned by memcmp. 381 // 382 // BlockContentsMatch will use either this function or memcmp to do its work, 383 // depending on which is faster for a particular architecture. 384 // 385 // For gcc on x86-based architectures, this function has been shown to run 386 // about twice as fast as the library function memcmp(), and between five and 387 // nine times faster than the assembly instructions (repz and cmpsb) that gcc 388 // uses by default for builtin memcmp. On other architectures, or using 389 // other compilers, this function has not shown to be faster than memcmp. 390 static bool BlockCompareWords(const char* block1, const char* block2); 391 392 // Finds the first block number within the hashed data 393 // that represents a match for the given hash value. 394 // Returns -1 if no match was found. 395 // 396 // Init() must have been called and returned true before using 397 // FirstMatchingBlock or NextMatchingBlock. No check is performed 398 // for this condition; the code will crash if this condition is violated. 399 // 400 // The hash table is initially populated with -1 (not found) values, 401 // so if this function is called before the hash table has been populated 402 // using AddAllBlocks() or AddBlock(), it will simply return -1 403 // for any value of hash_value. 404 int FirstMatchingBlock(uint32_t hash_value, const char* block_ptr) const; 405 406 // Given a block number returned by FirstMatchingBlock() 407 // or by a previous call to NextMatchingBlock(), returns 408 // the next block number that matches the same hash value. 409 // Returns -1 if no match was found. 410 int NextMatchingBlock(int block_number, const char* block_ptr) const; 411 412 // Inline version of FirstMatchingBlock. This saves the cost of a function 413 // call when this routine is called from within the module. The external 414 // (non-inlined) version is called only by unit tests. 415 inline int FirstMatchingBlockInline(uint32_t hash_value, 416 const char* block_ptr) const; 417 418 // Walk through the hash entry chain, skipping over any false matches 419 // (for which the lowest bits of the fingerprints match, 420 // but the actual block data does not.) Returns the block number of 421 // the first true match found, or -1 if no true match was found. 422 // If block_number is a matching block, the function will return block_number 423 // without skipping to the next block. 424 int SkipNonMatchingBlocks(int block_number, const char* block_ptr) const; 425 426 // Returns the number of bytes to the left of source_match_start 427 // that match the corresponding bytes to the left of target_match_start. 428 // Will not examine more than max_bytes bytes, which is to say that 429 // the return value will be in the range [0, max_bytes] inclusive. 430 static int MatchingBytesToLeft(const char* source_match_start, 431 const char* target_match_start, 432 int max_bytes); 433 434 // Returns the number of bytes starting at source_match_end 435 // that match the corresponding bytes starting at target_match_end. 436 // Will not examine more than max_bytes bytes, which is to say that 437 // the return value will be in the range [0, max_bytes] inclusive. 438 static int MatchingBytesToRight(const char* source_match_end, 439 const char* target_match_end, 440 int max_bytes); 441 442 // The protected functions BlockContentsMatch, FirstMatchingBlock, 443 // NextMatchingBlock, MatchingBytesToLeft, and MatchingBytesToRight 444 // should be made accessible to unit tests. 445 friend class BlockHashTest; 446 447 private: 448 const char* const source_data_; 449 const size_t source_size_; 450 451 // The size of this array is determined using CalcTableSize(). It has at 452 // least one element for each kBlockSize-byte block in the source data. 453 // GetHashTableIndex() returns an index into this table for a given hash 454 // value. The value of each element of hash_table_ is the lowest block 455 // number in the source data whose hash value would return the same value from 456 // GetHashTableIndex(), or -1 if there is no matching block. This value can 457 // then be used as an index into next_block_table_ to retrieve the entire set 458 // of matching block numbers. 459 std::vector<int> hash_table_; 460 461 // An array containing one element for each source block. Each element is 462 // either -1 (== not found) or the index of the next block whose hash value 463 // would produce a matching result from GetHashTableIndex(). 464 std::vector<int> next_block_table_; 465 466 // This vector has the same size as next_block_table_. For every block number 467 // B that is referenced in hash_table_, last_block_table_[B] will contain 468 // the maximum block number that has the same GetHashTableIndex() value 469 // as block B. This number may be B itself. For a block number B' that 470 // is not referenced in hash_table_, the value of last_block_table_[B'] is -1. 471 // This table is used only while populating the hash table, not while looking 472 // up hash values in the table. Keeping track of the last block number in the 473 // chain allows us to construct the block chains as FIFO rather than LIFO 474 // lists, so that the match with the lowest index is returned first. This 475 // should result in a more compact encoding because the VCDIFF format favors 476 // smaller index values and repeated index values. 477 std::vector<int> last_block_table_; 478 479 // Performing a bitwise AND with hash_table_mask_ will produce a value ranging 480 // from 0 to the number of elements in hash_table_. 481 uint32_t hash_table_mask_; 482 483 // The offset of the first byte of source data (the data at source_data_[0]). 484 // For the purpose of computing offsets, the source data and target data 485 // are considered to be concatenated -- not literally in a single memory 486 // buffer, but conceptually as described in the RFC. 487 // The first byte of the previously encoded target data 488 // has an offset that is equal to dictionary_size, i.e., just after 489 // the last byte of source data. 490 // For a hash of source (dictionary) data, starting_offset_ will be zero; 491 // for a hash of previously encoded target data, starting_offset_ will be 492 // equal to the dictionary size. 493 const int starting_offset_; 494 495 // The last index added by AddBlock(). This determines the block number 496 // for successive calls to AddBlock(), and is also 497 // used to determine the starting block for AddAllBlocksThroughIndex(). 498 int last_block_added_; 499 500 // Making these private avoids implicit copy constructor & assignment operator 501 BlockHash(const BlockHash&); // NOLINT 502 void operator=(const BlockHash&); 503}; 504 505} // namespace open_vcdiff 506 507#endif // OPEN_VCDIFF_BLOCKHASH_H_ 508