1311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// Copyright 2006 Google Inc. 2311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// Authors: Sanjay Ghemawat, Jeff Dean, Chandra Chereddi, Lincoln Smith 3311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// 4311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// Licensed under the Apache License, Version 2.0 (the "License"); 5311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// you may not use this file except in compliance with the License. 6311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// You may obtain a copy of the License at 7311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// 8311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// http://www.apache.org/licenses/LICENSE-2.0 9311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// 10311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// Unless required by applicable law or agreed to in writing, software 11311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// distributed under the License is distributed on an "AS IS" BASIS, 12311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// See the License for the specific language governing permissions and 14311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// limitations under the License. 15311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// 16311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// Implementation of the Bentley/McIlroy algorithm for finding differences. 17311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// Bentley, McIlroy. DCC 1999. Data Compression Using Long Common Strings. 18311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// http://citeseer.ist.psu.edu/555557.html 19311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 20311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff#ifndef OPEN_VCDIFF_BLOCKHASH_H_ 21311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff#define OPEN_VCDIFF_BLOCKHASH_H_ 22311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 23311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff#include <config.h> 2428db8079f707ebdf43ce62cdfd96eb39c8f889e0openvcdiff#include <stddef.h> // size_t 25311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff#include <stdint.h> // uint32_t 26311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff#include <vector> 27311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 28311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiffnamespace open_vcdiff { 29311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 30311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// A generic hash table which will be used to keep track of byte runs 31311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// of size kBlockSize in both the incrementally processed target data 32311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// and the preprocessed source dictionary. 33311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// 34311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// A custom hash table implementation is used instead of the standard 35311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// hash_map template because we know that there will be exactly one 36311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// entry in the BlockHash corresponding to each kBlockSize bytes 37311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// in the source data, which makes certain optimizations possible: 38311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// * The memory for the hash table and for all hash entries can be allocated 39311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// in one step rather than incrementally for each insert operation. 40311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// * A single integer can be used to represent both 41311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// the index of the next hash entry in the chain 42311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// and the position of the entry within the source data 43311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// (== kBlockSize * block_number). This greatly reduces the size 44311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// of a hash entry. 45311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// 46311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiffclass BlockHash { 47311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff public: 48311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Block size as per Bentley/McIlroy; must be a power of two. 49311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 50311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Using (for example) kBlockSize = 4 guarantees that no match smaller 51311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // than size 4 will be identified, that some matches having sizes 52311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 4, 5, or 6 may be identified, and that all matches 53311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // having size 7 or greater will be identified (because any string of 54311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 7 bytes must contain a complete aligned block of 4 bytes.) 55311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 56311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Increasing kBlockSize by a factor of two will halve the amount of 57311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // memory needed for the next block table, and will halve the setup time 58311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // for a new BlockHash. However, it also doubles the minimum 59311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // match length that is guaranteed to be found in FindBestMatch(), 60311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // so that function will be less effective in finding matches. 61311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 62311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Computational effort in FindBestMatch (which is the inner loop of 63311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the encoding algorithm) will be proportional to the number of 64311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // matches found, and a low value of kBlockSize will waste time 65311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // tracking down small matches. On the other hand, if this value 66311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // is set too high, no matches will be found at all. 67311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 68311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // It is suggested that different values of kBlockSize be tried against 69311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // a representative data set to find the best tradeoff between 70311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // memory/CPU and the effectiveness of FindBestMatch(). 71311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 72311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // If you change kBlockSize to a smaller value, please increase 73311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // kMaxMatchesToCheck accordingly. 74d18457863096b3685e56f5a8919959f6afbdb121openvcdiff static const int kBlockSize = 16; 75311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 76311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // This class is used to store the best match found by FindBestMatch() 77311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // and return it to the caller. 78311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff class Match { 79311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff public: 80311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff Match() : size_(0), source_offset_(-1), target_offset_(-1) { } 81311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 82311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff void ReplaceIfBetterMatch(size_t candidate_size, 83311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff int candidate_source_offset, 84311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff int candidate_target_offset) { 85311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff if (candidate_size > size_) { 86311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff size_ = candidate_size; 87311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff source_offset_ = candidate_source_offset; 88311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff target_offset_ = candidate_target_offset; 89311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff } 90311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff } 91311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 92311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff size_t size() const { return size_; } 93311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff int source_offset() const { return source_offset_; } 94311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff int target_offset() const { return target_offset_; } 95311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 96311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff private: 97311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // The size of the best (longest) match passed to ReplaceIfBetterMatch(). 98311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff size_t size_; 99311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 100311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // The source offset of the match, including the starting_offset_ 101311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // of the BlockHash for which the match was found. 102311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff int source_offset_; 103311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 104311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // The target offset of the match. An offset of 0 corresponds to the 105311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // data at target_start, which is an argument of FindBestMatch(). 106311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff int target_offset_; 107311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 108311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Making these private avoids implicit copy constructor 109311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // & assignment operator 110311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff Match(const Match&); // NOLINT 111311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff void operator=(const Match&); 112311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff }; 113311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 114311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // A BlockHash is created using a buffer of source data. The hash table 115311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // will contain one entry for each kBlockSize-byte block in the 116311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // source data. 117311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 118311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // See the comments for starting_offset_, below, for a description of 119311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the starting_offset argument. For a hash of source (dictionary) data, 120311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // starting_offset_ will be zero; for a hash of previously encoded 121311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // target data, starting_offset_ will be equal to the dictionary size. 122311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 123311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff BlockHash(const char* source_data, size_t source_size, int starting_offset); 124311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 125311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff ~BlockHash(); 126311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 127311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Initializes the object before use. 128311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // This method must be called after constructing a BlockHash object, 129311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // and before any other method may be called. This is because 130311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Init() dynamically allocates hash_table_ and next_block_table_. 131311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Returns true if initialization succeeded, or false if an error occurred, 132311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // in which case no other method except the destructor may then be used 133311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // on the object. 134311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 135311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // If populate_hash_table is true, then AddAllBlocks() will be called 136311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // to populate the hash table. If populate_hash_table is false, then 137311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // classes that inherit from BlockHash are expected to call AddBlock() 138311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // to incrementally populate individual blocks of data. 139311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 140311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff bool Init(bool populate_hash_table); 141311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 142311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // In the context of the open-vcdiff encoder, BlockHash is used for two 143311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // purposes: to hash the source (dictionary) data, and to hash 144311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the previously encoded target data. The main differences between 145311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // a dictionary BlockHash and a target BlockHash are as follows: 146311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 147311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 1. The best_match->source_offset() returned from FindBestMatch() 148311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // for a target BlockHash is computed in the following manner: 149311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the starting offset of the first byte in the target data 150311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // is equal to the dictionary size. FindBestMatch() will add 151311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // starting_offset_ to any best_match->source_offset() value it returns, 152311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // in order to produce the correct offset value for a target BlockHash. 153311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 2. For a dictionary BlockHash, the entire data set is hashed at once 154311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // when Init() is called with the parameter populate_hash_table = true. 155311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // For a target BlockHash, because the previously encoded target data 156311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // includes only the data seen up to the current encoding position, 157311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the data blocks are hashed incrementally as the encoding position 158311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // advances, using AddOneIndexHash() and AddAllBlocksThroughIndex(). 159311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 160311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // The following two factory functions can be used to create BlockHash 161311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // objects for each of these two purposes. Each factory function calls 162311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the object constructor and also calls Init(). If an error occurs, 163311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // NULL is returned; otherwise a valid BlockHash object is returned. 164311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Since a dictionary BlockHash is not expected to be modified after 165311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // initialization, a const object is returned. 166311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // The caller is responsible for deleting the returned object 167311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // (using the C++ delete operator) once it is no longer needed. 168311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff static const BlockHash* CreateDictionaryHash(const char* dictionary_data, 169311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff size_t dictionary_size); 170311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff static BlockHash* CreateTargetHash(const char* target_data, 171311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff size_t target_size, 172311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff size_t dictionary_size); 173311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 174311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // This function will be called to add blocks incrementally to the target hash 175311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // as the encoding position advances through the target data. It will be 176311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // called for every kBlockSize-byte block in the target data, regardless 177311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // of whether the block is aligned evenly on a block boundary. The 178311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // BlockHash will only store hash entries for the evenly-aligned blocks. 179311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 180311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff void AddOneIndexHash(int index, uint32_t hash_value) { 181311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff if (index == NextIndexToAdd()) { 182311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff AddBlock(hash_value); 183311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff } 184311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff } 185311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 186311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Calls AddBlock() for each kBlockSize-byte block in the range 187311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // (last_block_added_ * kBlockSize, end_index), exclusive of the endpoints. 188311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // If end_index <= the last index added (last_block_added_ * kBlockSize), 189311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // this function does nothing. 190311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 191311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // A partial block beginning anywhere up to (end_index - 1) is also added, 192311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // unless it extends outside the end of the source data. Like AddAllBlocks(), 193311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // this function computes the hash value for each of the blocks in question 194311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // from scratch, so it is not a good option if the hash values have already 195311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // been computed for some other purpose. 196311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 197311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Example: assume kBlockSize = 4, last_block_added_ = 1, and there are 198311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 14 bytes of source data. 199311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // If AddAllBlocksThroughIndex(9) is invoked, then it will call AddBlock() 200311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // only for block number 2 (at index 8). 201311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // If, after that, AddAllBlocksThroughIndex(14) is invoked, it will not call 202311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // AddBlock() at all, because block 3 (beginning at index 12) would 203311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // fall outside the range of source data. 204311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 205311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // VCDiffEngine::Encode (in vcdiffengine.cc) uses this function to 206311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // add a whole range of data to a target hash when a COPY instruction 207311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // is generated. 208311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff void AddAllBlocksThroughIndex(int end_index); 209311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 210311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // FindBestMatch takes a position within the unencoded target data 211311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // (target_candidate_start) and the hash value of the kBlockSize bytes 212311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // beginning at that position (hash_value). It attempts to find a matching 213311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // set of bytes within the source (== dictionary) data, expanding 214311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the match both below and above the target block. It cannot expand 215311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the match outside the bounds of the source data, or below 216311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // target_start within the target data, or past 217311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the end limit of (target_start + target_length). 218311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 219311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // target_candidate_start is the start of the candidate block within the 220311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // target data for which a match will be sought, while 221311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // target_start (which is <= target_candidate_start) 222311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // is the start of the target data that has yet to be encoded. 223311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 224311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // If a match is found whose size is greater than the size 225311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // of best_match, this function populates *best_match with the 226311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // size, source_offset, and target_offset of the match found. 227311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // best_match->source_offset() will contain the index of the start of the 228311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // matching source data, plus starting_offset_ 229311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // (see description of starting_offset_ for details); 230311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // best_match->target_offset() will contain the offset of the match 231311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // beginning with target_start = offset 0, such that 232311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 0 <= best_match->target_offset() 233311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // <= (target_candidate_start - target_start); 234311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // and best_match->size() will contain the size of the match. 235311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // If no such match is found, this function leaves *best_match unmodified. 236311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 237311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // On calling FindBestMatch(), best_match must 238311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // point to a valid Match object, and cannot be NULL. 239311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // The same Match object can be passed 240311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // when calling FindBestMatch() on a different BlockHash object 241311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // for the same candidate data block, in order to find 242311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the best match possible across both objects. For example: 243311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 244311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // open_vcdiff::BlockHash::Match best_match; 245311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // uint32_t hash_value = 246311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // RollingHash<BlockHash::kBlockSize>::Hash(target_candidate_start); 247311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // bh1.FindBestMatch(hash_value, 248311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // target_candidate_start, 249311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // target_start, 250311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // target_length, 251311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // &best_match); 252311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // bh2.FindBestMatch(hash_value, 253311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // target_candidate_start, 254311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // target_start, 255311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // target_length, 256311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // &best_match); 257311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // if (best_size >= 0) { 258311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // // a match was found; its size, source offset, and target offset 259311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // // can be found in best_match 260311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // } 261311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 262311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // hash_value is passed as a separate parameter from target_candidate_start, 263311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // (rather than calculated within FindBestMatch) in order to take 264311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // advantage of the rolling hash, which quickly calculates the hash value 265311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // of the block starting at target_candidate_start based on 266311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the known hash value of the block starting at (target_candidate_start - 1). 267311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // See vcdiffengine.cc for more details. 268311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 269311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Example: 270311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // kBlockSize: 4 271311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // target text: "ANDREW LLOYD WEBBER" 272311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 1^ 5^2^ 3^ 273311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // dictionary: "INSURANCE : LLOYDS OF LONDON" 274311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 4^ 275311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // hashed dictionary blocks: 276311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // "INSU", "RANC", "E : ", "LLOY", "DS O", "F LON" 277311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 278311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 1: target_start (beginning of unencoded data) 279311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 2: target_candidate_start (for the block "LLOY") 280311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 3: target_length (points one byte beyond the last byte of data.) 281311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 4: best_match->source_offset() (after calling FindBestMatch) 282311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 5: best_match->target_offset() (after calling FindBestMatch) 283311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 284311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Under these conditions, FindBestMatch will find a matching 285311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // hashed dictionary block for "LLOY", and will extend the beginning of 286311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // this match backwards by one byte, and the end of the match forwards 287311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // by one byte, finding that the best match is " LLOYD" 288311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // with best_match->source_offset() = 10 289311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // (offset of " LLOYD" in the source string), 290311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // best_match->target_offset() = 6 291311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // (offset of " LLOYD" in the target string), 292311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // and best_match->size() = 6. 293311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 294311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff void FindBestMatch(uint32_t hash_value, 295311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff const char* target_candidate_start, 296311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff const char* target_start, 297311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff size_t target_size, 298311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff Match* best_match) const; 299311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 300311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff protected: 301311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // FindBestMatch() will not process more than this number 302311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // of matching hash entries. 303311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 304311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // It is necessary to have a limit on the maximum number of matches 305311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // that will be checked in order to avoid the worst-case performance 306311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // possible if, for example, all the blocks in the dictionary have 307311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the same hash value. See the unit test SearchStringFindsTooManyMatches 308311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // for an example of such a case. The encoder uses a loop in 309311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // VCDiffEngine::Encode over each target byte, containing a loop in 310311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // BlockHash::FindBestMatch over the number of matches (up to a maximum 311311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // of the number of source blocks), containing two loops that extend 312311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the match forwards and backwards up to the number of source bytes. 313311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Total complexity in the worst case is 314311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // O([target size] * source_size_ * source_size_) 315311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Placing a limit on the possible number of matches checked changes this to 316311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // O([target size] * source_size_ * kMaxMatchesToCheck) 317311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 318311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // In empirical testing on real HTML text, using a block size of 4, 319311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the number of true matches per call to FindBestMatch() did not exceed 78; 320311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // with a block size of 32, the number of matches did not exceed 3. 321311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 322311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // The expected number of true matches scales super-linearly 323311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // with the inverse of kBlockSize, but here a linear scale is used 324311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // for block sizes smaller than 32. 325d18457863096b3685e56f5a8919959f6afbdb121openvcdiff static const int kMaxMatchesToCheck = (kBlockSize >= 32) ? 32 : 326d18457863096b3685e56f5a8919959f6afbdb121openvcdiff (32 * (32 / kBlockSize)); 327311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 328311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Do not skip more than this number of non-matching hash collisions 329311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // to find the next matching entry in the hash chain. 330311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff static const int kMaxProbes = 16; 331311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 332311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Internal routine which calculates a hash table size based on kBlockSize and 333311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the dictionary_size. Will return a power of two if successful, or 0 if an 334311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // internal error occurs. Some calculations (such as GetHashTableIndex()) 335311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // depend on the table size being a power of two. 33683bbde0df33922d8dc6fa737cfb306d9caae13b1openvcdiff static size_t CalcTableSize(const size_t dictionary_size); 337311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 33883bbde0df33922d8dc6fa737cfb306d9caae13b1openvcdiff size_t GetNumberOfBlocks() const { 339311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff return source_size_ / kBlockSize; 340311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff } 341311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 342311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Use the lowest-order bits of the hash value 343311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // as the index into the hash table. 344311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff uint32_t GetHashTableIndex(uint32_t hash_value) const { 345311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff return hash_value & hash_table_mask_; 346311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff } 347311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 348311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // The index within source_data_ of the next block 349311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // for which AddBlock() should be called. 350311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff int NextIndexToAdd() const { 351311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff return (last_block_added_ + 1) * kBlockSize; 352311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff } 353311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 354311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff static inline bool TooManyMatches(int* match_counter); 355311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 35683bbde0df33922d8dc6fa737cfb306d9caae13b1openvcdiff const char* source_data() { return source_data_; } 35783bbde0df33922d8dc6fa737cfb306d9caae13b1openvcdiff size_t source_size() { return source_size_; } 358311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 359311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Adds an entry to the hash table for one block of source data of length 360311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // kBlockSize, starting at source_data_[block_number * kBlockSize], 361311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // where block_number is always (last_block_added_ + 1). That is, 362311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // AddBlock() must be called once for each block in source_data_ 363311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // in increasing order. 364311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff void AddBlock(uint32_t hash_value); 365311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 366311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Calls AddBlock() for each complete kBlockSize-byte block between 367311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // source_data_ and (source_data_ + source_size_). It is equivalent 368311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // to calling AddAllBlocksThroughIndex(source_data + source_size). 369311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // This function is called when Init(true) is invoked. 370311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff void AddAllBlocks(); 371311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 372311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Returns true if the contents of the kBlockSize-byte block 373311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // beginning at block1 are identical to the contents of 374311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the block beginning at block2; false otherwise. 375311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff static bool BlockContentsMatch(const char* block1, const char* block2); 376311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 377311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Compares each machine word of the two (possibly unaligned) blocks, rather 378311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // than each byte, thus reducing the number of test-and-branch instructions 379311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // executed. Returns a boolean (do the blocks match?) rather than 380311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the signed byte difference returned by memcmp. 381311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 382311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // BlockContentsMatch will use either this function or memcmp to do its work, 383311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // depending on which is faster for a particular architecture. 384311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 385311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // For gcc on x86-based architectures, this function has been shown to run 386311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // about twice as fast as the library function memcmp(), and between five and 387311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // nine times faster than the assembly instructions (repz and cmpsb) that gcc 388311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // uses by default for builtin memcmp. On other architectures, or using 389311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // other compilers, this function has not shown to be faster than memcmp. 390311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff static bool BlockCompareWords(const char* block1, const char* block2); 391311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 392311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Finds the first block number within the hashed data 393311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // that represents a match for the given hash value. 394311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Returns -1 if no match was found. 395311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 396311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Init() must have been called and returned true before using 397311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // FirstMatchingBlock or NextMatchingBlock. No check is performed 398311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // for this condition; the code will crash if this condition is violated. 399311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // 400311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // The hash table is initially populated with -1 (not found) values, 401311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // so if this function is called before the hash table has been populated 402311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // using AddAllBlocks() or AddBlock(), it will simply return -1 403311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // for any value of hash_value. 404311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff int FirstMatchingBlock(uint32_t hash_value, const char* block_ptr) const; 405311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 406311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Given a block number returned by FirstMatchingBlock() 407311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // or by a previous call to NextMatchingBlock(), returns 408311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the next block number that matches the same hash value. 409311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Returns -1 if no match was found. 410311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff int NextMatchingBlock(int block_number, const char* block_ptr) const; 411311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 412311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Inline version of FirstMatchingBlock. This saves the cost of a function 413311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // call when this routine is called from within the module. The external 414311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // (non-inlined) version is called only by unit tests. 415311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff inline int FirstMatchingBlockInline(uint32_t hash_value, 416311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff const char* block_ptr) const; 417311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 418311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Walk through the hash entry chain, skipping over any false matches 419311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // (for which the lowest bits of the fingerprints match, 420311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // but the actual block data does not.) Returns the block number of 421311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the first true match found, or -1 if no true match was found. 422311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // If block_number is a matching block, the function will return block_number 423311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // without skipping to the next block. 424311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff int SkipNonMatchingBlocks(int block_number, const char* block_ptr) const; 425311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 426311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Returns the number of bytes to the left of source_match_start 427311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // that match the corresponding bytes to the left of target_match_start. 428311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Will not examine more than max_bytes bytes, which is to say that 429311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the return value will be in the range [0, max_bytes] inclusive. 430311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff static int MatchingBytesToLeft(const char* source_match_start, 431311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff const char* target_match_start, 432311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff int max_bytes); 433311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 434311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Returns the number of bytes starting at source_match_end 435311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // that match the corresponding bytes starting at target_match_end. 436311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Will not examine more than max_bytes bytes, which is to say that 437311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the return value will be in the range [0, max_bytes] inclusive. 438311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff static int MatchingBytesToRight(const char* source_match_end, 439311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff const char* target_match_end, 440311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff int max_bytes); 441311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 442311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // The protected functions BlockContentsMatch, FirstMatchingBlock, 443311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // NextMatchingBlock, MatchingBytesToLeft, and MatchingBytesToRight 444311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // should be made accessible to unit tests. 445311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff friend class BlockHashTest; 446311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 447311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff private: 448311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff const char* const source_data_; 449311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff const size_t source_size_; 450311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 451311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // The size of this array is determined using CalcTableSize(). It has at 452311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // least one element for each kBlockSize-byte block in the source data. 453311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // GetHashTableIndex() returns an index into this table for a given hash 454311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // value. The value of each element of hash_table_ is the lowest block 455311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // number in the source data whose hash value would return the same value from 456311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // GetHashTableIndex(), or -1 if there is no matching block. This value can 457311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // then be used as an index into next_block_table_ to retrieve the entire set 458311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // of matching block numbers. 459311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff std::vector<int> hash_table_; 460311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 461311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // An array containing one element for each source block. Each element is 462311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // either -1 (== not found) or the index of the next block whose hash value 463311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // would produce a matching result from GetHashTableIndex(). 464311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff std::vector<int> next_block_table_; 465311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 466311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // This vector has the same size as next_block_table_. For every block number 467311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // B that is referenced in hash_table_, last_block_table_[B] will contain 468311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the maximum block number that has the same GetHashTableIndex() value 469311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // as block B. This number may be B itself. For a block number B' that 470311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // is not referenced in hash_table_, the value of last_block_table_[B'] is -1. 471311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // This table is used only while populating the hash table, not while looking 472311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // up hash values in the table. Keeping track of the last block number in the 473311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // chain allows us to construct the block chains as FIFO rather than LIFO 474311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // lists, so that the match with the lowest index is returned first. This 475311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // should result in a more compact encoding because the VCDIFF format favors 476311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // smaller index values and repeated index values. 477311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff std::vector<int> last_block_table_; 478311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 479311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Performing a bitwise AND with hash_table_mask_ will produce a value ranging 480311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // from 0 to the number of elements in hash_table_. 481311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff uint32_t hash_table_mask_; 482311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 483311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // The offset of the first byte of source data (the data at source_data_[0]). 484311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // For the purpose of computing offsets, the source data and target data 485311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // are considered to be concatenated -- not literally in a single memory 486311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // buffer, but conceptually as described in the RFC. 487311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // The first byte of the previously encoded target data 488311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // has an offset that is equal to dictionary_size, i.e., just after 489311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // the last byte of source data. 490311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // For a hash of source (dictionary) data, starting_offset_ will be zero; 491311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // for a hash of previously encoded target data, starting_offset_ will be 492311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // equal to the dictionary size. 493311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff const int starting_offset_; 494311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 495311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // The last index added by AddBlock(). This determines the block number 496311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // for successive calls to AddBlock(), and is also 497311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // used to determine the starting block for AddAllBlocksThroughIndex(). 498311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff int last_block_added_; 499311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 500311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff // Making these private avoids implicit copy constructor & assignment operator 501311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff BlockHash(const BlockHash&); // NOLINT 502311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff void operator=(const BlockHash&); 503311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff}; 504311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 505311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff} // namespace open_vcdiff 506311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff 507311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff#endif // OPEN_VCDIFF_BLOCKHASH_H_ 508