10a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// Copyright 2007, 2008 Google Inc. 20a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// Authors: Jeff Dean, Sanjay Ghemawat, Lincoln Smith 30a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// 40a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// Licensed under the Apache License, Version 2.0 (the "License"); 50a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// you may not use this file except in compliance with the License. 60a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// You may obtain a copy of the License at 70a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// 80a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// http://www.apache.org/licenses/LICENSE-2.0 90a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// 100a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// Unless required by applicable law or agreed to in writing, software 110a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// distributed under the License is distributed on an "AS IS" BASIS, 120a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 130a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// See the License for the specific language governing permissions and 140a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// limitations under the License. 150a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 160a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath#ifndef OPEN_VCDIFF_ROLLING_HASH_H_ 170a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath#define OPEN_VCDIFF_ROLLING_HASH_H_ 180a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 190a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath#include <config.h> 200a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath#include <stdint.h> // uint32_t 210a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath#include "compile_assert.h" 220a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath#include "logging.h" 230a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 240a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathnamespace open_vcdiff { 250a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 260a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// Rabin-Karp hasher module -- this is a faster version with different 270a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// constants, so it's not quite Rabin-Karp fingerprinting, but its behavior is 280a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// close enough for most applications. 290a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 300a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// Definitions common to all hash window sizes. 310a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathclass RollingHashUtil { 320a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath public: 330a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Multiplier for incremental hashing. The compiler should be smart enough to 340a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // convert (val * kMult) into ((val << 8) + val). 350a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath static const uint32_t kMult = 257; 360a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 370a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // All hashes are returned modulo "kBase". Current implementation requires 380a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // kBase <= 2^32/kMult to avoid overflow. Also, kBase must be a power of two 390a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // so that we can compute modulus efficiently. 400a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath static const uint32_t kBase = (1 << 23); 410a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 420a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Returns operand % kBase, assuming that kBase is a power of two. 430a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath static inline uint32_t ModBase(uint32_t operand) { 440a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath return operand & (kBase - 1); 450a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath } 460a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 470a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Given an unsigned integer "operand", returns an unsigned integer "result" 480a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // such that 490a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // result < kBase 500a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // and 510a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // ModBase(operand + result) == 0 520a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath static inline uint32_t FindModBaseInverse(uint32_t operand) { 530a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // The subtraction (0 - operand) produces an unsigned underflow for any 540a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // operand except 0. The underflow results in a (very large) unsigned 550a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // number. Binary subtraction is used instead of unary negation because 560a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // some compilers (e.g. Visual Studio 7+) produce a warning if an unsigned 570a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // value is negated. 580a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // 590a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // The C++ mod operation (operand % kBase) may produce different results for 600a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // different compilers if operand is negative. That is not a problem in 610a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // this case, since all numbers used are unsigned, and ModBase does its work 620a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // using bitwise arithmetic rather than the % operator. 630a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath return ModBase(uint32_t(0) - operand); 640a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath } 650a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 660a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Here's the heart of the hash algorithm. Start with a partial_hash value of 670a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // 0, and run this HashStep once against each byte in the data window to be 680a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // hashed. The result will be the hash value for the entire data window. The 690a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Hash() function, below, does exactly this, albeit with some refinements. 700a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath static inline uint32_t HashStep(uint32_t partial_hash, 710a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath unsigned char next_byte) { 720a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath return ModBase((partial_hash * kMult) + next_byte); 730a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath } 740a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 750a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Use this function to start computing a new hash value based on the first 760a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // two bytes in the window. It is equivalent to calling 770a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // HashStep(HashStep(0, ptr[0]), ptr[1]) 780a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // but takes advantage of the fact that the maximum value of 790a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // (ptr[0] * kMult) + ptr[1] is not large enough to exceed kBase, thus 800a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // avoiding an unnecessary ModBase operation. 810a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath static inline uint32_t HashFirstTwoBytes(const char* ptr) { 820a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath return (static_cast<unsigned char>(ptr[0]) * kMult) 830a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath + static_cast<unsigned char>(ptr[1]); 840a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath } 850a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath private: 860a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Making these private avoids copy constructor and assignment operator. 870a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // No objects of this type should be constructed. 880a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath RollingHashUtil(); 890a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath RollingHashUtil(const RollingHashUtil&); // NOLINT 900a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath void operator=(const RollingHashUtil&); 910a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath}; 920a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 930a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// window_size must be >= 2. 940a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathtemplate<int window_size> 950a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathclass RollingHash { 960a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath public: 970a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Perform global initialization that is required in order to instantiate a 980a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // RollingHash. This function *must* be called (preferably on startup) by any 990a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // program that uses a RollingHash. It is harmless to call this function more 1000a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // than once. It is not thread-safe, but calling it from two different 1010a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // threads at the same time can only cause a memory leak, not incorrect 1020a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // behavior. Make sure to call it before spawning any threads that could use 1030a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // RollingHash. 1040a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath static void Init(); 1050a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 1060a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Initialize hasher to maintain a window of the specified size. You need an 1070a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // instance of this type to use UpdateHash(), but Hash() does not depend on 1080a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // remove_table_, so it is static. 1090a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath RollingHash() { 1100a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath if (!remove_table_) { 1110a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath VCD_DFATAL << "RollingHash object instantiated" 1120a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath " before calling RollingHash::Init()" << VCD_ENDL; 1130a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath } 1140a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath } 1150a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 1160a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Compute a hash of the window "ptr[0, window_size - 1]". 1170a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath static uint32_t Hash(const char* ptr) { 1180a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath uint32_t h = RollingHashUtil::HashFirstTwoBytes(ptr); 1190a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath for (int i = 2; i < window_size; ++i) { 1200a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath h = RollingHashUtil::HashStep(h, ptr[i]); 1210a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath } 1220a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath return h; 1230a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath } 1240a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 1250a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Update a hash by removing the oldest byte and adding a new byte. 1260a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // 1270a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // UpdateHash takes the hash value of buffer[0] ... buffer[window_size -1] 1280a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // along with the value of buffer[0] (the "old_first_byte" argument) 1290a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // and the value of buffer[window_size] (the "new_last_byte" argument). 1300a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // It quickly computes the hash value of buffer[1] ... buffer[window_size] 1310a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // without having to run Hash() on the entire window. 1320a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // 1330a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // The larger the window, the more advantage comes from using UpdateHash() 1340a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // (which runs in time independent of window_size) instead of Hash(). 1350a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Each time window_size doubles, the time to execute Hash() also doubles, 1360a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // while the time to execute UpdateHash() remains constant. Empirical tests 1370a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // have borne out this statement. 1380a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath uint32_t UpdateHash(uint32_t old_hash, 1390a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath const char old_first_byte, 1400a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath const char new_last_byte) const { 1410a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath uint32_t partial_hash = RemoveFirstByteFromHash(old_hash, old_first_byte); 1420a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath return RollingHashUtil::HashStep(partial_hash, new_last_byte); 1430a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath } 1440a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 1450a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath protected: 1460a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Given a full hash value for buffer[0] ... buffer[window_size -1], plus the 1470a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // value of the first byte buffer[0], this function returns a *partial* hash 1480a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // value for buffer[1] ... buffer[window_size -1]. See the comments in 1490a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Init(), below, for a description of how the contents of remove_table_ are 1500a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // computed. 1510a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath static uint32_t RemoveFirstByteFromHash(uint32_t full_hash, 1520a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath unsigned char first_byte) { 1530a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath return RollingHashUtil::ModBase(full_hash + remove_table_[first_byte]); 1540a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath } 1550a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 1560a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath private: 1570a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // We keep a table that maps from any byte "b" to 1580a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // (- b * pow(kMult, window_size - 1)) % kBase 1590a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath static const uint32_t* remove_table_; 1600a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath}; 1610a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 1620a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// For each window_size, fill a 256-entry table such that 1630a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// the hash value of buffer[0] ... buffer[window_size - 1] 1640a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// + remove_table_[buffer[0]] 1650a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// == the hash value of buffer[1] ... buffer[window_size - 1] 1660a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// See the comments in Init(), below, for a description of how the contents of 1670a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// remove_table_ are computed. 1680a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathtemplate<int window_size> 1690a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathconst uint32_t* RollingHash<window_size>::remove_table_ = NULL; 1700a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 1710a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// Init() checks to make sure that the static object remove_table_ has been 1720a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// initialized; if not, it does the considerable work of populating it. Once 1730a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// it's ready, the table can be used for any number of RollingHash objects of 1740a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// the same window_size. 1750a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// 1760a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathtemplate<int window_size> 1770a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathvoid RollingHash<window_size>::Init() { 1780a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath VCD_COMPILE_ASSERT(window_size >= 2, 1790a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath RollingHash_window_size_must_be_at_least_2); 1800a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath if (remove_table_ == NULL) { 1810a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // The new object is placed into a local pointer instead of directly into 1820a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // remove_table_, for two reasons: 1830a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // 1. remove_table_ is a pointer to const. The table is populated using 1840a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // the non-const local pointer and then assigned to the global const 1850a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // pointer once it's ready. 1860a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // 2. No other thread will ever see remove_table_ pointing to a 1870a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // partially-initialized table. If two threads happen to call Init() 1880a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // at the same time, two tables with the same contents may be created 1890a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // (causing a memory leak), but the results will be consistent 1900a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // no matter which of the two tables is used. 1910a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath uint32_t* new_remove_table = new uint32_t[256]; 1920a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Compute multiplier. Concisely, it is: 1930a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // pow(kMult, (window_size - 1)) % kBase, 1940a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // but we compute the power in integer form. 1950a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath uint32_t multiplier = 1; 1960a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath for (int i = 0; i < window_size - 1; ++i) { 1970a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath multiplier = 1980a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath RollingHashUtil::ModBase(multiplier * RollingHashUtil::kMult); 1990a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath } 2000a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // For each character removed_byte, compute 2010a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // remove_table_[removed_byte] == 2020a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // (- (removed_byte * pow(kMult, (window_size - 1)))) % kBase 2030a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // where the power operator "pow" is taken in integer form. 2040a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // 2050a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // If you take a hash value fp representing the hash of 2060a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // buffer[0] ... buffer[window_size - 1] 2070a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // and add the value of remove_table_[buffer[0]] to it, the result will be 2080a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // a partial hash value for 2090a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // buffer[1] ... buffer[window_size - 1] 2100a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // that is to say, it no longer includes buffer[0]. 2110a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // 2120a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // The following byte at buffer[window_size] can then be merged with this 2130a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // partial hash value to arrive quickly at the hash value for a window that 2140a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // has advanced by one byte, to 2150a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // buffer[1] ... buffer[window_size] 2160a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // In fact, that is precisely what happens in UpdateHash, above. 2170a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath uint32_t byte_times_multiplier = 0; 2180a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath for (int removed_byte = 0; removed_byte < 256; ++removed_byte) { 2190a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath new_remove_table[removed_byte] = 2200a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath RollingHashUtil::FindModBaseInverse(byte_times_multiplier); 2210a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // Iteratively adding the multiplier in this loop is equivalent to 2220a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath // computing (removed_byte * multiplier), and is faster 2230a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath byte_times_multiplier = 2240a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath RollingHashUtil::ModBase(byte_times_multiplier + multiplier); 2250a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath } 2260a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath remove_table_ = new_remove_table; 2270a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath } 2280a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath} 2290a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 2300a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath} // namespace open_vcdiff 2310a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath 2320a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath#endif // OPEN_VCDIFF_ROLLING_HASH_H_ 233