10a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// Copyright 2007, 2008 Google Inc.
20a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// Authors: Jeff Dean, Sanjay Ghemawat, Lincoln Smith
30a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath//
40a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// Licensed under the Apache License, Version 2.0 (the "License");
50a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// you may not use this file except in compliance with the License.
60a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// You may obtain a copy of the License at
70a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath//
80a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath//      http://www.apache.org/licenses/LICENSE-2.0
90a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath//
100a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// Unless required by applicable law or agreed to in writing, software
110a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// distributed under the License is distributed on an "AS IS" BASIS,
120a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
130a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// See the License for the specific language governing permissions and
140a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// limitations under the License.
150a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
160a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath#ifndef OPEN_VCDIFF_ROLLING_HASH_H_
170a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath#define OPEN_VCDIFF_ROLLING_HASH_H_
180a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
190a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath#include <config.h>
200a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath#include <stdint.h>  // uint32_t
210a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath#include "compile_assert.h"
220a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath#include "logging.h"
230a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
240a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathnamespace open_vcdiff {
250a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
260a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// Rabin-Karp hasher module -- this is a faster version with different
270a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// constants, so it's not quite Rabin-Karp fingerprinting, but its behavior is
280a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// close enough for most applications.
290a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
300a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// Definitions common to all hash window sizes.
310a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathclass RollingHashUtil {
320a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath public:
330a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // Multiplier for incremental hashing.  The compiler should be smart enough to
340a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // convert (val * kMult) into ((val << 8) + val).
350a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  static const uint32_t kMult = 257;
360a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
370a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // All hashes are returned modulo "kBase".  Current implementation requires
380a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // kBase <= 2^32/kMult to avoid overflow.  Also, kBase must be a power of two
390a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // so that we can compute modulus efficiently.
400a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  static const uint32_t kBase = (1 << 23);
410a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
420a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // Returns operand % kBase, assuming that kBase is a power of two.
430a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  static inline uint32_t ModBase(uint32_t operand) {
440a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    return operand & (kBase - 1);
450a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  }
460a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
470a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // Given an unsigned integer "operand", returns an unsigned integer "result"
480a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // such that
490a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  //     result < kBase
500a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // and
510a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  //     ModBase(operand + result) == 0
520a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  static inline uint32_t FindModBaseInverse(uint32_t operand) {
530a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // The subtraction (0 - operand) produces an unsigned underflow for any
540a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // operand except 0.  The underflow results in a (very large) unsigned
550a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // number.  Binary subtraction is used instead of unary negation because
560a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // some compilers (e.g. Visual Studio 7+) produce a warning if an unsigned
570a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // value is negated.
580a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //
590a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // The C++ mod operation (operand % kBase) may produce different results for
600a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // different compilers if operand is negative.  That is not a problem in
610a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // this case, since all numbers used are unsigned, and ModBase does its work
620a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // using bitwise arithmetic rather than the % operator.
630a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    return ModBase(uint32_t(0) - operand);
640a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  }
650a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
660a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // Here's the heart of the hash algorithm.  Start with a partial_hash value of
670a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // 0, and run this HashStep once against each byte in the data window to be
680a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // hashed.  The result will be the hash value for the entire data window.  The
690a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // Hash() function, below, does exactly this, albeit with some refinements.
700a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  static inline uint32_t HashStep(uint32_t partial_hash,
710a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath                                  unsigned char next_byte) {
720a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    return ModBase((partial_hash * kMult) + next_byte);
730a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  }
740a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
750a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // Use this function to start computing a new hash value based on the first
760a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // two bytes in the window.  It is equivalent to calling
770a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  //     HashStep(HashStep(0, ptr[0]), ptr[1])
780a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // but takes advantage of the fact that the maximum value of
790a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // (ptr[0] * kMult) + ptr[1] is not large enough to exceed kBase, thus
800a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // avoiding an unnecessary ModBase operation.
810a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  static inline uint32_t HashFirstTwoBytes(const char* ptr) {
820a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    return (static_cast<unsigned char>(ptr[0]) * kMult)
830a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath        + static_cast<unsigned char>(ptr[1]);
840a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  }
850a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath private:
860a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // Making these private avoids copy constructor and assignment operator.
870a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // No objects of this type should be constructed.
880a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  RollingHashUtil();
890a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  RollingHashUtil(const RollingHashUtil&);  // NOLINT
900a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  void operator=(const RollingHashUtil&);
910a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath};
920a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
930a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// window_size must be >= 2.
940a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathtemplate<int window_size>
950a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathclass RollingHash {
960a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath public:
970a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // Perform global initialization that is required in order to instantiate a
980a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // RollingHash.  This function *must* be called (preferably on startup) by any
990a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // program that uses a RollingHash.  It is harmless to call this function more
1000a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // than once.  It is not thread-safe, but calling it from two different
1010a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // threads at the same time can only cause a memory leak, not incorrect
1020a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // behavior.  Make sure to call it before spawning any threads that could use
1030a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // RollingHash.
1040a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  static void Init();
1050a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
1060a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // Initialize hasher to maintain a window of the specified size.  You need an
1070a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // instance of this type to use UpdateHash(), but Hash() does not depend on
1080a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // remove_table_, so it is static.
1090a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  RollingHash() {
1100a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    if (!remove_table_) {
1110a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath      VCD_DFATAL << "RollingHash object instantiated"
1120a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath                    " before calling RollingHash::Init()" << VCD_ENDL;
1130a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    }
1140a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  }
1150a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
1160a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // Compute a hash of the window "ptr[0, window_size - 1]".
1170a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  static uint32_t Hash(const char* ptr) {
1180a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    uint32_t h = RollingHashUtil::HashFirstTwoBytes(ptr);
1190a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    for (int i = 2; i < window_size; ++i) {
1200a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath      h = RollingHashUtil::HashStep(h, ptr[i]);
1210a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    }
1220a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    return h;
1230a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  }
1240a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
1250a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // Update a hash by removing the oldest byte and adding a new byte.
1260a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  //
1270a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // UpdateHash takes the hash value of buffer[0] ... buffer[window_size -1]
1280a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // along with the value of buffer[0] (the "old_first_byte" argument)
1290a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // and the value of buffer[window_size] (the "new_last_byte" argument).
1300a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // It quickly computes the hash value of buffer[1] ... buffer[window_size]
1310a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // without having to run Hash() on the entire window.
1320a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  //
1330a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // The larger the window, the more advantage comes from using UpdateHash()
1340a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // (which runs in time independent of window_size) instead of Hash().
1350a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // Each time window_size doubles, the time to execute Hash() also doubles,
1360a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // while the time to execute UpdateHash() remains constant.  Empirical tests
1370a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // have borne out this statement.
1380a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  uint32_t UpdateHash(uint32_t old_hash,
1390a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath                      const char old_first_byte,
1400a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath                      const char new_last_byte) const {
1410a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    uint32_t partial_hash = RemoveFirstByteFromHash(old_hash, old_first_byte);
1420a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    return RollingHashUtil::HashStep(partial_hash, new_last_byte);
1430a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  }
1440a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
1450a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath protected:
1460a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // Given a full hash value for buffer[0] ... buffer[window_size -1], plus the
1470a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // value of the first byte buffer[0], this function returns a *partial* hash
1480a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // value for buffer[1] ... buffer[window_size -1].  See the comments in
1490a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // Init(), below, for a description of how the contents of remove_table_ are
1500a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // computed.
1510a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  static uint32_t RemoveFirstByteFromHash(uint32_t full_hash,
1520a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath                                          unsigned char first_byte) {
1530a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    return RollingHashUtil::ModBase(full_hash + remove_table_[first_byte]);
1540a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  }
1550a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
1560a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath private:
1570a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  // We keep a table that maps from any byte "b" to
1580a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  //    (- b * pow(kMult, window_size - 1)) % kBase
1590a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  static const uint32_t* remove_table_;
1600a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath};
1610a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
1620a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// For each window_size, fill a 256-entry table such that
1630a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath//        the hash value of buffer[0] ... buffer[window_size - 1]
1640a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath//      + remove_table_[buffer[0]]
1650a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath//     == the hash value of buffer[1] ... buffer[window_size - 1]
1660a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// See the comments in Init(), below, for a description of how the contents of
1670a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// remove_table_ are computed.
1680a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathtemplate<int window_size>
1690a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathconst uint32_t* RollingHash<window_size>::remove_table_ = NULL;
1700a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
1710a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// Init() checks to make sure that the static object remove_table_ has been
1720a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// initialized; if not, it does the considerable work of populating it.  Once
1730a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// it's ready, the table can be used for any number of RollingHash objects of
1740a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath// the same window_size.
1750a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath//
1760a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathtemplate<int window_size>
1770a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamathvoid RollingHash<window_size>::Init() {
1780a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  VCD_COMPILE_ASSERT(window_size >= 2,
1790a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath                     RollingHash_window_size_must_be_at_least_2);
1800a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  if (remove_table_ == NULL) {
1810a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // The new object is placed into a local pointer instead of directly into
1820a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // remove_table_, for two reasons:
1830a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //   1. remove_table_ is a pointer to const.  The table is populated using
1840a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //      the non-const local pointer and then assigned to the global const
1850a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //      pointer once it's ready.
1860a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //   2. No other thread will ever see remove_table_ pointing to a
1870a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //      partially-initialized table.  If two threads happen to call Init()
1880a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //      at the same time, two tables with the same contents may be created
1890a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //      (causing a memory leak), but the results will be consistent
1900a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //      no matter which of the two tables is used.
1910a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    uint32_t* new_remove_table = new uint32_t[256];
1920a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // Compute multiplier.  Concisely, it is:
1930a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //     pow(kMult, (window_size - 1)) % kBase,
1940a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // but we compute the power in integer form.
1950a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    uint32_t multiplier = 1;
1960a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    for (int i = 0; i < window_size - 1; ++i) {
1970a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath      multiplier =
1980a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath          RollingHashUtil::ModBase(multiplier * RollingHashUtil::kMult);
1990a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    }
2000a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // For each character removed_byte, compute
2010a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //     remove_table_[removed_byte] ==
2020a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //         (- (removed_byte * pow(kMult, (window_size - 1)))) % kBase
2030a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // where the power operator "pow" is taken in integer form.
2040a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //
2050a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // If you take a hash value fp representing the hash of
2060a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //     buffer[0] ... buffer[window_size - 1]
2070a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // and add the value of remove_table_[buffer[0]] to it, the result will be
2080a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // a partial hash value for
2090a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //     buffer[1] ... buffer[window_size - 1]
2100a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // that is to say, it no longer includes buffer[0].
2110a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //
2120a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // The following byte at buffer[window_size] can then be merged with this
2130a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // partial hash value to arrive quickly at the hash value for a window that
2140a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // has advanced by one byte, to
2150a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    //     buffer[1] ... buffer[window_size]
2160a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    // In fact, that is precisely what happens in UpdateHash, above.
2170a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    uint32_t byte_times_multiplier = 0;
2180a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    for (int removed_byte = 0; removed_byte < 256; ++removed_byte) {
2190a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath      new_remove_table[removed_byte] =
2200a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath          RollingHashUtil::FindModBaseInverse(byte_times_multiplier);
2210a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath      // Iteratively adding the multiplier in this loop is equivalent to
2220a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath      // computing (removed_byte * multiplier), and is faster
2230a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath      byte_times_multiplier =
2240a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath          RollingHashUtil::ModBase(byte_times_multiplier + multiplier);
2250a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    }
2260a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath    remove_table_ = new_remove_table;
2270a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath  }
2280a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath}
2290a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
2300a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath}  // namespace open_vcdiff
2310a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath
2320a58c5c2f73e5047b36f12b5f12b12d6f2a9f69dNarayan Kamath#endif  // OPEN_VCDIFF_ROLLING_HASH_H_
233