1c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Copyright 2007, 2008 Google Inc.
2c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Authors: Jeff Dean, Sanjay Ghemawat, Lincoln Smith
3c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//
4c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Licensed under the Apache License, Version 2.0 (the "License");
5c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// you may not use this file except in compliance with the License.
6c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// You may obtain a copy of the License at
7c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//
8c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//      http://www.apache.org/licenses/LICENSE-2.0
9c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//
10c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Unless required by applicable law or agreed to in writing, software
11c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// distributed under the License is distributed on an "AS IS" BASIS,
12c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// See the License for the specific language governing permissions and
14c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// limitations under the License.
15c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
16c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#ifndef OPEN_VCDIFF_ROLLING_HASH_H_
17c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#define OPEN_VCDIFF_ROLLING_HASH_H_
18c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
19c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include <config.h>
20c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include <stdint.h>  // uint32_t
21c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "logging.h"
22c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
23c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottnamespace open_vcdiff {
24c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
25c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Rabin-Karp hasher module -- this is a faster version with different
26c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// constants, so it's not quite Rabin-Karp fingerprinting, but its behavior is
27c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// close enough for most applications.
28c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
29c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Definitions common to all hash window sizes.
30c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottclass RollingHashUtil {
31c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott public:
32c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Multiplier for incremental hashing.  The compiler should be smart enough to
33c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // convert (val * kMult) into ((val << 8) + val).
34c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  static const uint32_t kMult = 257;
35c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
36c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // All hashes are returned modulo "kBase".  Current implementation requires
37c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // kBase <= 2^32/kMult to avoid overflow.  Also, kBase must be a power of two
38c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // so that we can compute modulus efficiently.
39c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  static const uint32_t kBase = (1 << 23);
40c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
41c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Returns operand % kBase, assuming that kBase is a power of two.
42c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  static inline uint32_t ModBase(uint32_t operand) {
43c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return operand & (kBase - 1);
44c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
45c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
46c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Given an unsigned integer "operand", returns an unsigned integer "result"
47c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // such that
48c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  //     result < kBase
49c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // and
50c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  //     ModBase(operand + result) == 0
51c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  static inline uint32_t FindModBaseInverse(uint32_t operand) {
52c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // The subtraction (0 - operand) produces an unsigned underflow for any
53c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // operand except 0.  The underflow results in a (very large) unsigned
54c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // number.  Binary subtraction is used instead of unary negation because
55c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // some compilers (e.g. Visual Studio 7+) produce a warning if an unsigned
56c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // value is negated.
57c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //
58c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // The C++ mod operation (operand % kBase) may produce different results for
59c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // different compilers if operand is negative.  That is not a problem in
60c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // this case, since all numbers used are unsigned, and ModBase does its work
61c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // using bitwise arithmetic rather than the % operator.
62c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return ModBase(uint32_t(0) - operand);
63c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
64c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
65c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Here's the heart of the hash algorithm.  Start with a partial_hash value of
66c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // 0, and run this HashStep once against each byte in the data window to be
67c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // hashed.  The result will be the hash value for the entire data window.  The
68c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Hash() function, below, does exactly this, albeit with some refinements.
69c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  static inline uint32_t HashStep(uint32_t partial_hash,
70c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                                  unsigned char next_byte) {
71c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return ModBase((partial_hash * kMult) + next_byte);
72c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
73c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
74c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Use this function to start computing a new hash value based on the first
75c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // two bytes in the window.  It is equivalent to calling
76c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  //     HashStep(HashStep(0, ptr[0]), ptr[1])
77c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // but takes advantage of the fact that the maximum value of
78c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // (ptr[0] * kMult) + ptr[1] is not large enough to exceed kBase, thus
79c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // avoiding an unnecessary ModBase operation.
80c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  static inline uint32_t HashFirstTwoBytes(const char* ptr) {
81c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return (static_cast<unsigned char>(ptr[0]) * kMult)
82c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        + static_cast<unsigned char>(ptr[1]);
83c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
84c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott private:
85c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Making these private avoids copy constructor and assignment operator.
86c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // No objects of this type should be constructed.
87c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  RollingHashUtil();
88c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  RollingHashUtil(const RollingHashUtil&);  // NOLINT
89c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  void operator=(const RollingHashUtil&);
90c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott};
91c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
92c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// window_size must be >= 2.
93c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scotttemplate<int window_size>
94c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottclass RollingHash {
95c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott public:
96c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Perform global initialization that is required in order to instantiate a
97c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // RollingHash.  This function *must* be called (preferably on startup) by any
98c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // program that uses a RollingHash.  It is harmless to call this function more
99c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // than once.  It is not thread-safe, but calling it from two different
100c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // threads at the same time can only cause a memory leak, not incorrect
101c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // behavior.  Make sure to call it before spawning any threads that could use
102c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // RollingHash.  The function returns true if initialization succeeds, or
103c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // false if initialization fails, in which case the caller should not proceed
104c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // to construct any objects of type RollingHash.
105c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  static bool Init();
106c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
107c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Initialize hasher to maintain a window of the specified size.  You need an
108c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // instance of this type to use UpdateHash(), but Hash() does not depend on
109c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // remove_table_, so it is static.
110c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  RollingHash() {
111c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (!remove_table_) {
112c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      LOG(DFATAL) << "RollingHash object instantiated"
113c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                     " before calling RollingHash::Init()" << LOG_ENDL;
114c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
115c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
116c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
117c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Compute a hash of the window "ptr[0, window_size - 1]".
118c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  static uint32_t Hash(const char* ptr) {
119c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    uint32_t h = RollingHashUtil::HashFirstTwoBytes(ptr);
120c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    for (int i = 2; i < window_size; ++i) {
121c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      h = RollingHashUtil::HashStep(h, ptr[i]);
122c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
123c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return h;
124c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
125c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
126c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Update a hash by removing the oldest byte and adding a new byte.
127c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  //
128c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // UpdateHash takes the hash value of buffer[0] ... buffer[window_size -1]
129c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // along with the value of buffer[0] (the "old_first_byte" argument)
130c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // and the value of buffer[window_size] (the "new_last_byte" argument).
131c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // It quickly computes the hash value of buffer[1] ... buffer[window_size]
132c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // without having to run Hash() on the entire window.
133c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  //
134c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // The larger the window, the more advantage comes from using UpdateHash()
135c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // (which runs in time independent of window_size) instead of Hash().
136c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Each time window_size doubles, the time to execute Hash() also doubles,
137c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // while the time to execute UpdateHash() remains constant.  Empirical tests
138c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // have borne out this statement.
139c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  uint32_t UpdateHash(uint32_t old_hash,
140c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                      const char old_first_byte,
141c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                      const char new_last_byte) const {
142c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    uint32_t partial_hash = RemoveFirstByteFromHash(old_hash, old_first_byte);
143c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return RollingHashUtil::HashStep(partial_hash, new_last_byte);
144c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
145c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
146c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott protected:
147c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Given a full hash value for buffer[0] ... buffer[window_size -1], plus the
148c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // value of the first byte buffer[0], this function returns a *partial* hash
149c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // value for buffer[1] ... buffer[window_size -1].  See the comments in
150c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Init(), below, for a description of how the contents of remove_table_ are
151c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // computed.
152c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  static uint32_t RemoveFirstByteFromHash(uint32_t full_hash,
153c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                                          unsigned char first_byte) {
154c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return RollingHashUtil::ModBase(full_hash + remove_table_[first_byte]);
155c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
156c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
157c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott private:
158c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // We keep a table that maps from any byte "b" to
159c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  //    (- b * pow(kMult, window_size - 1)) % kBase
160c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  static const uint32_t* remove_table_;
161c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott};
162c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
163c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// For each window_size, fill a 256-entry table such that
164c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//        the hash value of buffer[0] ... buffer[window_size - 1]
165c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//      + remove_table_[buffer[0]]
166c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//     == the hash value of buffer[1] ... buffer[window_size - 1]
167c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// See the comments in Init(), below, for a description of how the contents of
168c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// remove_table_ are computed.
169c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scotttemplate<int window_size>
170c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottconst uint32_t* RollingHash<window_size>::remove_table_ = NULL;
171c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
172c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Init() checks to make sure that the static object remove_table_ has been
173c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// initialized; if not, it does the considerable work of populating it.  Once
174c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// it's ready, the table can be used for any number of RollingHash objects of
175c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// the same window_size.
176c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//
177c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scotttemplate<int window_size>
178c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool RollingHash<window_size>::Init() {
179c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (window_size < 2) {
180c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    LOG(ERROR) << "RollingHash window size " << window_size
181c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott               << " is too small" << LOG_ENDL;
182c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return false;
183c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
184c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (remove_table_ == NULL) {
185c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // The new object is placed into a local pointer instead of directly into
186c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // remove_table_, for two reasons:
187c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //   1. remove_table_ is a pointer to const.  The table is populated using
188c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //      the non-const local pointer and then assigned to the global const
189c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //      pointer once it's ready.
190c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //   2. No other thread will ever see remove_table_ pointing to a
191c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //      partially-initialized table.  If two threads happen to call Init()
192c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //      at the same time, two tables with the same contents may be created
193c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //      (causing a memory leak), but the results will be consistent
194c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //      no matter which of the two tables is used.
195c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    uint32_t* new_remove_table = new uint32_t[256];
196c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // Compute multiplier.  Concisely, it is:
197c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //     pow(kMult, (window_size - 1)) % kBase,
198c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // but we compute the power in integer form.
199c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    uint32_t multiplier = 1;
200c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    for (int i = 0; i < window_size - 1; ++i) {
201c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      multiplier =
202c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott          RollingHashUtil::ModBase(multiplier * RollingHashUtil::kMult);
203c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
204c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // For each character removed_byte, compute
205c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //     remove_table_[removed_byte] ==
206c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //         (- (removed_byte * pow(kMult, (window_size - 1)))) % kBase
207c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // where the power operator "pow" is taken in integer form.
208c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //
209c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // If you take a hash value fp representing the hash of
210c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //     buffer[0] ... buffer[window_size - 1]
211c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // and add the value of remove_table_[buffer[0]] to it, the result will be
212c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // a partial hash value for
213c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //     buffer[1] ... buffer[window_size - 1]
214c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // that is to say, it no longer includes buffer[0].
215c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //
216c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // The following byte at buffer[window_size] can then be merged with this
217c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // partial hash value to arrive quickly at the hash value for a window that
218c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // has advanced by one byte, to
219c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //     buffer[1] ... buffer[window_size]
220c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // In fact, that is precisely what happens in UpdateHash, above.
221c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    uint32_t byte_times_multiplier = 0;
222c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    for (int removed_byte = 0; removed_byte < 256; ++removed_byte) {
223c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      new_remove_table[removed_byte] =
224c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott          RollingHashUtil::FindModBaseInverse(byte_times_multiplier);
225c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      // Iteratively adding the multiplier in this loop is equivalent to
226c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      // computing (removed_byte * multiplier), and is faster
227c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      byte_times_multiplier =
228c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott          RollingHashUtil::ModBase(byte_times_multiplier + multiplier);
229c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
230c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    remove_table_ = new_remove_table;
231c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
232c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return true;
233c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
234c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
235c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}  // namespace open_vcdiff
236c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
237c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif  // OPEN_VCDIFF_ROLLING_HASH_H_
238