1// Copyright 2006, 2008 Google Inc.
2// Authors: Chandra Chereddi, Lincoln Smith
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//      http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#include <config.h>
17#include "blockhash.h"
18#include <stdint.h>  // uint32_t
19#include <string.h>  // memcpy, memcmp
20#include <algorithm>  // std::min
21#include "compile_assert.h"
22#include "logging.h"
23#include "rolling_hash.h"
24
25namespace open_vcdiff {
26
typedef unsigned long uword_t;  // a machine word; blocks are compared one uword_t at a time  NOLINT
28
29BlockHash::BlockHash(const char* source_data,
30                     size_t source_size,
31                     int starting_offset)
32    : source_data_(source_data),
33      source_size_(source_size),
34      hash_table_mask_(0),
35      starting_offset_(starting_offset),
36      last_block_added_(-1) {
37}
38
39BlockHash::~BlockHash() { }
40
// kBlockSize must be at least 2 to be meaningful.  Since it's a compile-time
// constant, check its value at compile time rather than wasting CPU cycles
// on runtime checks.
VCD_COMPILE_ASSERT(BlockHash::kBlockSize >= 2, kBlockSize_must_be_at_least_2);

// kBlockSize is required to be a power of 2 because multiplication
// (n * kBlockSize), division (n / kBlockSize) and MOD (n % kBlockSize)
// are commonly-used operations.  If kBlockSize is a compile-time
// constant and a power of 2, the compiler can convert these three operations
// into bit-shift (>> or <<) and bitwise-AND (&) operations, which are much
// more efficient than executing full integer multiply, divide, or remainder
// instructions.
// (The power-of-2 test below relies on the identity: for x > 0,
// (x & (x - 1)) == 0 iff x has exactly one bit set.)
VCD_COMPILE_ASSERT((BlockHash::kBlockSize & (BlockHash::kBlockSize - 1)) == 0,
                   kBlockSize_must_be_a_power_of_2);
55
56bool BlockHash::Init(bool populate_hash_table) {
57  if (!hash_table_.empty() ||
58      !next_block_table_.empty() ||
59      !last_block_table_.empty()) {
60    VCD_DFATAL << "Init() called twice for same BlockHash object" << VCD_ENDL;
61    return false;
62  }
63  const size_t table_size = CalcTableSize(source_size_);
64  if (table_size == 0) {
65    VCD_DFATAL << "Error finding table size for source size " << source_size_
66               << VCD_ENDL;
67    return false;
68  }
69  // Since table_size is a power of 2, (table_size - 1) is a bit mask
70  // containing all the bits below table_size.
71  hash_table_mask_ = static_cast<uint32_t>(table_size - 1);
72  hash_table_.resize(table_size, -1);
73  next_block_table_.resize(GetNumberOfBlocks(), -1);
74  last_block_table_.resize(GetNumberOfBlocks(), -1);
75  if (populate_hash_table) {
76    AddAllBlocks();
77  }
78  return true;
79}
80
81const BlockHash* BlockHash::CreateDictionaryHash(const char* dictionary_data,
82                                                 size_t dictionary_size) {
83  BlockHash* new_dictionary_hash = new BlockHash(dictionary_data,
84                                                 dictionary_size,
85                                                 0);
86  if (!new_dictionary_hash->Init(/* populate_hash_table = */ true)) {
87    delete new_dictionary_hash;
88    return NULL;
89  } else {
90    return new_dictionary_hash;
91  }
92}
93
94BlockHash* BlockHash::CreateTargetHash(const char* target_data,
95                                       size_t target_size,
96                                       size_t dictionary_size) {
97  BlockHash* new_target_hash = new BlockHash(target_data,
98                                             target_size,
99                                             static_cast<int>(dictionary_size));
100  if (!new_target_hash->Init(/* populate_hash_table = */ false)) {
101    delete new_target_hash;
102    return NULL;
103  } else {
104    return new_target_hash;
105  }
106}
107
108// Returns zero if an error occurs.
109size_t BlockHash::CalcTableSize(const size_t dictionary_size) {
110  // Overallocate the hash table by making it the same size (in bytes)
111  // as the source data.  This is a trade-off between space and time:
112  // the empty entries in the hash table will reduce the
113  // probability of a hash collision to (sizeof(int) / kblockSize),
114  // and so save time comparing false matches.
115  const size_t min_size = (dictionary_size / sizeof(int)) + 1;  // NOLINT
116  size_t table_size = 1;
117  // Find the smallest power of 2 that is >= min_size, and assign
118  // that value to table_size.
119  while (table_size < min_size) {
120    table_size <<= 1;
121    // Guard against an infinite loop
122    if (table_size <= 0) {
123      VCD_DFATAL << "Internal error: CalcTableSize(dictionary_size = "
124                 << dictionary_size
125                 << "): resulting table_size " << table_size
126                 << " is zero or negative" << VCD_ENDL;
127      return 0;
128    }
129  }
130  // Check size sanity
131  if ((table_size & (table_size - 1)) != 0) {
132    VCD_DFATAL << "Internal error: CalcTableSize(dictionary_size = "
133               << dictionary_size
134               << "): resulting table_size " << table_size
135               << " is not a power of 2" << VCD_ENDL;
136    return 0;
137  }
138  // The loop above tries to find the smallest power of 2 that is >= min_size.
139  // That value must lie somewhere between min_size and (min_size * 2),
140  // except for the case (dictionary_size == 0, table_size == 1).
141  if ((dictionary_size > 0) && (table_size > (min_size * 2))) {
142    VCD_DFATAL << "Internal error: CalcTableSize(dictionary_size = "
143               << dictionary_size
144               << "): resulting table_size " << table_size
145               << " is too large" << VCD_ENDL;
146    return 0;
147  }
148  return table_size;
149}
150
// If the hash value is already available from the rolling hash,
// call this function to save time.
//
// Appends the next not-yet-hashed block to the hash table.  Blocks are
// always added sequentially, so the block number is implicit: one more
// than last_block_added_.  Blocks sharing a hash bucket form a singly
// linked chain through next_block_table_; last_block_table_, indexed by
// a chain's *first* block, caches the chain's tail so appending is O(1).
void BlockHash::AddBlock(uint32_t hash_value) {
  if (hash_table_.empty()) {
    VCD_DFATAL << "BlockHash::AddBlock() called before BlockHash::Init()"
               << VCD_ENDL;
    return;
  }
  // The initial value of last_block_added_ is -1.
  int block_number = last_block_added_ + 1;
  const int total_blocks =
      static_cast<int>(source_size_ / kBlockSize);  // round down
  if (block_number >= total_blocks) {
    VCD_DFATAL << "BlockHash::AddBlock() called"
                  " with block number " << block_number
               << " that is past last block " << (total_blocks - 1)
               << VCD_ENDL;
    return;
  }
  // A block is added at most once, so its chain link must still be unset.
  if (next_block_table_[block_number] != -1) {
    VCD_DFATAL << "Internal error in BlockHash::AddBlock(): "
                  "block number = " << block_number
               << ", next block should be -1 but is "
               << next_block_table_[block_number] << VCD_ENDL;
    return;
  }
  const uint32_t hash_table_index = GetHashTableIndex(hash_value);
  const int first_matching_block = hash_table_[hash_table_index];
  if (first_matching_block < 0) {
    // This is the first entry with this hash value; it becomes both the
    // head of the chain and (via last_block_table_) its own tail.
    hash_table_[hash_table_index] = block_number;
    last_block_table_[block_number] = block_number;
  } else {
    // Add this entry at the end of the chain of matching blocks.
    // The tail is found through the chain's head, not by walking the list.
    const int last_matching_block = last_block_table_[first_matching_block];
    if (next_block_table_[last_matching_block] != -1) {
      // The cached tail must not already have a successor.
      VCD_DFATAL << "Internal error in BlockHash::AddBlock(): "
                    "first matching block = " << first_matching_block
                 << ", last matching block = " << last_matching_block
                 << ", next block should be -1 but is "
                 << next_block_table_[last_matching_block] << VCD_ENDL;
      return;
    }
    next_block_table_[last_matching_block] = block_number;
    last_block_table_[first_matching_block] = block_number;
  }
  last_block_added_ = block_number;
}
199
200void BlockHash::AddAllBlocks() {
201  AddAllBlocksThroughIndex(static_cast<int>(source_size_));
202}
203
204void BlockHash::AddAllBlocksThroughIndex(int end_index) {
205  if (end_index > static_cast<int>(source_size_)) {
206    VCD_DFATAL << "BlockHash::AddAllBlocksThroughIndex() called"
207                  " with index " << end_index
208               << " higher than end index  " << source_size_ << VCD_ENDL;
209    return;
210  }
211  const int last_index_added = last_block_added_ * kBlockSize;
212  if (end_index <= last_index_added) {
213    VCD_DFATAL << "BlockHash::AddAllBlocksThroughIndex() called"
214                  " with index " << end_index
215               << " <= last index added ( " << last_index_added
216               << ")" << VCD_ENDL;
217    return;
218  }
219  int end_limit = end_index;
220  // Don't allow reading any indices at or past source_size_.
221  // The Hash function extends (kBlockSize - 1) bytes past the index,
222  // so leave a margin of that size.
223  int last_legal_hash_index = static_cast<int>(source_size() - kBlockSize);
224  if (end_limit > last_legal_hash_index) {
225    end_limit = last_legal_hash_index + 1;
226  }
227  const char* block_ptr = source_data() + NextIndexToAdd();
228  const char* const end_ptr = source_data() + end_limit;
229  while (block_ptr < end_ptr) {
230    AddBlock(RollingHash<kBlockSize>::Hash(block_ptr));
231    block_ptr += kBlockSize;
232  }
233}
234
// BlockCompareWordsInline (below) compares blocks as a series of whole
// uword_t values, so kBlockSize must be an exact multiple of the machine
// word size for the final word not to read past the block.
VCD_COMPILE_ASSERT((BlockHash::kBlockSize % sizeof(uword_t)) == 0,
                   kBlockSize_must_be_a_multiple_of_machine_word_size);
237
238// A recursive template to compare a fixed number
239// of (possibly unaligned) machine words starting
240// at addresses block1 and block2.  Returns true or false
241// depending on whether an exact match was found.
242template<int number_of_words>
243inline bool CompareWholeWordValues(const char* block1,
244                                   const char* block2) {
245  return CompareWholeWordValues<1>(block1, block2) &&
246         CompareWholeWordValues<number_of_words - 1>(block1 + sizeof(uword_t),
247                                                     block2 + sizeof(uword_t));
248}
249
250// The base of the recursive template: compare one pair of machine words.
251template<>
252inline bool CompareWholeWordValues<1>(const char* word1,
253                                      const char* word2) {
254  uword_t aligned_word1, aligned_word2;
255  memcpy(&aligned_word1, word1, sizeof(aligned_word1));
256  memcpy(&aligned_word2, word2, sizeof(aligned_word2));
257  return aligned_word1 == aligned_word2;
258}
259
260// A block must be composed of an integral number of machine words
261// (uword_t values.)  This function takes advantage of that fact
262// by comparing the blocks as series of (possibly unaligned) word values.
263// A word-sized comparison can be performed as a single
264// machine instruction.  Comparing words instead of bytes means that,
265// on a 64-bit platform, this function will use 8 times fewer test-and-branch
266// instructions than a byte-by-byte comparison.  Even with the extra
267// cost of the calls to memcpy, this method is still at least twice as fast
268// as memcmp (measured using gcc on a 64-bit platform, with a block size
269// of 32.)  For blocks with identical contents (a common case), this method
270// is over six times faster than memcmp.
271inline bool BlockCompareWordsInline(const char* block1, const char* block2) {
272  static const size_t kWordsPerBlock = BlockHash::kBlockSize / sizeof(uword_t);
273  return CompareWholeWordValues<kWordsPerBlock>(block1, block2);
274}
275
276bool BlockHash::BlockCompareWords(const char* block1, const char* block2) {
277  return BlockCompareWordsInline(block1, block2);
278}
279
280inline bool BlockContentsMatchInline(const char* block1, const char* block2) {
281  // Optimize for mismatch in first byte.  Since this function is called only
282  // when the hash values of the two blocks match, it is very likely that either
283  // the blocks are identical, or else the first byte does not match.
284  if (*block1 != *block2) {
285    return false;
286  }
287#ifdef VCDIFF_USE_BLOCK_COMPARE_WORDS
288  return BlockCompareWordsInline(block1, block2);
289#else  // !VCDIFF_USE_BLOCK_COMPARE_WORDS
290  return memcmp(block1, block2, BlockHash::kBlockSize) == 0;
291#endif  // VCDIFF_USE_BLOCK_COMPARE_WORDS
292}
293
294bool BlockHash::BlockContentsMatch(const char* block1, const char* block2) {
295  return BlockContentsMatchInline(block1, block2);
296}
297
298inline int BlockHash::SkipNonMatchingBlocks(int block_number,
299                                            const char* block_ptr) const {
300  int probes = 0;
301  while ((block_number >= 0) &&
302         !BlockContentsMatchInline(block_ptr,
303                                   &source_data_[block_number * kBlockSize])) {
304    if (++probes > kMaxProbes) {
305      return -1;  // Avoid too much chaining
306    }
307    block_number = next_block_table_[block_number];
308  }
309  return block_number;
310}
311
312// Init() must have been called and returned true before using
313// FirstMatchingBlock or NextMatchingBlock.  No check is performed
314// for this condition; the code will crash if this condition is violated.
315inline int BlockHash::FirstMatchingBlockInline(uint32_t hash_value,
316                                               const char* block_ptr) const {
317  return SkipNonMatchingBlocks(hash_table_[GetHashTableIndex(hash_value)],
318                               block_ptr);
319}
320
321int BlockHash::FirstMatchingBlock(uint32_t hash_value,
322                                  const char* block_ptr) const {
323  return FirstMatchingBlockInline(hash_value, block_ptr);
324}
325
326int BlockHash::NextMatchingBlock(int block_number,
327                                 const char* block_ptr) const {
328  if (static_cast<size_t>(block_number) >= GetNumberOfBlocks()) {
329    VCD_DFATAL << "NextMatchingBlock called for invalid block number "
330               << block_number << VCD_ENDL;
331    return -1;
332  }
333  return SkipNonMatchingBlocks(next_block_table_[block_number], block_ptr);
334}
335
336// Keep a count of the number of matches found.  This will throttle the
337// number of iterations in FindBestMatch.  For example, if the entire
338// dictionary is made up of spaces (' ') and the search string is also
339// made up of spaces, there will be one match for each block in the
340// dictionary.
341inline bool BlockHash::TooManyMatches(int* match_counter) {
342  ++(*match_counter);
343  return (*match_counter) > kMaxMatchesToCheck;
344}
345
346// Returns the number of bytes to the left of source_match_start
347// that match the corresponding bytes to the left of target_match_start.
348// Will not examine more than max_bytes bytes, which is to say that
349// the return value will be in the range [0, max_bytes] inclusive.
350int BlockHash::MatchingBytesToLeft(const char* source_match_start,
351                                   const char* target_match_start,
352                                   int max_bytes) {
353  const char* source_ptr = source_match_start;
354  const char* target_ptr = target_match_start;
355  int bytes_found = 0;
356  while (bytes_found < max_bytes) {
357    --source_ptr;
358    --target_ptr;
359    if (*source_ptr != *target_ptr) {
360      break;
361    }
362    ++bytes_found;
363  }
364  return bytes_found;
365}
366
367// Returns the number of bytes starting at source_match_end
368// that match the corresponding bytes starting at target_match_end.
369// Will not examine more than max_bytes bytes, which is to say that
370// the return value will be in the range [0, max_bytes] inclusive.
371int BlockHash::MatchingBytesToRight(const char* source_match_end,
372                                    const char* target_match_end,
373                                    int max_bytes) {
374  const char* source_ptr = source_match_end;
375  const char* target_ptr = target_match_end;
376  int bytes_found = 0;
377  while ((bytes_found < max_bytes) && (*source_ptr == *target_ptr)) {
378    ++bytes_found;
379    ++source_ptr;
380    ++target_ptr;
381  }
382  return bytes_found;
383}
384
// No NULL checks are performed on the pointer arguments.  The caller
// must guarantee that none of the arguments is NULL, or a crash will occur.
//
// The vast majority of calls to FindBestMatch enter the loop *zero* times,
// which is to say that most candidate blocks find no matches in the dictionary.
// The important sections for optimization are therefore the code outside the
// loop and the code within the loop conditions.  Keep this to a minimum.
//
// For each dictionary block whose hash matches hash_value and whose
// contents match the block at target_candidate_start, this extends the
// match as far as possible in both directions and offers the result to
// *best_match, which keeps only the best match seen so far (in/out
// parameter).  The iteration count is bounded by TooManyMatches.
void BlockHash::FindBestMatch(uint32_t hash_value,
                              const char* target_candidate_start,
                              const char* target_start,
                              size_t target_size,
                              Match* best_match) const {
  int match_counter = 0;
  for (int block_number = FirstMatchingBlockInline(hash_value,
                                                   target_candidate_start);
       (block_number >= 0) && !TooManyMatches(&match_counter);
       block_number = NextMatchingBlock(block_number, target_candidate_start)) {
    // Offsets of the matched block within the source and target regions;
    // both may be moved left below as the match is extended.
    int source_match_offset = block_number * kBlockSize;
    const int source_match_end = source_match_offset + kBlockSize;

    int target_match_offset =
        static_cast<int>(target_candidate_start - target_start);
    const int target_match_end = target_match_offset + kBlockSize;

    size_t match_size = kBlockSize;
    {
      // Extend match start towards beginning of unencoded data.
      // Bounded by whichever region (source or target) runs out first.
      const int limit_bytes_to_left = std::min(source_match_offset,
                                               target_match_offset);
      const int matching_bytes_to_left =
          MatchingBytesToLeft(source_data_ + source_match_offset,
                              target_start + target_match_offset,
                              limit_bytes_to_left);
      source_match_offset -= matching_bytes_to_left;
      target_match_offset -= matching_bytes_to_left;
      match_size += matching_bytes_to_left;
    }
    {
      // Extend match end towards end of unencoded data.
      const size_t source_bytes_to_right = source_size_ - source_match_end;
      const size_t target_bytes_to_right = target_size - target_match_end;
      const size_t limit_bytes_to_right = std::min(source_bytes_to_right,
                                                   target_bytes_to_right);
      match_size +=
          MatchingBytesToRight(source_data_ + source_match_end,
                               target_start + target_match_end,
                               static_cast<int>(limit_bytes_to_right));
    }
    // Update in/out parameter if the best match found was better
    // than any match already stored in *best_match.  starting_offset_
    // translates the source offset into the combined dictionary+target
    // address space.
    best_match->ReplaceIfBetterMatch(match_size,
                                     source_match_offset + starting_offset_,
                                     target_match_offset);
  }
}
440
441}  // namespace open_vcdiff
442