1311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// Copyright 2006 Google Inc.
2311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// Authors: Sanjay Ghemawat, Jeff Dean, Chandra Chereddi, Lincoln Smith
3311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff//
4311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// Licensed under the Apache License, Version 2.0 (the "License");
5311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// you may not use this file except in compliance with the License.
6311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// You may obtain a copy of the License at
7311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff//
8311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff//      http://www.apache.org/licenses/LICENSE-2.0
9311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff//
10311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// Unless required by applicable law or agreed to in writing, software
11311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// distributed under the License is distributed on an "AS IS" BASIS,
12311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// See the License for the specific language governing permissions and
14311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// limitations under the License.
15311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff//
16311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// Implementation of the Bentley/McIlroy algorithm for finding differences.
17311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// Bentley, McIlroy.  DCC 1999.  Data Compression Using Long Common Strings.
18311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// http://citeseer.ist.psu.edu/555557.html
19311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
20311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff#ifndef OPEN_VCDIFF_BLOCKHASH_H_
21311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff#define OPEN_VCDIFF_BLOCKHASH_H_
22311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
23311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff#include <config.h>
2428db8079f707ebdf43ce62cdfd96eb39c8f889e0openvcdiff#include <stddef.h>  // size_t
25311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff#include <stdint.h>  // uint32_t
26311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff#include <vector>
27311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
28311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiffnamespace open_vcdiff {
29311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
30311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// A generic hash table which will be used to keep track of byte runs
31311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// of size kBlockSize in both the incrementally processed target data
32311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// and the preprocessed source dictionary.
33311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff//
34311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// A custom hash table implementation is used instead of the standard
35311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// hash_map template because we know that there will be exactly one
36311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// entry in the BlockHash corresponding to each kBlockSize bytes
37311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// in the source data, which makes certain optimizations possible:
38311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// * The memory for the hash table and for all hash entries can be allocated
39311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff//   in one step rather than incrementally for each insert operation.
40311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff// * A single integer can be used to represent both
41311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff//   the index of the next hash entry in the chain
42311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff//   and the position of the entry within the source data
43311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff//   (== kBlockSize * block_number).  This greatly reduces the size
44311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff//   of a hash entry.
45311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff//
46311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiffclass BlockHash {
47311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff public:
48311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Block size as per Bentley/McIlroy; must be a power of two.
49311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
50311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Using (for example) kBlockSize = 4 guarantees that no match smaller
51311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // than size 4 will be identified, that some matches having sizes
52311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // 4, 5, or 6 may be identified, and that all matches
53311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // having size 7 or greater will be identified (because any string of
54311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // 7 bytes must contain a complete aligned block of 4 bytes.)
55311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
56311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Increasing kBlockSize by a factor of two will halve the amount of
57311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // memory needed for the next block table, and will halve the setup time
58311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // for a new BlockHash.  However, it also doubles the minimum
59311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // match length that is guaranteed to be found in FindBestMatch(),
60311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // so that function will be less effective in finding matches.
61311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
62311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Computational effort in FindBestMatch (which is the inner loop of
63311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the encoding algorithm) will be proportional to the number of
64311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // matches found, and a low value of kBlockSize will waste time
65311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // tracking down small matches.  On the other hand, if this value
66311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // is set too high, no matches will be found at all.
67311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
68311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // It is suggested that different values of kBlockSize be tried against
69311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // a representative data set to find the best tradeoff between
70311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // memory/CPU and the effectiveness of FindBestMatch().
71311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
72311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // If you change kBlockSize to a smaller value, please increase
73311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // kMaxMatchesToCheck accordingly.
74d18457863096b3685e56f5a8919959f6afbdb121openvcdiff  static const int kBlockSize = 16;
75311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
76311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // This class is used to store the best match found by FindBestMatch()
77311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // and return it to the caller.
78311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  class Match {
79311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff   public:
80311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    Match() : size_(0), source_offset_(-1), target_offset_(-1) { }
81311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
82311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    void ReplaceIfBetterMatch(size_t candidate_size,
83311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff                              int candidate_source_offset,
84311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff                              int candidate_target_offset) {
85311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff      if (candidate_size > size_) {
86311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff        size_ = candidate_size;
87311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff        source_offset_ = candidate_source_offset;
88311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff        target_offset_ = candidate_target_offset;
89311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff      }
90311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    }
91311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
92311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    size_t size() const { return size_; }
93311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    int source_offset() const { return source_offset_; }
94311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    int target_offset() const { return target_offset_; }
95311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
96311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff   private:
97311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff     // The size of the best (longest) match passed to ReplaceIfBetterMatch().
98311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    size_t size_;
99311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
100311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    // The source offset of the match, including the starting_offset_
101311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    // of the BlockHash for which the match was found.
102311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    int source_offset_;
103311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
104311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    // The target offset of the match.  An offset of 0 corresponds to the
105311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    // data at target_start, which is an argument of FindBestMatch().
106311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    int target_offset_;
107311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
108311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    // Making these private avoids implicit copy constructor
109311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    // & assignment operator
110311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    Match(const Match&);  // NOLINT
111311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    void operator=(const Match&);
112311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  };
113311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
114311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // A BlockHash is created using a buffer of source data.  The hash table
115311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // will contain one entry for each kBlockSize-byte block in the
116311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // source data.
117311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
118311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // See the comments for starting_offset_, below, for a description of
119311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the starting_offset argument.  For a hash of source (dictionary) data,
120311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // starting_offset_ will be zero; for a hash of previously encoded
121311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // target data, starting_offset_ will be equal to the dictionary size.
122311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
123311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  BlockHash(const char* source_data, size_t source_size, int starting_offset);
124311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
125311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  ~BlockHash();
126311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
127311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Initializes the object before use.
128311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // This method must be called after constructing a BlockHash object,
129311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // and before any other method may be called.  This is because
130311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Init() dynamically allocates hash_table_ and next_block_table_.
131311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Returns true if initialization succeeded, or false if an error occurred,
132311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // in which case no other method except the destructor may then be used
133311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // on the object.
134311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
135311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // If populate_hash_table is true, then AddAllBlocks() will be called
136311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // to populate the hash table.  If populate_hash_table is false, then
137311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // classes that inherit from BlockHash are expected to call AddBlock()
138311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // to incrementally populate individual blocks of data.
139311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
140311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  bool Init(bool populate_hash_table);
141311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
142311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // In the context of the open-vcdiff encoder, BlockHash is used for two
143311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // purposes: to hash the source (dictionary) data, and to hash
144311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the previously encoded target data.  The main differences between
145311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // a dictionary BlockHash and a target BlockHash are as follows:
146311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
147311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //   1. The best_match->source_offset() returned from FindBestMatch()
148311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //      for a target BlockHash is computed in the following manner:
149311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //      the starting offset of the first byte in the target data
150311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //      is equal to the dictionary size.  FindBestMatch() will add
151311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //      starting_offset_ to any best_match->source_offset() value it returns,
152311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //      in order to produce the correct offset value for a target BlockHash.
153311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //   2. For a dictionary BlockHash, the entire data set is hashed at once
154311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //      when Init() is called with the parameter populate_hash_table = true.
155311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //      For a target BlockHash, because the previously encoded target data
156311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //      includes only the data seen up to the current encoding position,
157311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //      the data blocks are hashed incrementally as the encoding position
158311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //      advances, using AddOneIndexHash() and AddAllBlocksThroughIndex().
159311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
160311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // The following two factory functions can be used to create BlockHash
161311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // objects for each of these two purposes.  Each factory function calls
162311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the object constructor and also calls Init().  If an error occurs,
163311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // NULL is returned; otherwise a valid BlockHash object is returned.
164311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Since a dictionary BlockHash is not expected to be modified after
165311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // initialization, a const object is returned.
166311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // The caller is responsible for deleting the returned object
167311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // (using the C++ delete operator) once it is no longer needed.
168311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  static const BlockHash* CreateDictionaryHash(const char* dictionary_data,
169311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff                                               size_t dictionary_size);
170311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  static BlockHash* CreateTargetHash(const char* target_data,
171311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff                                     size_t target_size,
172311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff                                     size_t dictionary_size);
173311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
174311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // This function will be called to add blocks incrementally to the target hash
175311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // as the encoding position advances through the target data.  It will be
176311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // called for every kBlockSize-byte block in the target data, regardless
177311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // of whether the block is aligned evenly on a block boundary.  The
178311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // BlockHash will only store hash entries for the evenly-aligned blocks.
179311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
180311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  void AddOneIndexHash(int index, uint32_t hash_value) {
181311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    if (index == NextIndexToAdd()) {
182311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff      AddBlock(hash_value);
183311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    }
184311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  }
185311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
186311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Calls AddBlock() for each kBlockSize-byte block in the range
187311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // (last_block_added_ * kBlockSize, end_index), exclusive of the endpoints.
188311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // If end_index <= the last index added (last_block_added_ * kBlockSize),
189311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // this function does nothing.
190311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
191311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // A partial block beginning anywhere up to (end_index - 1) is also added,
192311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // unless it extends outside the end of the source data.  Like AddAllBlocks(),
193311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // this function computes the hash value for each of the blocks in question
194311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // from scratch, so it is not a good option if the hash values have already
195311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // been computed for some other purpose.
196311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
197311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Example: assume kBlockSize = 4, last_block_added_ = 1, and there are
198311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // 14 bytes of source data.
199311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // If AddAllBlocksThroughIndex(9) is invoked, then it will call AddBlock()
200311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // only for block number 2 (at index 8).
201311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // If, after that, AddAllBlocksThroughIndex(14) is invoked, it will not call
202311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // AddBlock() at all, because block 3 (beginning at index 12) would
203311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // fall outside the range of source data.
204311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
205311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // VCDiffEngine::Encode (in vcdiffengine.cc) uses this function to
206311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // add a whole range of data to a target hash when a COPY instruction
207311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // is generated.
208311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  void AddAllBlocksThroughIndex(int end_index);
209311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
210311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // FindBestMatch takes a position within the unencoded target data
211311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // (target_candidate_start) and the hash value of the kBlockSize bytes
212311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // beginning at that position (hash_value).  It attempts to find a matching
213311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // set of bytes within the source (== dictionary) data, expanding
214311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the match both below and above the target block.  It cannot expand
215311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the match outside the bounds of the source data, or below
216311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // target_start within the target data, or past
217311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the end limit of (target_start + target_length).
218311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
219311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // target_candidate_start is the start of the candidate block within the
220311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // target data for which a match will be sought, while
221311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // target_start (which is <= target_candidate_start)
222311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // is the start of the target data that has yet to be encoded.
223311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
224311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // If a match is found whose size is greater than the size
225311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // of best_match, this function populates *best_match with the
226311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // size, source_offset, and target_offset of the match found.
227311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // best_match->source_offset() will contain the index of the start of the
228311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // matching source data, plus starting_offset_
229311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // (see description of starting_offset_ for details);
230311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // best_match->target_offset() will contain the offset of the match
231311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // beginning with target_start = offset 0, such that
232311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //     0 <= best_match->target_offset()
233311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //              <= (target_candidate_start - target_start);
234311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // and best_match->size() will contain the size of the match.
235311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // If no such match is found, this function leaves *best_match unmodified.
236311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
237311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // On calling FindBestMatch(), best_match must
238311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // point to a valid Match object, and cannot be NULL.
239311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // The same Match object can be passed
240311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // when calling FindBestMatch() on a different BlockHash object
241311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // for the same candidate data block, in order to find
242311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the best match possible across both objects.  For example:
243311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
244311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //     open_vcdiff::BlockHash::Match best_match;
245311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //     uint32_t hash_value =
246311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //         RollingHash<BlockHash::kBlockSize>::Hash(target_candidate_start);
247311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //     bh1.FindBestMatch(hash_value,
248311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //                       target_candidate_start,
249311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //                       target_start,
250311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //                       target_length,
251311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //                       &best_match);
252311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //     bh2.FindBestMatch(hash_value,
253311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //                       target_candidate_start,
254311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //                       target_start,
255311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //                       target_length,
256311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //                       &best_match);
257311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //     if (best_size >= 0) {
258311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //       // a match was found; its size, source offset, and target offset
259311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //       // can be found in best_match
260311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //     }
261311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
262311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // hash_value is passed as a separate parameter from target_candidate_start,
263311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // (rather than calculated within FindBestMatch) in order to take
264311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // advantage of the rolling hash, which quickly calculates the hash value
265311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // of the block starting at target_candidate_start based on
266311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the known hash value of the block starting at (target_candidate_start - 1).
267311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // See vcdiffengine.cc for more details.
268311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
269311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Example:
270311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //    kBlockSize: 4
271311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //    target text: "ANDREW LLOYD WEBBER"
272311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //                 1^    5^2^         3^
273311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //    dictionary: "INSURANCE : LLOYDS OF LONDON"
274311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //                           4^
275311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //    hashed dictionary blocks:
276311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //        "INSU", "RANC", "E : ", "LLOY", "DS O", "F LON"
277311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
278311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //    1: target_start (beginning of unencoded data)
279311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //    2: target_candidate_start (for the block "LLOY")
280311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //    3: target_length (points one byte beyond the last byte of data.)
281311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //    4: best_match->source_offset() (after calling FindBestMatch)
282311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //    5: best_match->target_offset() (after calling FindBestMatch)
283311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
284311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //    Under these conditions, FindBestMatch will find a matching
285311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //    hashed dictionary block for "LLOY", and will extend the beginning of
286311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //    this match backwards by one byte, and the end of the match forwards
287311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //    by one byte, finding that the best match is " LLOYD"
288311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //    with best_match->source_offset() = 10
289311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //                                  (offset of " LLOYD" in the source string),
290311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //         best_match->target_offset() = 6
291311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //                                  (offset of " LLOYD" in the target string),
292311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //     and best_match->size() = 6.
293311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
294311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  void FindBestMatch(uint32_t hash_value,
295311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff                     const char* target_candidate_start,
296311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff                     const char* target_start,
297311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff                     size_t target_size,
298311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff                     Match* best_match) const;
299311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
300311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff protected:
301311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // FindBestMatch() will not process more than this number
302311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // of matching hash entries.
303311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
304311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // It is necessary to have a limit on the maximum number of matches
305311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // that will be checked in order to avoid the worst-case performance
306311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // possible if, for example, all the blocks in the dictionary have
307311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the same hash value.  See the unit test SearchStringFindsTooManyMatches
308311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // for an example of such a case.  The encoder uses a loop in
309311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // VCDiffEngine::Encode over each target byte, containing a loop in
310311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // BlockHash::FindBestMatch over the number of matches (up to a maximum
311311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // of the number of source blocks), containing two loops that extend
312311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the match forwards and backwards up to the number of source bytes.
313311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Total complexity in the worst case is
314311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //     O([target size] * source_size_ * source_size_)
315311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Placing a limit on the possible number of matches checked changes this to
316311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //     O([target size] * source_size_ * kMaxMatchesToCheck)
317311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
318311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // In empirical testing on real HTML text, using a block size of 4,
319311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the number of true matches per call to FindBestMatch() did not exceed 78;
320311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // with a block size of 32, the number of matches did not exceed 3.
321311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
322311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // The expected number of true matches scales super-linearly
323311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // with the inverse of kBlockSize, but here a linear scale is used
324311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // for block sizes smaller than 32.
325d18457863096b3685e56f5a8919959f6afbdb121openvcdiff  static const int kMaxMatchesToCheck = (kBlockSize >= 32) ? 32 :
326d18457863096b3685e56f5a8919959f6afbdb121openvcdiff                                            (32 * (32 / kBlockSize));
327311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
328311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Do not skip more than this number of non-matching hash collisions
329311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // to find the next matching entry in the hash chain.
330311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  static const int kMaxProbes = 16;
331311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
332311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Internal routine which calculates a hash table size based on kBlockSize and
333311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the dictionary_size.  Will return a power of two if successful, or 0 if an
334311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // internal error occurs.  Some calculations (such as GetHashTableIndex())
335311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // depend on the table size being a power of two.
33683bbde0df33922d8dc6fa737cfb306d9caae13b1openvcdiff  static size_t CalcTableSize(const size_t dictionary_size);
337311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
33883bbde0df33922d8dc6fa737cfb306d9caae13b1openvcdiff  size_t GetNumberOfBlocks() const {
339311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    return source_size_ / kBlockSize;
340311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  }
341311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
342311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Use the lowest-order bits of the hash value
343311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // as the index into the hash table.
344311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  uint32_t GetHashTableIndex(uint32_t hash_value) const {
345311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    return hash_value & hash_table_mask_;
346311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  }
347311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
348311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // The index within source_data_ of the next block
349311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // for which AddBlock() should be called.
350311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  int NextIndexToAdd() const {
351311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff    return (last_block_added_ + 1) * kBlockSize;
352311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  }
353311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
354311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  static inline bool TooManyMatches(int* match_counter);
355311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
35683bbde0df33922d8dc6fa737cfb306d9caae13b1openvcdiff  const char* source_data() { return source_data_; }
35783bbde0df33922d8dc6fa737cfb306d9caae13b1openvcdiff  size_t source_size() { return source_size_; }
358311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
359311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Adds an entry to the hash table for one block of source data of length
360311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // kBlockSize, starting at source_data_[block_number * kBlockSize],
361311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // where block_number is always (last_block_added_ + 1).  That is,
362311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // AddBlock() must be called once for each block in source_data_
363311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // in increasing order.
364311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  void AddBlock(uint32_t hash_value);
365311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
366311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Calls AddBlock() for each complete kBlockSize-byte block between
367311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // source_data_ and (source_data_ + source_size_).  It is equivalent
368311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // to calling AddAllBlocksThroughIndex(source_data + source_size).
369311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // This function is called when Init(true) is invoked.
370311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  void AddAllBlocks();
371311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
372311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Returns true if the contents of the kBlockSize-byte block
373311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // beginning at block1 are identical to the contents of
374311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the block beginning at block2; false otherwise.
375311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  static bool BlockContentsMatch(const char* block1, const char* block2);
376311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
377311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Compares each machine word of the two (possibly unaligned) blocks, rather
378311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // than each byte, thus reducing the number of test-and-branch instructions
379311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // executed.  Returns a boolean (do the blocks match?) rather than
380311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the signed byte difference returned by memcmp.
381311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
382311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // BlockContentsMatch will use either this function or memcmp to do its work,
383311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // depending on which is faster for a particular architecture.
384311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
385311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // For gcc on x86-based architectures, this function has been shown to run
386311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // about twice as fast as the library function memcmp(), and between five and
387311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // nine times faster than the assembly instructions (repz and cmpsb) that gcc
388311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // uses by default for builtin memcmp.  On other architectures, or using
389311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // other compilers, this function has not shown to be faster than memcmp.
390311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  static bool BlockCompareWords(const char* block1, const char* block2);
391311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
392311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Finds the first block number within the hashed data
393311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // that represents a match for the given hash value.
394311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Returns -1 if no match was found.
395311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
396311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Init() must have been called and returned true before using
397311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // FirstMatchingBlock or NextMatchingBlock.  No check is performed
398311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // for this condition; the code will crash if this condition is violated.
399311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  //
400311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // The hash table is initially populated with -1 (not found) values,
401311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // so if this function is called before the hash table has been populated
402311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // using AddAllBlocks() or AddBlock(), it will simply return -1
403311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // for any value of hash_value.
404311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  int FirstMatchingBlock(uint32_t hash_value, const char* block_ptr) const;
405311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
406311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Given a block number returned by FirstMatchingBlock()
407311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // or by a previous call to NextMatchingBlock(), returns
408311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the next block number that matches the same hash value.
409311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Returns -1 if no match was found.
410311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  int NextMatchingBlock(int block_number, const char* block_ptr) const;
411311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
412311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Inline version of FirstMatchingBlock.  This saves the cost of a function
413311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // call when this routine is called from within the module.  The external
414311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // (non-inlined) version is called only by unit tests.
415311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  inline int FirstMatchingBlockInline(uint32_t hash_value,
416311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff                                      const char* block_ptr) const;
417311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
418311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Walk through the hash entry chain, skipping over any false matches
419311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // (for which the lowest bits of the fingerprints match,
420311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // but the actual block data does not.)  Returns the block number of
421311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the first true match found, or -1 if no true match was found.
422311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // If block_number is a matching block, the function will return block_number
423311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // without skipping to the next block.
424311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  int SkipNonMatchingBlocks(int block_number, const char* block_ptr) const;
425311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
426311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Returns the number of bytes to the left of source_match_start
427311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // that match the corresponding bytes to the left of target_match_start.
428311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Will not examine more than max_bytes bytes, which is to say that
429311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the return value will be in the range [0, max_bytes] inclusive.
430311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  static int MatchingBytesToLeft(const char* source_match_start,
431311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff                                 const char* target_match_start,
432311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff                                 int max_bytes);
433311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
434311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Returns the number of bytes starting at source_match_end
435311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // that match the corresponding bytes starting at target_match_end.
436311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Will not examine more than max_bytes bytes, which is to say that
437311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the return value will be in the range [0, max_bytes] inclusive.
438311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  static int MatchingBytesToRight(const char* source_match_end,
439311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff                                  const char* target_match_end,
440311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff                                  int max_bytes);
441311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
442311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // The protected functions BlockContentsMatch, FirstMatchingBlock,
443311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // NextMatchingBlock, MatchingBytesToLeft, and MatchingBytesToRight
444311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // should be made accessible to unit tests.
445311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  friend class BlockHashTest;
446311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
447311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff private:
448311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  const char* const  source_data_;
449311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  const size_t       source_size_;
450311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
451311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // The size of this array is determined using CalcTableSize().  It has at
452311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // least one element for each kBlockSize-byte block in the source data.
453311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // GetHashTableIndex() returns an index into this table for a given hash
454311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // value.  The value of each element of hash_table_ is the lowest block
455311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // number in the source data whose hash value would return the same value from
456311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // GetHashTableIndex(), or -1 if there is no matching block.  This value can
457311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // then be used as an index into next_block_table_ to retrieve the entire set
458311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // of matching block numbers.
459311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  std::vector<int> hash_table_;
460311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
461311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // An array containing one element for each source block.  Each element is
462311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // either -1 (== not found) or the index of the next block whose hash value
463311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // would produce a matching result from GetHashTableIndex().
464311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  std::vector<int> next_block_table_;
465311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
466311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // This vector has the same size as next_block_table_.  For every block number
467311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // B that is referenced in hash_table_, last_block_table_[B] will contain
468311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the maximum block number that has the same GetHashTableIndex() value
469311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // as block B.  This number may be B itself.  For a block number B' that
470311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // is not referenced in hash_table_, the value of last_block_table_[B'] is -1.
471311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // This table is used only while populating the hash table, not while looking
472311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // up hash values in the table.  Keeping track of the last block number in the
473311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // chain allows us to construct the block chains as FIFO rather than LIFO
474311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // lists, so that the match with the lowest index is returned first.  This
475311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // should result in a more compact encoding because the VCDIFF format favors
476311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // smaller index values and repeated index values.
477311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  std::vector<int> last_block_table_;
478311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
479311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Performing a bitwise AND with hash_table_mask_ will produce a value ranging
480311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // from 0 to the number of elements in hash_table_.
481311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  uint32_t hash_table_mask_;
482311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
483311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // The offset of the first byte of source data (the data at source_data_[0]).
484311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // For the purpose of computing offsets, the source data and target data
485311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // are considered to be concatenated -- not literally in a single memory
486311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // buffer, but conceptually as described in the RFC.
487311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // The first byte of the previously encoded target data
488311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // has an offset that is equal to dictionary_size, i.e., just after
489311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // the last byte of source data.
490311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // For a hash of source (dictionary) data, starting_offset_ will be zero;
491311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // for a hash of previously encoded target data, starting_offset_ will be
492311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // equal to the dictionary size.
493311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  const int starting_offset_;
494311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
495311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // The last index added by AddBlock().  This determines the block number
496311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // for successive calls to AddBlock(), and is also
497311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // used to determine the starting block for AddAllBlocksThroughIndex().
498311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  int last_block_added_;
499311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
500311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  // Making these private avoids implicit copy constructor & assignment operator
501311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  BlockHash(const BlockHash&);  // NOLINT
502311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff  void operator=(const BlockHash&);
503311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff};
504311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
505311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff}  // namespace open_vcdiff
506311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff
507311c71486f5f6074e5ba62a7f4c5397c8700b868openvcdiff#endif  // OPEN_VCDIFF_BLOCKHASH_H_
508