1// Copyright 2006, 2008 Google Inc.
2// Authors: Chandra Chereddi, Lincoln Smith
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//      http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#include <config.h>
17#include "vcdiffengine.h"
18#include <stdint.h>  // uint32_t
19#include <string.h>  // memcpy
20#include "blockhash.h"
21#include "codetablewriter_interface.h"
22#include "logging.h"
23#include "rolling_hash.h"
24
25namespace open_vcdiff {
26
27VCDiffEngine::VCDiffEngine(const char* dictionary, size_t dictionary_size)
28    // If dictionary_size == 0, then dictionary could be NULL.  Guard against
29    // using a NULL value.
30    : dictionary_((dictionary_size > 0) ? new char[dictionary_size] : ""),
31      dictionary_size_(dictionary_size),
32      hashed_dictionary_(NULL) {
33  if (dictionary_size > 0) {
34    memcpy(const_cast<char*>(dictionary_), dictionary, dictionary_size);
35  }
36}
37
38VCDiffEngine::~VCDiffEngine() {
39  delete hashed_dictionary_;
40  if (dictionary_size_ > 0) {
41    delete[] dictionary_;
42  }
43}
44
45bool VCDiffEngine::Init() {
46  if (hashed_dictionary_) {
47    VCD_DFATAL << "Init() called twice for same VCDiffEngine object"
48               << VCD_ENDL;
49    return false;
50  }
51  hashed_dictionary_ = BlockHash::CreateDictionaryHash(dictionary_,
52                                                       dictionary_size());
53  if (!hashed_dictionary_) {
54    VCD_DFATAL << "Creation of dictionary hash failed" << VCD_ENDL;
55    return false;
56  }
57  RollingHash<BlockHash::kBlockSize>::Init();
58  return true;
59}
60
61// This helper function tries to find an appropriate match within
62// hashed_dictionary_ for the block starting at the current target position.
63// If target_hash is not NULL, this function will also look for a match
64// within the previously encoded target data.
65//
66// If a match is found, this function will generate an ADD instruction
67// for all unencoded data that precedes the match,
68// and a COPY instruction for the match itself; then it returns
69// the number of bytes processed by both instructions,
70// which is guaranteed to be > 0.
71// If no appropriate match is found, the function returns 0.
72//
73// The first four parameters are input parameters which are passed
74// directly to BlockHash::FindBestMatch; please see that function
75// for a description of their allowable values.
76template<bool look_for_target_matches>
77inline size_t VCDiffEngine::EncodeCopyForBestMatch(
78    uint32_t hash_value,
79    const char* target_candidate_start,
80    const char* unencoded_target_start,
81    size_t unencoded_target_size,
82    const BlockHash* target_hash,
83    CodeTableWriterInterface* coder) const {
84  // When FindBestMatch() comes up with a match for a candidate block,
85  // it will populate best_match with the size, source offset,
86  // and target offset of the match.
87  BlockHash::Match best_match;
88
89  // First look for a match in the dictionary.
90  hashed_dictionary_->FindBestMatch(hash_value,
91                                    target_candidate_start,
92                                    unencoded_target_start,
93                                    unencoded_target_size,
94                                    &best_match);
95  // If target matching is enabled, then see if there is a better match
96  // within the target data that has been encoded so far.
97  if (look_for_target_matches) {
98    target_hash->FindBestMatch(hash_value,
99                               target_candidate_start,
100                               unencoded_target_start,
101                               unencoded_target_size,
102                               &best_match);
103  }
104  if (!ShouldGenerateCopyInstructionForMatchOfSize(best_match.size())) {
105    return 0;
106  }
107  if (best_match.target_offset() > 0) {
108    // Create an ADD instruction to encode all target bytes
109    // from the end of the last COPY match, if any, up to
110    // the beginning of this COPY match.
111    coder->Add(unencoded_target_start, best_match.target_offset());
112  }
113  coder->Copy(best_match.source_offset(), best_match.size());
114  return best_match.target_offset()  // ADD size
115       + best_match.size();          // + COPY size
116}
117
118// Once the encoder loop has finished checking for matches in the target data,
119// this function creates an ADD instruction to encode all target bytes
120// from the end of the last COPY match, if any, through the end of
121// the target data.  In the worst case, if no matches were found at all,
122// this function will create one big ADD instruction
123// for the entire buffer of target data.
124inline void VCDiffEngine::AddUnmatchedRemainder(
125    const char* unencoded_target_start,
126    size_t unencoded_target_size,
127    CodeTableWriterInterface* coder) const {
128  if (unencoded_target_size > 0) {
129    coder->Add(unencoded_target_start, unencoded_target_size);
130  }
131}
132
133// This helper function tells the coder to finish the encoding and write
134// the results into the output string "diff".
135inline void VCDiffEngine::FinishEncoding(
136    size_t target_size,
137    OutputStringInterface* diff,
138    CodeTableWriterInterface* coder) const {
139  if (target_size != static_cast<size_t>(coder->target_length())) {
140    VCD_DFATAL << "Internal error in VCDiffEngine::Encode: "
141                  "original target size (" << target_size
142               << ") does not match number of bytes processed ("
143               << coder->target_length() << ")" << VCD_ENDL;
144  }
145  coder->Output(diff);
146}
147
148template<bool look_for_target_matches>
149void VCDiffEngine::EncodeInternal(const char* target_data,
150                                  size_t target_size,
151                                  OutputStringInterface* diff,
152                                  CodeTableWriterInterface* coder) const {
153  if (!hashed_dictionary_) {
154    VCD_DFATAL << "Internal error: VCDiffEngine::Encode() "
155                  "called before VCDiffEngine::Init()" << VCD_ENDL;
156    return;
157  }
158  if (target_size == 0) {
159    return;  // Do nothing for empty target
160  }
161  // Special case for really small input
162  if (target_size < static_cast<size_t>(BlockHash::kBlockSize)) {
163    AddUnmatchedRemainder(target_data, target_size, coder);
164    FinishEncoding(target_size, diff, coder);
165    return;
166  }
167  RollingHash<BlockHash::kBlockSize> hasher;
168  BlockHash* target_hash = NULL;
169  if (look_for_target_matches) {
170    // Check matches against previously encoded target data
171    // in this same target window, as well as against the dictionary
172    target_hash = BlockHash::CreateTargetHash(target_data,
173                                              target_size,
174                                              dictionary_size());
175    if (!target_hash) {
176      VCD_DFATAL << "Instantiation of target hash failed" << VCD_ENDL;
177      return;
178    }
179  }
180  const char* const target_end = target_data + target_size;
181  const char* const start_of_last_block = target_end - BlockHash::kBlockSize;
182  // Offset of next bytes in string to ADD if NOT copied (i.e., not found in
183  // dictionary)
184  const char* next_encode = target_data;
185  // candidate_pos points to the start of the kBlockSize-byte block that may
186  // begin a match with the dictionary or previously encoded target data.
187  const char* candidate_pos = target_data;
188  uint32_t hash_value = hasher.Hash(candidate_pos);
189  while (1) {
190    const size_t bytes_encoded =
191        EncodeCopyForBestMatch<look_for_target_matches>(
192            hash_value,
193            candidate_pos,
194            next_encode,
195            (target_end - next_encode),
196            target_hash,
197            coder);
198    if (bytes_encoded > 0) {
199      next_encode += bytes_encoded;  // Advance past COPYed data
200      candidate_pos = next_encode;
201      if (candidate_pos > start_of_last_block) {
202        break;  // Reached end of target data
203      }
204      // candidate_pos has jumped ahead by bytes_encoded bytes, so UpdateHash
205      // can't be used to calculate the hash value at its new position.
206      hash_value = hasher.Hash(candidate_pos);
207      if (look_for_target_matches) {
208        // Update the target hash for the ADDed and COPYed data
209        target_hash->AddAllBlocksThroughIndex(
210            static_cast<int>(next_encode - target_data));
211      }
212    } else {
213      // No match, or match is too small to be worth a COPY instruction.
214      // Move to the next position in the target data.
215      if ((candidate_pos + 1) > start_of_last_block) {
216        break;  // Reached end of target data
217      }
218      if (look_for_target_matches) {
219        target_hash->AddOneIndexHash(
220            static_cast<int>(candidate_pos - target_data),
221            hash_value);
222      }
223      hash_value = hasher.UpdateHash(hash_value,
224                                     candidate_pos[0],
225                                     candidate_pos[BlockHash::kBlockSize]);
226      ++candidate_pos;
227    }
228  }
229  AddUnmatchedRemainder(next_encode, target_end - next_encode, coder);
230  FinishEncoding(target_size, diff, coder);
231  delete target_hash;
232}
233
234void VCDiffEngine::Encode(const char* target_data,
235                          size_t target_size,
236                          bool look_for_target_matches,
237                          OutputStringInterface* diff,
238                          CodeTableWriterInterface* coder) const {
239  if (look_for_target_matches) {
240    EncodeInternal<true>(target_data, target_size, diff, coder);
241  } else {
242    EncodeInternal<false>(target_data, target_size, diff, coder);
243  }
244}
245
246}  // namespace open_vcdiff
247