1// Copyright 2006, 2008 Google Inc.
2// Authors: Chandra Chereddi, Lincoln Smith
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//      http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#include <config.h>
17#include "vcdiffengine.h"
18#include <stdint.h>  // uint32_t
19#include <string.h>  // memcpy
20#include "blockhash.h"
21#include "codetablewriter_interface.h"
22#include "logging.h"
23#include "rolling_hash.h"
24
25namespace open_vcdiff {
26
27VCDiffEngine::VCDiffEngine(const char* dictionary, size_t dictionary_size)
28    // If dictionary_size == 0, then dictionary could be NULL.  Guard against
29    // using a NULL value.
30    : dictionary_((dictionary_size > 0) ? new char[dictionary_size] : ""),
31      dictionary_size_(dictionary_size),
32      hashed_dictionary_(NULL) {
33  if (dictionary_size > 0) {
34    memcpy(const_cast<char*>(dictionary_), dictionary, dictionary_size);
35  }
36}
37
38VCDiffEngine::~VCDiffEngine() {
39  delete hashed_dictionary_;
40  if (dictionary_size_ > 0) {
41    delete[] dictionary_;
42  }
43}
44
45bool VCDiffEngine::Init() {
46  if (hashed_dictionary_) {
47    LOG(DFATAL) << "Init() called twice for same VCDiffEngine object"
48                << LOG_ENDL;
49    return false;
50  }
51  hashed_dictionary_ = BlockHash::CreateDictionaryHash(dictionary_,
52                                                       dictionary_size());
53  if (!hashed_dictionary_) {
54    LOG(DFATAL) << "Creation of dictionary hash failed" << LOG_ENDL;
55    return false;
56  }
57  if (!RollingHash<BlockHash::kBlockSize>::Init()) {
58    LOG(DFATAL) << "RollingHash initialization failed" << LOG_ENDL;
59    return false;
60  }
61  return true;
62}
63
64// This helper function tries to find an appropriate match within
65// hashed_dictionary_ for the block starting at the current target position.
66// If target_hash is not NULL, this function will also look for a match
67// within the previously encoded target data.
68//
69// If a match is found, this function will generate an ADD instruction
70// for all unencoded data that precedes the match,
71// and a COPY instruction for the match itself; then it returns
72// the number of bytes processed by both instructions,
73// which is guaranteed to be > 0.
74// If no appropriate match is found, the function returns 0.
75//
76// The first four parameters are input parameters which are passed
77// directly to BlockHash::FindBestMatch; please see that function
78// for a description of their allowable values.
79template<bool look_for_target_matches>
80inline size_t VCDiffEngine::EncodeCopyForBestMatch(
81    uint32_t hash_value,
82    const char* target_candidate_start,
83    const char* unencoded_target_start,
84    size_t unencoded_target_size,
85    const BlockHash* target_hash,
86    CodeTableWriterInterface* coder) const {
87  // When FindBestMatch() comes up with a match for a candidate block,
88  // it will populate best_match with the size, source offset,
89  // and target offset of the match.
90  BlockHash::Match best_match;
91
92  // First look for a match in the dictionary.
93  hashed_dictionary_->FindBestMatch(hash_value,
94                                    target_candidate_start,
95                                    unencoded_target_start,
96                                    unencoded_target_size,
97                                    &best_match);
98  // If target matching is enabled, then see if there is a better match
99  // within the target data that has been encoded so far.
100  if (look_for_target_matches) {
101    target_hash->FindBestMatch(hash_value,
102                               target_candidate_start,
103                               unencoded_target_start,
104                               unencoded_target_size,
105                               &best_match);
106  }
107  if (!ShouldGenerateCopyInstructionForMatchOfSize(best_match.size())) {
108    return 0;
109  }
110  if (best_match.target_offset() > 0) {
111    // Create an ADD instruction to encode all target bytes
112    // from the end of the last COPY match, if any, up to
113    // the beginning of this COPY match.
114    coder->Add(unencoded_target_start, best_match.target_offset());
115  }
116  coder->Copy(best_match.source_offset(), best_match.size());
117  return best_match.target_offset()  // ADD size
118       + best_match.size();          // + COPY size
119}
120
121// Once the encoder loop has finished checking for matches in the target data,
122// this function creates an ADD instruction to encode all target bytes
123// from the end of the last COPY match, if any, through the end of
124// the target data.  In the worst case, if no matches were found at all,
125// this function will create one big ADD instruction
126// for the entire buffer of target data.
127inline void VCDiffEngine::AddUnmatchedRemainder(
128    const char* unencoded_target_start,
129    size_t unencoded_target_size,
130    CodeTableWriterInterface* coder) const {
131  if (unencoded_target_size > 0) {
132    coder->Add(unencoded_target_start, unencoded_target_size);
133  }
134}
135
136// This helper function tells the coder to finish the encoding and write
137// the results into the output string "diff".
138inline void VCDiffEngine::FinishEncoding(
139    size_t target_size,
140    OutputStringInterface* diff,
141    CodeTableWriterInterface* coder) const {
142  if (target_size != static_cast<size_t>(coder->target_length())) {
143    LOG(DFATAL) << "Internal error in VCDiffEngine::Encode: "
144                   "original target size (" << target_size
145                << ") does not match number of bytes processed ("
146                << coder->target_length() << ")" << LOG_ENDL;
147  }
148  coder->Output(diff);
149}
150
151template<bool look_for_target_matches>
152void VCDiffEngine::EncodeInternal(const char* target_data,
153                                  size_t target_size,
154                                  OutputStringInterface* diff,
155                                  CodeTableWriterInterface* coder) const {
156  if (!hashed_dictionary_) {
157    LOG(DFATAL) << "Internal error: VCDiffEngine::Encode() "
158                   "called before VCDiffEngine::Init()" << LOG_ENDL;
159    return;
160  }
161  if (target_size == 0) {
162    return;  // Do nothing for empty target
163  }
164  // Special case for really small input
165  if (target_size < static_cast<size_t>(BlockHash::kBlockSize)) {
166    AddUnmatchedRemainder(target_data, target_size, coder);
167    FinishEncoding(target_size, diff, coder);
168    return;
169  }
170  RollingHash<BlockHash::kBlockSize> hasher;
171  BlockHash* target_hash = NULL;
172  if (look_for_target_matches) {
173    // Check matches against previously encoded target data
174    // in this same target window, as well as against the dictionary
175    target_hash = BlockHash::CreateTargetHash(target_data,
176                                              target_size,
177                                              dictionary_size());
178    if (!target_hash) {
179      LOG(DFATAL) << "Instantiation of target hash failed" << LOG_ENDL;
180      return;
181    }
182  }
183  const char* const target_end = target_data + target_size;
184  const char* const start_of_last_block = target_end - BlockHash::kBlockSize;
185  // Offset of next bytes in string to ADD if NOT copied (i.e., not found in
186  // dictionary)
187  const char* next_encode = target_data;
188  // candidate_pos points to the start of the kBlockSize-byte block that may
189  // begin a match with the dictionary or previously encoded target data.
190  const char* candidate_pos = target_data;
191  uint32_t hash_value = hasher.Hash(candidate_pos);
192  while (1) {
193    const size_t bytes_encoded =
194        EncodeCopyForBestMatch<look_for_target_matches>(
195            hash_value,
196            candidate_pos,
197            next_encode,
198            (target_end - next_encode),
199            target_hash,
200            coder);
201    if (bytes_encoded > 0) {
202      next_encode += bytes_encoded;  // Advance past COPYed data
203      candidate_pos = next_encode;
204      if (candidate_pos > start_of_last_block) {
205        break;  // Reached end of target data
206      }
207      // candidate_pos has jumped ahead by bytes_encoded bytes, so UpdateHash
208      // can't be used to calculate the hash value at its new position.
209      hash_value = hasher.Hash(candidate_pos);
210      if (look_for_target_matches) {
211        // Update the target hash for the ADDed and COPYed data
212        target_hash->AddAllBlocksThroughIndex(
213            static_cast<int>(next_encode - target_data));
214      }
215    } else {
216      // No match, or match is too small to be worth a COPY instruction.
217      // Move to the next position in the target data.
218      if ((candidate_pos + 1) > start_of_last_block) {
219        break;  // Reached end of target data
220      }
221      if (look_for_target_matches) {
222        target_hash->AddOneIndexHash(
223            static_cast<int>(candidate_pos - target_data),
224            hash_value);
225      }
226      hash_value = hasher.UpdateHash(hash_value,
227                                     candidate_pos[0],
228                                     candidate_pos[BlockHash::kBlockSize]);
229      ++candidate_pos;
230    }
231  }
232  AddUnmatchedRemainder(next_encode, target_end - next_encode, coder);
233  FinishEncoding(target_size, diff, coder);
234  delete target_hash;
235}
236
237void VCDiffEngine::Encode(const char* target_data,
238                          size_t target_size,
239                          bool look_for_target_matches,
240                          OutputStringInterface* diff,
241                          CodeTableWriterInterface* coder) const {
242  if (look_for_target_matches) {
243    EncodeInternal<true>(target_data, target_size, diff, coder);
244  } else {
245    EncodeInternal<false>(target_data, target_size, diff, coder);
246  }
247}
248
249}  // namespace open_vcdiff
250