1// Copyright 2008 Google Inc.
2// Author: Lincoln Smith
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//      http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#include <config.h>
17#include "encodetable.h"
18#include <string>
19#include "addrcache.h"
20#include "codetable.h"
21#include "instruction_map.h"
22#include "logging.h"
23#include "google/output_string.h"
24#include "varint_bigendian.h"
25#include "vcdiff_defs.h"
26
27namespace open_vcdiff {
28
29// VCDiffCodeTableWriter members and methods
30
31// If interleaved is true, the encoder writes each delta file window
32// by interleaving instructions and sizes with their corresponding
33// addresses and data, rather than placing these elements into three
34// separate sections.  This facilitates providing partially
35// decoded results when only a portion of a delta file window
36// is received (e.g. when HTTP over TCP is used as the
37// transmission protocol.)  The interleaved format is
38// not consistent with the VCDIFF draft standard.
39//
40VCDiffCodeTableWriter::VCDiffCodeTableWriter(bool interleaved)
41    : max_mode_(VCDiffAddressCache::DefaultLastMode()),
42      dictionary_size_(0),
43      target_length_(0),
44      code_table_data_(&VCDiffCodeTableData::kDefaultCodeTableData),
45      instruction_map_(NULL),
46      last_opcode_index_(-1),
47      add_checksum_(false),
48      checksum_(0),
49      match_counts_(kMaxMatchSize, 0) {
50  InitSectionPointers(interleaved);
51}
52
53VCDiffCodeTableWriter::VCDiffCodeTableWriter(
54    bool interleaved,
55    int near_cache_size,
56    int same_cache_size,
57    const VCDiffCodeTableData& code_table_data,
58    unsigned char max_mode)
59    : max_mode_(max_mode),
60      address_cache_(near_cache_size, same_cache_size),
61      dictionary_size_(0),
62      target_length_(0),
63      code_table_data_(&code_table_data),
64      instruction_map_(NULL),
65      last_opcode_index_(-1),
66      add_checksum_(false),
67      checksum_(0)  {
68  InitSectionPointers(interleaved);
69}
70
71VCDiffCodeTableWriter::~VCDiffCodeTableWriter() {
72  if (code_table_data_ != &VCDiffCodeTableData::kDefaultCodeTableData) {
73    delete instruction_map_;
74  }
75}
76
77void VCDiffCodeTableWriter::InitSectionPointers(bool interleaved) {
78  if (interleaved) {
79    data_for_add_and_run_ = &instructions_and_sizes_;
80    addresses_for_copy_ = &instructions_and_sizes_;
81  } else {
82    data_for_add_and_run_ = &separate_data_for_add_and_run_;
83    addresses_for_copy_ = &separate_addresses_for_copy_;
84  }
85}
86
87bool VCDiffCodeTableWriter::Init(size_t dictionary_size) {
88  dictionary_size_ = dictionary_size;
89  if (!instruction_map_) {
90    if (code_table_data_ == &VCDiffCodeTableData::kDefaultCodeTableData) {
91      instruction_map_ = VCDiffInstructionMap::GetDefaultInstructionMap();
92    } else {
93      instruction_map_ = new VCDiffInstructionMap(*code_table_data_, max_mode_);
94    }
95    if (!instruction_map_) {
96      return false;
97    }
98  }
99  if (!address_cache_.Init()) {
100    return false;
101  }
102  target_length_ = 0;
103  last_opcode_index_ = -1;
104  return true;
105}
106
107// The VCDiff format allows each opcode to represent either
108// one or two delta instructions.  This function will first
109// examine the opcode generated by the last call to EncodeInstruction.
110// If that opcode was a single-instruction opcode, this function checks
111// whether there is a compound (double-instruction) opcode that can
112// combine that single instruction with the instruction that is now
113// being added, and so save a byte of space.  In that case, the
114// single-instruction opcode at position last_opcode_index_ will be
115// overwritten with the new double-instruction opcode.
116//
117// In the majority of cases, no compound opcode will be possible,
118// and a new single-instruction opcode will be appended to
119// instructions_and_sizes_, followed by a representation of its size
120// if the opcode does not implicitly give the instruction size.
121//
122// As an example, say instructions_and_sizes_ contains 10 bytes, the last
123// of which contains the opcode 0x02 (ADD size 1).  Because that was the
124// most recently added opcode, last_opcode_index_ has the value 10.
125// EncodeInstruction is then called with inst = VCD_COPY, size = 4, mode = 0.
126// The function will replace the old opcode 0x02 with the double-instruction
127// opcode 0xA3 (ADD size 1 + COPY size 4 mode 0).
128//
129// All of the double-instruction opcodes in the standard code table
130// have implicit sizes, meaning that the size of the instruction will not
131// need to be written to the instructions_and_sizes_ string separately
132// from the opcode.  If a custom code table were used that did not have
133// this property, then instructions_and_sizes_ might contain a
134// double-instruction opcode (say, COPY size 0 mode 0 + ADD size 0)
135// followed by the size of the COPY, then by the size of the ADD.
136// If using the SDCH interleaved format, the address of the COPY instruction
137// would follow its size, so the ordering would be
138// [Compound Opcode][Size of COPY][Address of COPY][Size of ADD]
139//
140void VCDiffCodeTableWriter::EncodeInstruction(VCDiffInstructionType inst,
141                                              size_t size,
142                                              unsigned char mode) {
143  if (!instruction_map_) {
144    LOG(DFATAL) << "EncodeInstruction() called without calling Init()"
145                << LOG_ENDL;
146    return;
147  }
148  if (last_opcode_index_ >= 0) {
149    const unsigned char last_opcode =
150        instructions_and_sizes_[last_opcode_index_];
151    // The encoding engine should not generate two ADD instructions in a row.
152    // This won't cause a failure, but it's inefficient encoding and probably
153    // represents a bug in the higher-level logic of the encoder.
154    if ((inst == VCD_ADD) &&
155        (code_table_data_->inst1[last_opcode] == VCD_ADD)) {
156      LOG(WARNING) << "EncodeInstruction() called for two ADD instructions"
157                      " in a row" << LOG_ENDL;
158    }
159    OpcodeOrNone compound_opcode = kNoOpcode;
160    if (size <= UCHAR_MAX) {
161      compound_opcode =
162          instruction_map_->LookupSecondOpcode(last_opcode,
163                                               inst,
164                                               static_cast<unsigned char>(size),
165                                               mode);
166      if (compound_opcode != kNoOpcode) {
167        instructions_and_sizes_[last_opcode_index_] =
168            static_cast<unsigned char>(compound_opcode);
169        last_opcode_index_ = -1;
170        return;
171      }
172    }
173    // Try finding a compound opcode with size 0.
174    compound_opcode = instruction_map_->LookupSecondOpcode(last_opcode,
175                                                           inst,
176                                                           0,
177                                                           mode);
178    if (compound_opcode != kNoOpcode) {
179      instructions_and_sizes_[last_opcode_index_] =
180          static_cast<unsigned char>(compound_opcode);
181      last_opcode_index_ = -1;
182      AppendSizeToString(size, &instructions_and_sizes_);
183      return;
184    }
185  }
186  OpcodeOrNone opcode = kNoOpcode;
187  if (size <= UCHAR_MAX) {
188    opcode =
189        instruction_map_->LookupFirstOpcode(inst,
190                                            static_cast<unsigned char>(size),
191                                            mode);
192    if (opcode != kNoOpcode) {
193      instructions_and_sizes_.push_back(static_cast<char>(opcode));
194      last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
195      return;
196    }
197  }
198  // There should always be an opcode with size 0.
199  opcode = instruction_map_->LookupFirstOpcode(inst, 0, mode);
200  if (opcode == kNoOpcode) {
201    LOG(DFATAL) << "No matching opcode found for inst " << inst
202                << ", mode " << mode << ", size 0" << LOG_ENDL;
203    return;
204  }
205  instructions_and_sizes_.push_back(static_cast<char>(opcode));
206  last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
207  AppendSizeToString(size, &instructions_and_sizes_);
208}
209
210void VCDiffCodeTableWriter::Add(const char* data, size_t size) {
211  EncodeInstruction(VCD_ADD, size);
212  data_for_add_and_run_->append(data, size);
213  target_length_ += size;
214}
215
216void VCDiffCodeTableWriter::Copy(int32_t offset, size_t size) {
217  if (!instruction_map_) {
218    LOG(DFATAL) << "VCDiffCodeTableWriter::Copy() called without calling Init()"
219                << LOG_ENDL;
220    return;
221  }
222  // If a single interleaved stream of encoded values is used
223  // instead of separate sections for instructions, addresses, and data,
224  // then the string instructions_and_sizes_ may be the same as
225  // addresses_for_copy_.  The address should therefore be encoded
226  // *after* the instruction and its size.
227  int32_t encoded_addr = 0;
228  const unsigned char mode = address_cache_.EncodeAddress(
229      offset,
230      static_cast<VCDAddress>(dictionary_size_ + target_length_),
231      &encoded_addr);
232  EncodeInstruction(VCD_COPY, size, mode);
233  if (address_cache_.WriteAddressAsVarintForMode(mode)) {
234    VarintBE<int32_t>::AppendToString(encoded_addr, addresses_for_copy_);
235  } else {
236    addresses_for_copy_->push_back(static_cast<unsigned char>(encoded_addr));
237  }
238  target_length_ += size;
239  if (size >= match_counts_.size()) {
240    match_counts_.resize(size * 2, 0);  // Be generous to avoid resizing again
241  }
242  ++match_counts_[size];
243}
244
245void VCDiffCodeTableWriter::Run(size_t size, unsigned char byte) {
246  EncodeInstruction(VCD_RUN, size);
247  data_for_add_and_run_->push_back(byte);
248  target_length_ += size;
249}
250
251size_t VCDiffCodeTableWriter::CalculateLengthOfSizeAsVarint(size_t size) {
252  return VarintBE<int32_t>::Length(static_cast<int32_t>(size));
253}
254
255void VCDiffCodeTableWriter::AppendSizeToString(size_t size, string* out) {
256  VarintBE<int32_t>::AppendToString(static_cast<int32_t>(size), out);
257}
258
259void VCDiffCodeTableWriter::AppendSizeToOutputString(
260    size_t size,
261    OutputStringInterface* out) {
262  VarintBE<int32_t>::AppendToOutputString(static_cast<int32_t>(size), out);
263}
264
265// This calculation must match the items added between "Start of Delta Encoding"
266// and "End of Delta Encoding" in Output(), below.
267size_t VCDiffCodeTableWriter::CalculateLengthOfTheDeltaEncoding() const {
268  size_t length_of_the_delta_encoding =
269    CalculateLengthOfSizeAsVarint(target_length_) +
270    1 +  // Delta_Indicator
271    CalculateLengthOfSizeAsVarint(separate_data_for_add_and_run_.size()) +
272    CalculateLengthOfSizeAsVarint(instructions_and_sizes_.size()) +
273    CalculateLengthOfSizeAsVarint(separate_addresses_for_copy_.size()) +
274    separate_data_for_add_and_run_.size() +
275    instructions_and_sizes_.size() +
276    separate_addresses_for_copy_.size();
277  if (add_checksum_) {
278    length_of_the_delta_encoding +=
279        VarintBE<int64_t>::Length(static_cast<int64_t>(checksum_));
280  }
281  return length_of_the_delta_encoding;
282}
283
284void VCDiffCodeTableWriter::Output(OutputStringInterface* out) {
285  if (instructions_and_sizes_.empty()) {
286    LOG(WARNING) << "Empty input; no delta window produced" << LOG_ENDL;
287  } else {
288    const size_t length_of_the_delta_encoding =
289        CalculateLengthOfTheDeltaEncoding();
290    const size_t delta_window_size =
291        length_of_the_delta_encoding +
292        1 +  // Win_Indicator
293        CalculateLengthOfSizeAsVarint(dictionary_size_) +
294        CalculateLengthOfSizeAsVarint(0) +
295        CalculateLengthOfSizeAsVarint(length_of_the_delta_encoding);
296    // append() will be called many times on the output string; make sure
297    // the output string is resized only once at most.
298    out->ReserveAdditionalBytes(delta_window_size);
299
300    // Add first element: Win_Indicator
301    if (add_checksum_) {
302      out->push_back(VCD_SOURCE | VCD_CHECKSUM);
303    } else {
304      out->push_back(VCD_SOURCE);
305    }
306    // Source segment size: dictionary size
307    AppendSizeToOutputString(dictionary_size_, out);
308    // Source segment position: 0 (start of dictionary)
309    AppendSizeToOutputString(0, out);
310
311    // [Here is where a secondary compressor would be used
312    //  if the encoder and decoder supported that feature.]
313
314    AppendSizeToOutputString(length_of_the_delta_encoding, out);
315    // Start of Delta Encoding
316    const size_t size_before_delta_encoding = out->size();
317    AppendSizeToOutputString(target_length_, out);
318    out->push_back(0x00);  // Delta_Indicator: no compression
319    AppendSizeToOutputString(separate_data_for_add_and_run_.size(), out);
320    AppendSizeToOutputString(instructions_and_sizes_.size(), out);
321    AppendSizeToOutputString(separate_addresses_for_copy_.size(), out);
322    if (add_checksum_) {
323      // The checksum is a 32-bit *unsigned* integer.  VarintBE requires a
324      // signed type, so use a 64-bit signed integer to store the checksum.
325      VarintBE<int64_t>::AppendToOutputString(static_cast<int64_t>(checksum_),
326                                              out);
327    }
328    out->append(separate_data_for_add_and_run_.data(),
329                separate_data_for_add_and_run_.size());
330    out->append(instructions_and_sizes_.data(),
331                instructions_and_sizes_.size());
332    out->append(separate_addresses_for_copy_.data(),
333                separate_addresses_for_copy_.size());
334    // End of Delta Encoding
335    const size_t size_after_delta_encoding = out->size();
336    if (length_of_the_delta_encoding !=
337        (size_after_delta_encoding - size_before_delta_encoding)) {
338      LOG(DFATAL) << "Internal error: calculated length of the delta encoding ("
339                  << length_of_the_delta_encoding
340                  << ") does not match actual length ("
341                  << (size_after_delta_encoding - size_before_delta_encoding)
342                  << LOG_ENDL;
343    }
344    separate_data_for_add_and_run_.clear();
345    instructions_and_sizes_.clear();
346    separate_addresses_for_copy_.clear();
347    if (target_length_ == 0) {
348      LOG(WARNING) << "Empty target window" << LOG_ENDL;
349    }
350  }
351
352  // Reset state for next window; assume we are using same code table
353  // and dictionary.  The caller will have to invoke Init() if a different
354  // dictionary is used.
355  //
356  // Notably, Init() calls address_cache_.Init().  This resets the address
357  // cache between delta windows, as required by RFC section 5.1.
358  if (!Init(dictionary_size_)) {
359    LOG(DFATAL) << "Internal error: calling Init() to reset "
360                   "VCDiffCodeTableWriter state failed" << LOG_ENDL;
361  }
362}
363
364};  // namespace open_vcdiff
365