1// Copyright 2008 Google Inc.
2// Author: Lincoln Smith
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//      http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#include <config.h>
17#include <limits.h>  // UCHAR_MAX
18#include <string>
19#include "addrcache.h"
20#include "codetable.h"
21#include "encodetable.h"
22#include "instruction_map.h"
23#include "logging.h"
24#include "google/output_string.h"
25#include "varint_bigendian.h"
26#include "vcdiff_defs.h"
27
28namespace open_vcdiff {
29
30static const DeltaFileHeader kHeaderStandardFormat = {
31    0xD6,  // Header1: "V" | 0x80
32    0xC3,  // Header2: "C" | 0x80
33    0xC4,  // Header3: "D" | 0x80
34    0x00,  // Header4: Draft standard format
35    0x00 };  // Hdr_Indicator:
36             // No compression, no custom code table
37
38static const DeltaFileHeader kHeaderExtendedFormat = {
39    0xD6,  // Header1: "V" | 0x80
40    0xC3,  // Header2: "C" | 0x80
41    0xC4,  // Header3: "D" | 0x80
42    'S',   // Header4: VCDIFF/SDCH, extensions used
43    0x00 };  // Hdr_Indicator:
44             // No compression, no custom code table
45
46// VCDiffCodeTableWriter members and methods
47
48// If interleaved is true, the encoder writes each delta file window
49// by interleaving instructions and sizes with their corresponding
50// addresses and data, rather than placing these elements into three
51// separate sections.  This facilitates providing partially
52// decoded results when only a portion of a delta file window
53// is received (e.g. when HTTP over TCP is used as the
54// transmission protocol.)  The interleaved format is
55// not consistent with the VCDIFF draft standard.
56//
57VCDiffCodeTableWriter::VCDiffCodeTableWriter(bool interleaved)
58    : max_mode_(VCDiffAddressCache::DefaultLastMode()),
59      dictionary_size_(0),
60      target_length_(0),
61      code_table_data_(&VCDiffCodeTableData::kDefaultCodeTableData),
62      instruction_map_(NULL),
63      last_opcode_index_(-1),
64      add_checksum_(false),
65      checksum_(0) {
66  InitSectionPointers(interleaved);
67}
68
69VCDiffCodeTableWriter::VCDiffCodeTableWriter(
70    bool interleaved,
71    int near_cache_size,
72    int same_cache_size,
73    const VCDiffCodeTableData& code_table_data,
74    unsigned char max_mode)
75    : max_mode_(max_mode),
76      address_cache_(near_cache_size, same_cache_size),
77      dictionary_size_(0),
78      target_length_(0),
79      code_table_data_(&code_table_data),
80      instruction_map_(NULL),
81      last_opcode_index_(-1),
82      add_checksum_(false),
83      checksum_(0)  {
84  InitSectionPointers(interleaved);
85}
86
87VCDiffCodeTableWriter::~VCDiffCodeTableWriter() {
88  if (code_table_data_ != &VCDiffCodeTableData::kDefaultCodeTableData) {
89    delete instruction_map_;
90  }
91}
92
93void VCDiffCodeTableWriter::InitSectionPointers(bool interleaved) {
94  if (interleaved) {
95    data_for_add_and_run_ = &instructions_and_sizes_;
96    addresses_for_copy_ = &instructions_and_sizes_;
97  } else {
98    data_for_add_and_run_ = &separate_data_for_add_and_run_;
99    addresses_for_copy_ = &separate_addresses_for_copy_;
100  }
101}
102
103bool VCDiffCodeTableWriter::Init(size_t dictionary_size) {
104  dictionary_size_ = dictionary_size;
105  if (!instruction_map_) {
106    if (code_table_data_ == &VCDiffCodeTableData::kDefaultCodeTableData) {
107      instruction_map_ = VCDiffInstructionMap::GetDefaultInstructionMap();
108    } else {
109      instruction_map_ = new VCDiffInstructionMap(*code_table_data_, max_mode_);
110    }
111    if (!instruction_map_) {
112      return false;
113    }
114  }
115  if (!address_cache_.Init()) {
116    return false;
117  }
118  target_length_ = 0;
119  last_opcode_index_ = -1;
120  return true;
121}
122
123void VCDiffCodeTableWriter::WriteHeader(
124    OutputStringInterface* out,
125    VCDiffFormatExtensionFlags format_extensions) {
126  if (format_extensions == VCD_STANDARD_FORMAT) {
127    out->append(reinterpret_cast<const char*>(&kHeaderStandardFormat),
128                sizeof(kHeaderStandardFormat));
129  } else {
130    out->append(reinterpret_cast<const char*>(&kHeaderExtendedFormat),
131                sizeof(kHeaderExtendedFormat));
132  }
133  // If custom cache table sizes or a custom code table were used
134  // for encoding, here is where they would be appended to *output.
135  // This implementation of the encoder does not use those features,
136  // although the decoder can understand and interpret them.
137}
138
139// The VCDiff format allows each opcode to represent either
140// one or two delta instructions.  This function will first
141// examine the opcode generated by the last call to EncodeInstruction.
142// If that opcode was a single-instruction opcode, this function checks
143// whether there is a compound (double-instruction) opcode that can
144// combine that single instruction with the instruction that is now
145// being added, and so save a byte of space.  In that case, the
146// single-instruction opcode at position last_opcode_index_ will be
147// overwritten with the new double-instruction opcode.
148//
149// In the majority of cases, no compound opcode will be possible,
150// and a new single-instruction opcode will be appended to
151// instructions_and_sizes_, followed by a representation of its size
152// if the opcode does not implicitly give the instruction size.
153//
154// As an example, say instructions_and_sizes_ contains 10 bytes, the last
155// of which contains the opcode 0x02 (ADD size 1).  Because that was the
156// most recently added opcode, last_opcode_index_ has the value 10.
157// EncodeInstruction is then called with inst = VCD_COPY, size = 4, mode = 0.
158// The function will replace the old opcode 0x02 with the double-instruction
159// opcode 0xA3 (ADD size 1 + COPY size 4 mode 0).
160//
161// All of the double-instruction opcodes in the standard code table
162// have implicit sizes, meaning that the size of the instruction will not
163// need to be written to the instructions_and_sizes_ string separately
164// from the opcode.  If a custom code table were used that did not have
165// this property, then instructions_and_sizes_ might contain a
166// double-instruction opcode (say, COPY size 0 mode 0 + ADD size 0)
167// followed by the size of the COPY, then by the size of the ADD.
168// If using the SDCH interleaved format, the address of the COPY instruction
169// would follow its size, so the ordering would be
170// [Compound Opcode][Size of COPY][Address of COPY][Size of ADD]
171//
172void VCDiffCodeTableWriter::EncodeInstruction(VCDiffInstructionType inst,
173                                              size_t size,
174                                              unsigned char mode) {
175  if (!instruction_map_) {
176    VCD_DFATAL << "EncodeInstruction() called without calling Init()"
177               << VCD_ENDL;
178    return;
179  }
180  if (last_opcode_index_ >= 0) {
181    const unsigned char last_opcode =
182        instructions_and_sizes_[last_opcode_index_];
183    // The encoding engine should not generate two ADD instructions in a row.
184    // This won't cause a failure, but it's inefficient encoding and probably
185    // represents a bug in the higher-level logic of the encoder.
186    if ((inst == VCD_ADD) &&
187        (code_table_data_->inst1[last_opcode] == VCD_ADD)) {
188      VCD_WARNING << "EncodeInstruction() called for two ADD instructions"
189                     " in a row" << VCD_ENDL;
190    }
191    OpcodeOrNone compound_opcode = kNoOpcode;
192    if (size <= UCHAR_MAX) {
193      compound_opcode =
194          instruction_map_->LookupSecondOpcode(last_opcode,
195                                               inst,
196                                               static_cast<unsigned char>(size),
197                                               mode);
198      if (compound_opcode != kNoOpcode) {
199        instructions_and_sizes_[last_opcode_index_] =
200            static_cast<unsigned char>(compound_opcode);
201        last_opcode_index_ = -1;
202        return;
203      }
204    }
205    // Try finding a compound opcode with size 0.
206    compound_opcode = instruction_map_->LookupSecondOpcode(last_opcode,
207                                                           inst,
208                                                           0,
209                                                           mode);
210    if (compound_opcode != kNoOpcode) {
211      instructions_and_sizes_[last_opcode_index_] =
212          static_cast<unsigned char>(compound_opcode);
213      last_opcode_index_ = -1;
214      AppendSizeToString(size, &instructions_and_sizes_);
215      return;
216    }
217  }
218  OpcodeOrNone opcode = kNoOpcode;
219  if (size <= UCHAR_MAX) {
220    opcode =
221        instruction_map_->LookupFirstOpcode(inst,
222                                            static_cast<unsigned char>(size),
223                                            mode);
224    if (opcode != kNoOpcode) {
225      instructions_and_sizes_.push_back(static_cast<char>(opcode));
226      last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
227      return;
228    }
229  }
230  // There should always be an opcode with size 0.
231  opcode = instruction_map_->LookupFirstOpcode(inst, 0, mode);
232  if (opcode == kNoOpcode) {
233    VCD_DFATAL << "No matching opcode found for inst " << inst
234               << ", mode " << mode << ", size 0" << VCD_ENDL;
235    return;
236  }
237  instructions_and_sizes_.push_back(static_cast<char>(opcode));
238  last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
239  AppendSizeToString(size, &instructions_and_sizes_);
240}
241
242void VCDiffCodeTableWriter::Add(const char* data, size_t size) {
243  EncodeInstruction(VCD_ADD, size);
244  data_for_add_and_run_->append(data, size);
245  target_length_ += size;
246}
247
248void VCDiffCodeTableWriter::Copy(int32_t offset, size_t size) {
249  if (!instruction_map_) {
250    VCD_DFATAL << "VCDiffCodeTableWriter::Copy() called without calling Init()"
251               << VCD_ENDL;
252    return;
253  }
254  // If a single interleaved stream of encoded values is used
255  // instead of separate sections for instructions, addresses, and data,
256  // then the string instructions_and_sizes_ may be the same as
257  // addresses_for_copy_.  The address should therefore be encoded
258  // *after* the instruction and its size.
259  int32_t encoded_addr = 0;
260  const unsigned char mode = address_cache_.EncodeAddress(
261      offset,
262      static_cast<VCDAddress>(dictionary_size_ + target_length_),
263      &encoded_addr);
264  EncodeInstruction(VCD_COPY, size, mode);
265  if (address_cache_.WriteAddressAsVarintForMode(mode)) {
266    VarintBE<int32_t>::AppendToString(encoded_addr, addresses_for_copy_);
267  } else {
268    addresses_for_copy_->push_back(static_cast<unsigned char>(encoded_addr));
269  }
270  target_length_ += size;
271}
272
273void VCDiffCodeTableWriter::Run(size_t size, unsigned char byte) {
274  EncodeInstruction(VCD_RUN, size);
275  data_for_add_and_run_->push_back(byte);
276  target_length_ += size;
277}
278
279size_t VCDiffCodeTableWriter::CalculateLengthOfSizeAsVarint(size_t size) {
280  return VarintBE<int32_t>::Length(static_cast<int32_t>(size));
281}
282
283void VCDiffCodeTableWriter::AppendSizeToString(size_t size, string* out) {
284  VarintBE<int32_t>::AppendToString(static_cast<int32_t>(size), out);
285}
286
287void VCDiffCodeTableWriter::AppendSizeToOutputString(
288    size_t size,
289    OutputStringInterface* out) {
290  VarintBE<int32_t>::AppendToOutputString(static_cast<int32_t>(size), out);
291}
292
293// This calculation must match the items added between "Start of Delta Encoding"
294// and "End of Delta Encoding" in Output(), below.
295size_t VCDiffCodeTableWriter::CalculateLengthOfTheDeltaEncoding() const {
296  size_t length_of_the_delta_encoding =
297    CalculateLengthOfSizeAsVarint(target_length_) +
298    1 +  // Delta_Indicator
299    CalculateLengthOfSizeAsVarint(separate_data_for_add_and_run_.size()) +
300    CalculateLengthOfSizeAsVarint(instructions_and_sizes_.size()) +
301    CalculateLengthOfSizeAsVarint(separate_addresses_for_copy_.size()) +
302    separate_data_for_add_and_run_.size() +
303    instructions_and_sizes_.size() +
304    separate_addresses_for_copy_.size();
305  if (add_checksum_) {
306    length_of_the_delta_encoding +=
307        VarintBE<int64_t>::Length(static_cast<int64_t>(checksum_));
308  }
309  return length_of_the_delta_encoding;
310}
311
312void VCDiffCodeTableWriter::Output(OutputStringInterface* out) {
313  if (instructions_and_sizes_.empty()) {
314    VCD_WARNING << "Empty input; no delta window produced" << VCD_ENDL;
315  } else {
316    const size_t length_of_the_delta_encoding =
317        CalculateLengthOfTheDeltaEncoding();
318    const size_t delta_window_size =
319        length_of_the_delta_encoding +
320        1 +  // Win_Indicator
321        CalculateLengthOfSizeAsVarint(dictionary_size_) +
322        CalculateLengthOfSizeAsVarint(0) +
323        CalculateLengthOfSizeAsVarint(length_of_the_delta_encoding);
324    // append() will be called many times on the output string; make sure
325    // the output string is resized only once at most.
326    out->ReserveAdditionalBytes(delta_window_size);
327
328    // Add first element: Win_Indicator
329    if (add_checksum_) {
330      out->push_back(VCD_SOURCE | VCD_CHECKSUM);
331    } else {
332      out->push_back(VCD_SOURCE);
333    }
334    // Source segment size: dictionary size
335    AppendSizeToOutputString(dictionary_size_, out);
336    // Source segment position: 0 (start of dictionary)
337    AppendSizeToOutputString(0, out);
338
339    // [Here is where a secondary compressor would be used
340    //  if the encoder and decoder supported that feature.]
341
342    AppendSizeToOutputString(length_of_the_delta_encoding, out);
343    // Start of Delta Encoding
344    const size_t size_before_delta_encoding = out->size();
345    AppendSizeToOutputString(target_length_, out);
346    out->push_back(0x00);  // Delta_Indicator: no compression
347    AppendSizeToOutputString(separate_data_for_add_and_run_.size(), out);
348    AppendSizeToOutputString(instructions_and_sizes_.size(), out);
349    AppendSizeToOutputString(separate_addresses_for_copy_.size(), out);
350    if (add_checksum_) {
351      // The checksum is a 32-bit *unsigned* integer.  VarintBE requires a
352      // signed type, so use a 64-bit signed integer to store the checksum.
353      VarintBE<int64_t>::AppendToOutputString(static_cast<int64_t>(checksum_),
354                                              out);
355    }
356    out->append(separate_data_for_add_and_run_.data(),
357                separate_data_for_add_and_run_.size());
358    out->append(instructions_and_sizes_.data(),
359                instructions_and_sizes_.size());
360    out->append(separate_addresses_for_copy_.data(),
361                separate_addresses_for_copy_.size());
362    // End of Delta Encoding
363    const size_t size_after_delta_encoding = out->size();
364    if (length_of_the_delta_encoding !=
365        (size_after_delta_encoding - size_before_delta_encoding)) {
366      VCD_DFATAL << "Internal error: calculated length of the delta encoding ("
367                 << length_of_the_delta_encoding
368                 << ") does not match actual length ("
369                 << (size_after_delta_encoding - size_before_delta_encoding)
370                 << VCD_ENDL;
371    }
372    separate_data_for_add_and_run_.clear();
373    instructions_and_sizes_.clear();
374    separate_addresses_for_copy_.clear();
375    if (target_length_ == 0) {
376      VCD_WARNING << "Empty target window" << VCD_ENDL;
377    }
378  }
379
380  // Reset state for next window; assume we are using same code table
381  // and dictionary.  The caller will have to invoke Init() if a different
382  // dictionary is used.
383  //
384  // Notably, Init() calls address_cache_.Init().  This resets the address
385  // cache between delta windows, as required by RFC section 5.1.
386  if (!Init(dictionary_size_)) {
387    VCD_DFATAL << "Internal error: calling Init() to reset "
388                  "VCDiffCodeTableWriter state failed" << VCD_ENDL;
389  }
390}
391
392};  // namespace open_vcdiff
393