1// Copyright 2008 Google Inc. 2// Author: Lincoln Smith 3// 4// Licensed under the Apache License, Version 2.0 (the "License"); 5// you may not use this file except in compliance with the License. 6// You may obtain a copy of the License at 7// 8// http://www.apache.org/licenses/LICENSE-2.0 9// 10// Unless required by applicable law or agreed to in writing, software 11// distributed under the License is distributed on an "AS IS" BASIS, 12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13// See the License for the specific language governing permissions and 14// limitations under the License. 15 16#include <config.h> 17#include <limits.h> // UCHAR_MAX 18#include <string> 19#include "addrcache.h" 20#include "codetable.h" 21#include "encodetable.h" 22#include "instruction_map.h" 23#include "logging.h" 24#include "google/output_string.h" 25#include "varint_bigendian.h" 26#include "vcdiff_defs.h" 27 28namespace open_vcdiff { 29 30static const DeltaFileHeader kHeaderStandardFormat = { 31 0xD6, // Header1: "V" | 0x80 32 0xC3, // Header2: "C" | 0x80 33 0xC4, // Header3: "D" | 0x80 34 0x00, // Header4: Draft standard format 35 0x00 }; // Hdr_Indicator: 36 // No compression, no custom code table 37 38static const DeltaFileHeader kHeaderExtendedFormat = { 39 0xD6, // Header1: "V" | 0x80 40 0xC3, // Header2: "C" | 0x80 41 0xC4, // Header3: "D" | 0x80 42 'S', // Header4: VCDIFF/SDCH, extensions used 43 0x00 }; // Hdr_Indicator: 44 // No compression, no custom code table 45 46// VCDiffCodeTableWriter members and methods 47 48// If interleaved is true, the encoder writes each delta file window 49// by interleaving instructions and sizes with their corresponding 50// addresses and data, rather than placing these elements into three 51// separate sections. This facilitates providing partially 52// decoded results when only a portion of a delta file window 53// is received (e.g. when HTTP over TCP is used as the 54// transmission protocol.) The interleaved format is 55// not consistent with the VCDIFF draft standard. 56// 57VCDiffCodeTableWriter::VCDiffCodeTableWriter(bool interleaved) 58 : max_mode_(VCDiffAddressCache::DefaultLastMode()), 59 dictionary_size_(0), 60 target_length_(0), 61 code_table_data_(&VCDiffCodeTableData::kDefaultCodeTableData), 62 instruction_map_(NULL), 63 last_opcode_index_(-1), 64 add_checksum_(false), 65 checksum_(0) { 66 InitSectionPointers(interleaved); 67} 68 69VCDiffCodeTableWriter::VCDiffCodeTableWriter( 70 bool interleaved, 71 int near_cache_size, 72 int same_cache_size, 73 const VCDiffCodeTableData& code_table_data, 74 unsigned char max_mode) 75 : max_mode_(max_mode), 76 address_cache_(near_cache_size, same_cache_size), 77 dictionary_size_(0), 78 target_length_(0), 79 code_table_data_(&code_table_data), 80 instruction_map_(NULL), 81 last_opcode_index_(-1), 82 add_checksum_(false), 83 checksum_(0) { 84 InitSectionPointers(interleaved); 85} 86 87VCDiffCodeTableWriter::~VCDiffCodeTableWriter() { 88 if (code_table_data_ != &VCDiffCodeTableData::kDefaultCodeTableData) { 89 delete instruction_map_; 90 } 91} 92 93void VCDiffCodeTableWriter::InitSectionPointers(bool interleaved) { 94 if (interleaved) { 95 data_for_add_and_run_ = &instructions_and_sizes_; 96 addresses_for_copy_ = &instructions_and_sizes_; 97 } else { 98 data_for_add_and_run_ = &separate_data_for_add_and_run_; 99 addresses_for_copy_ = &separate_addresses_for_copy_; 100 } 101} 102 103bool VCDiffCodeTableWriter::Init(size_t dictionary_size) { 104 dictionary_size_ = dictionary_size; 105 if (!instruction_map_) { 106 if (code_table_data_ == &VCDiffCodeTableData::kDefaultCodeTableData) { 107 instruction_map_ = VCDiffInstructionMap::GetDefaultInstructionMap(); 108 } else { 109 instruction_map_ = new VCDiffInstructionMap(*code_table_data_, max_mode_); 110 } 111 if (!instruction_map_) { 112 return false; 113 } 114 } 115 if (!address_cache_.Init()) { 116 return false; 117 } 118 target_length_ = 0; 119 last_opcode_index_ = -1; 120 return true; 121} 122 123void VCDiffCodeTableWriter::WriteHeader( 124 OutputStringInterface* out, 125 VCDiffFormatExtensionFlags format_extensions) { 126 if (format_extensions == VCD_STANDARD_FORMAT) { 127 out->append(reinterpret_cast<const char*>(&kHeaderStandardFormat), 128 sizeof(kHeaderStandardFormat)); 129 } else { 130 out->append(reinterpret_cast<const char*>(&kHeaderExtendedFormat), 131 sizeof(kHeaderExtendedFormat)); 132 } 133 // If custom cache table sizes or a custom code table were used 134 // for encoding, here is where they would be appended to *output. 135 // This implementation of the encoder does not use those features, 136 // although the decoder can understand and interpret them. 137} 138 139// The VCDiff format allows each opcode to represent either 140// one or two delta instructions. This function will first 141// examine the opcode generated by the last call to EncodeInstruction. 142// If that opcode was a single-instruction opcode, this function checks 143// whether there is a compound (double-instruction) opcode that can 144// combine that single instruction with the instruction that is now 145// being added, and so save a byte of space. In that case, the 146// single-instruction opcode at position last_opcode_index_ will be 147// overwritten with the new double-instruction opcode. 148// 149// In the majority of cases, no compound opcode will be possible, 150// and a new single-instruction opcode will be appended to 151// instructions_and_sizes_, followed by a representation of its size 152// if the opcode does not implicitly give the instruction size. 153// 154// As an example, say instructions_and_sizes_ contains 10 bytes, the last 155// of which contains the opcode 0x02 (ADD size 1). Because that was the 156// most recently added opcode, last_opcode_index_ has the value 10. 157// EncodeInstruction is then called with inst = VCD_COPY, size = 4, mode = 0. 158// The function will replace the old opcode 0x02 with the double-instruction 159// opcode 0xA3 (ADD size 1 + COPY size 4 mode 0). 160// 161// All of the double-instruction opcodes in the standard code table 162// have implicit sizes, meaning that the size of the instruction will not 163// need to be written to the instructions_and_sizes_ string separately 164// from the opcode. If a custom code table were used that did not have 165// this property, then instructions_and_sizes_ might contain a 166// double-instruction opcode (say, COPY size 0 mode 0 + ADD size 0) 167// followed by the size of the COPY, then by the size of the ADD. 168// If using the SDCH interleaved format, the address of the COPY instruction 169// would follow its size, so the ordering would be 170// [Compound Opcode][Size of COPY][Address of COPY][Size of ADD] 171// 172void VCDiffCodeTableWriter::EncodeInstruction(VCDiffInstructionType inst, 173 size_t size, 174 unsigned char mode) { 175 if (!instruction_map_) { 176 VCD_DFATAL << "EncodeInstruction() called without calling Init()" 177 << VCD_ENDL; 178 return; 179 } 180 if (last_opcode_index_ >= 0) { 181 const unsigned char last_opcode = 182 instructions_and_sizes_[last_opcode_index_]; 183 // The encoding engine should not generate two ADD instructions in a row. 184 // This won't cause a failure, but it's inefficient encoding and probably 185 // represents a bug in the higher-level logic of the encoder. 186 if ((inst == VCD_ADD) && 187 (code_table_data_->inst1[last_opcode] == VCD_ADD)) { 188 VCD_WARNING << "EncodeInstruction() called for two ADD instructions" 189 " in a row" << VCD_ENDL; 190 } 191 OpcodeOrNone compound_opcode = kNoOpcode; 192 if (size <= UCHAR_MAX) { 193 compound_opcode = 194 instruction_map_->LookupSecondOpcode(last_opcode, 195 inst, 196 static_cast<unsigned char>(size), 197 mode); 198 if (compound_opcode != kNoOpcode) { 199 instructions_and_sizes_[last_opcode_index_] = 200 static_cast<unsigned char>(compound_opcode); 201 last_opcode_index_ = -1; 202 return; 203 } 204 } 205 // Try finding a compound opcode with size 0. 206 compound_opcode = instruction_map_->LookupSecondOpcode(last_opcode, 207 inst, 208 0, 209 mode); 210 if (compound_opcode != kNoOpcode) { 211 instructions_and_sizes_[last_opcode_index_] = 212 static_cast<unsigned char>(compound_opcode); 213 last_opcode_index_ = -1; 214 AppendSizeToString(size, &instructions_and_sizes_); 215 return; 216 } 217 } 218 OpcodeOrNone opcode = kNoOpcode; 219 if (size <= UCHAR_MAX) { 220 opcode = 221 instruction_map_->LookupFirstOpcode(inst, 222 static_cast<unsigned char>(size), 223 mode); 224 if (opcode != kNoOpcode) { 225 instructions_and_sizes_.push_back(static_cast<char>(opcode)); 226 last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1); 227 return; 228 } 229 } 230 // There should always be an opcode with size 0. 231 opcode = instruction_map_->LookupFirstOpcode(inst, 0, mode); 232 if (opcode == kNoOpcode) { 233 VCD_DFATAL << "No matching opcode found for inst " << inst 234 << ", mode " << mode << ", size 0" << VCD_ENDL; 235 return; 236 } 237 instructions_and_sizes_.push_back(static_cast<char>(opcode)); 238 last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1); 239 AppendSizeToString(size, &instructions_and_sizes_); 240} 241 242void VCDiffCodeTableWriter::Add(const char* data, size_t size) { 243 EncodeInstruction(VCD_ADD, size); 244 data_for_add_and_run_->append(data, size); 245 target_length_ += size; 246} 247 248void VCDiffCodeTableWriter::Copy(int32_t offset, size_t size) { 249 if (!instruction_map_) { 250 VCD_DFATAL << "VCDiffCodeTableWriter::Copy() called without calling Init()" 251 << VCD_ENDL; 252 return; 253 } 254 // If a single interleaved stream of encoded values is used 255 // instead of separate sections for instructions, addresses, and data, 256 // then the string instructions_and_sizes_ may be the same as 257 // addresses_for_copy_. The address should therefore be encoded 258 // *after* the instruction and its size. 259 int32_t encoded_addr = 0; 260 const unsigned char mode = address_cache_.EncodeAddress( 261 offset, 262 static_cast<VCDAddress>(dictionary_size_ + target_length_), 263 &encoded_addr); 264 EncodeInstruction(VCD_COPY, size, mode); 265 if (address_cache_.WriteAddressAsVarintForMode(mode)) { 266 VarintBE<int32_t>::AppendToString(encoded_addr, addresses_for_copy_); 267 } else { 268 addresses_for_copy_->push_back(static_cast<unsigned char>(encoded_addr)); 269 } 270 target_length_ += size; 271} 272 273void VCDiffCodeTableWriter::Run(size_t size, unsigned char byte) { 274 EncodeInstruction(VCD_RUN, size); 275 data_for_add_and_run_->push_back(byte); 276 target_length_ += size; 277} 278 279size_t VCDiffCodeTableWriter::CalculateLengthOfSizeAsVarint(size_t size) { 280 return VarintBE<int32_t>::Length(static_cast<int32_t>(size)); 281} 282 283void VCDiffCodeTableWriter::AppendSizeToString(size_t size, string* out) { 284 VarintBE<int32_t>::AppendToString(static_cast<int32_t>(size), out); 285} 286 287void VCDiffCodeTableWriter::AppendSizeToOutputString( 288 size_t size, 289 OutputStringInterface* out) { 290 VarintBE<int32_t>::AppendToOutputString(static_cast<int32_t>(size), out); 291} 292 293// This calculation must match the items added between "Start of Delta Encoding" 294// and "End of Delta Encoding" in Output(), below. 295size_t VCDiffCodeTableWriter::CalculateLengthOfTheDeltaEncoding() const { 296 size_t length_of_the_delta_encoding = 297 CalculateLengthOfSizeAsVarint(target_length_) + 298 1 + // Delta_Indicator 299 CalculateLengthOfSizeAsVarint(separate_data_for_add_and_run_.size()) + 300 CalculateLengthOfSizeAsVarint(instructions_and_sizes_.size()) + 301 CalculateLengthOfSizeAsVarint(separate_addresses_for_copy_.size()) + 302 separate_data_for_add_and_run_.size() + 303 instructions_and_sizes_.size() + 304 separate_addresses_for_copy_.size(); 305 if (add_checksum_) { 306 length_of_the_delta_encoding += 307 VarintBE<int64_t>::Length(static_cast<int64_t>(checksum_)); 308 } 309 return length_of_the_delta_encoding; 310} 311 312void VCDiffCodeTableWriter::Output(OutputStringInterface* out) { 313 if (instructions_and_sizes_.empty()) { 314 VCD_WARNING << "Empty input; no delta window produced" << VCD_ENDL; 315 } else { 316 const size_t length_of_the_delta_encoding = 317 CalculateLengthOfTheDeltaEncoding(); 318 const size_t delta_window_size = 319 length_of_the_delta_encoding + 320 1 + // Win_Indicator 321 CalculateLengthOfSizeAsVarint(dictionary_size_) + 322 CalculateLengthOfSizeAsVarint(0) + 323 CalculateLengthOfSizeAsVarint(length_of_the_delta_encoding); 324 // append() will be called many times on the output string; make sure 325 // the output string is resized only once at most. 326 out->ReserveAdditionalBytes(delta_window_size); 327 328 // Add first element: Win_Indicator 329 if (add_checksum_) { 330 out->push_back(VCD_SOURCE | VCD_CHECKSUM); 331 } else { 332 out->push_back(VCD_SOURCE); 333 } 334 // Source segment size: dictionary size 335 AppendSizeToOutputString(dictionary_size_, out); 336 // Source segment position: 0 (start of dictionary) 337 AppendSizeToOutputString(0, out); 338 339 // [Here is where a secondary compressor would be used 340 // if the encoder and decoder supported that feature.] 341 342 AppendSizeToOutputString(length_of_the_delta_encoding, out); 343 // Start of Delta Encoding 344 const size_t size_before_delta_encoding = out->size(); 345 AppendSizeToOutputString(target_length_, out); 346 out->push_back(0x00); // Delta_Indicator: no compression 347 AppendSizeToOutputString(separate_data_for_add_and_run_.size(), out); 348 AppendSizeToOutputString(instructions_and_sizes_.size(), out); 349 AppendSizeToOutputString(separate_addresses_for_copy_.size(), out); 350 if (add_checksum_) { 351 // The checksum is a 32-bit *unsigned* integer. VarintBE requires a 352 // signed type, so use a 64-bit signed integer to store the checksum. 353 VarintBE<int64_t>::AppendToOutputString(static_cast<int64_t>(checksum_), 354 out); 355 } 356 out->append(separate_data_for_add_and_run_.data(), 357 separate_data_for_add_and_run_.size()); 358 out->append(instructions_and_sizes_.data(), 359 instructions_and_sizes_.size()); 360 out->append(separate_addresses_for_copy_.data(), 361 separate_addresses_for_copy_.size()); 362 // End of Delta Encoding 363 const size_t size_after_delta_encoding = out->size(); 364 if (length_of_the_delta_encoding != 365 (size_after_delta_encoding - size_before_delta_encoding)) { 366 VCD_DFATAL << "Internal error: calculated length of the delta encoding (" 367 << length_of_the_delta_encoding 368 << ") does not match actual length (" 369 << (size_after_delta_encoding - size_before_delta_encoding) 370 << VCD_ENDL; 371 } 372 separate_data_for_add_and_run_.clear(); 373 instructions_and_sizes_.clear(); 374 separate_addresses_for_copy_.clear(); 375 if (target_length_ == 0) { 376 VCD_WARNING << "Empty target window" << VCD_ENDL; 377 } 378 } 379 380 // Reset state for next window; assume we are using same code table 381 // and dictionary. The caller will have to invoke Init() if a different 382 // dictionary is used. 383 // 384 // Notably, Init() calls address_cache_.Init(). This resets the address 385 // cache between delta windows, as required by RFC section 5.1. 386 if (!Init(dictionary_size_)) { 387 VCD_DFATAL << "Internal error: calling Init() to reset " 388 "VCDiffCodeTableWriter state failed" << VCD_ENDL; 389 } 390} 391 392}; // namespace open_vcdiff 393