1// Copyright 2008 Google Inc.
2// Author: Lincoln Smith
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//      http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15//
16// A command-line interface to the open-vcdiff library.
17
18#include <config.h>
19#include <assert.h>
20#include <errno.h>
21#ifdef WIN32
22#include <fcntl.h>
23#include <io.h>
24#endif  // WIN32
25#include <stdio.h>
26#include <string.h>  // strerror
27#include <iostream>
28#include <memory>
29#include <string>
30#include <vector>
31#include "gflags/gflags.h"
32#include "google/vcdecoder.h"
33#include "google/vcencoder.h"
34
35#ifndef HAS_GLOBAL_STRING
36using std::string;
37#endif  // !HAS_GLOBAL_STRING
38using google::GetCommandLineFlagInfoOrDie;
39using google::ShowUsageWithFlagsRestrict;
40
41static const size_t kDefaultMaxTargetSize = 1 << 26;      // 64 MB
42
43// Definitions of command-line flags
44DEFINE_string(dictionary, "",
45              "File containing dictionary data (required)");
46DEFINE_string(target, "",
47              "Target file (default is stdin for encode, stdout for decode");
48DEFINE_string(delta, "",
49              "Encoded delta file (default is stdout for encode, "
50              "stdin for decode");
51// --buffersize is the maximum allowable size of a target window.
52// This value may be increased if there is sufficient memory available.
53DEFINE_uint64(buffersize, 1 << 20,  // 1 MB
54              "Buffer size for reading input file");
55DEFINE_bool(allow_vcd_target, true,
56            "If false, the decoder issues an error when the VCD_TARGET flag "
57            "is encountered");
58DEFINE_bool(checksum, false,
59            "Include an Adler32 checksum of the target data when encoding");
60DEFINE_bool(interleaved, false, "Use interleaved format");
61DEFINE_bool(json, false, "Output diff in the JSON format when encoding");
62DEFINE_bool(stats, false, "Report compression percentage");
63DEFINE_bool(target_matches, false, "Find duplicate strings in target data"
64                                   " as well as dictionary data");
65DEFINE_uint64(max_target_file_size, kDefaultMaxTargetSize,
66              "Maximum target file size allowed by decoder");
67DEFINE_uint64(max_target_window_size, kDefaultMaxTargetSize,
68              "Maximum target window size allowed by decoder");
69
70static const char* const kUsageString =
71    " {encode | delta | decode | patch }[ <options> ]\n"
72    "encode or delta: create delta file from dictionary and target file\n"
73    "decode or patch: reconstruct target file from dictionary and delta file";
74
75namespace open_vcdiff {
76
// Carries out one file-based encode, decode, or decode-and-compare operation
// using the file names and options supplied through the command-line flags.
// The object owns the input and output FILE handles it opens and closes them
// in its destructor (stdin and stdout are never closed).
class VCDiffFileBasedCoder {
 public:
  VCDiffFileBasedCoder();
  ~VCDiffFileBasedCoder();

  // Once the command-line arguments have been parsed, these functions
  // will use the supplied options to carry out a file-based encode
  // or decode operation.  Each returns true on success; on failure an
  // error message is written to stderr and false is returned.
  bool Encode();
  bool Decode();
  bool DecodeAndCompare();  // for "vcdiff test"; compare target with original

 private:
  // Determines the size of the file.  The given file must be an input file
  // opened for reading only, not an input stream such as stdin.  The function
  // returns true and populates file_size if successful; otherwise, it returns
  // false.
  static bool FileSize(FILE* file, size_t* file_size);

  // Opens a file for incremental reading.  file_name is the name of the file
  // to be opened; if it is empty and *file is NULL, stdin is used instead.
  // file_type should be a descriptive name (like "target") for use in log
  // messages.  If successful, returns true, sets *file to a valid input file,
  // and resizes *buffer (which must be empty on entry) to hold one chunk of
  // input.  The buffer is never larger than the --buffersize option; a
  // smaller buffer is used when the whole file fits in less.  If the function
  // fails, it outputs a log message and returns false.
  bool OpenFileForReading(const string& file_name,
                          const char* file_type,
                          FILE** file,
                          std::vector<char>* buffer);

  // Opens the dictionary file named by --dictionary and reads it into memory.
  // If successful, returns true and populates dictionary_ with the dictionary
  // contents; otherwise, returns false.
  bool OpenDictionary();

  // Opens the input file (the delta or target file) for reading.
  // Allocates space for the input buffer.  If successful,
  // input_file_ will be valid and input_buffer_ will be allocated.
  bool OpenInputFile() {
    return OpenFileForReading(input_file_name_,
                              input_file_type_,
                              &input_file_,
                              &input_buffer_);
  }

  // Opens the output file (the target or delta file) for writing.
  // If successful, output_file_ will be valid.
  bool OpenOutputFile();

  // Opens the output file (the target file) for comparison against the decoded
  // output when using "vcdiff test".  Note that in this mode output_file_ is
  // opened for reading, not writing, and compare_buffer_ is allocated.
  bool OpenOutputFileForCompare() {
    return OpenFileForReading(output_file_name_,
                              output_file_type_,
                              &output_file_,
                              &compare_buffer_);
  }

  // Reads as much input data as possible from the input file
  // into input_buffer_.  If successful, returns true and sets *bytes_read
  // to the number of bytes read into input_buffer_.  If an error occurs,
  // writes an error log message and returns false.
  bool ReadInput(size_t* bytes_read);

  // Writes the contents of output to output_file_.  If successful, returns
  // true.  If an error occurs, writes an error log message and returns false.
  bool WriteOutput(const string& output);

  // Reads a number of bytes from output_file_ equal to the size of output,
  // and compares to make sure they match the contents of output.  If the bytes
  // do not match, or if end of file is reached before the expected number of
  // bytes have been read, or a read error occurs, the function returns false;
  // otherwise, returns true.
  bool CompareOutput(const string& output);

  // Dictionary contents.  The entire dictionary file will be read into memory.
  std::vector<char> dictionary_;

  // Hashed form of dictionary_, created and initialized by Encode().  It
  // refers to the bytes held in dictionary_ and is not used when decoding.
  std::auto_ptr<open_vcdiff::HashedDictionary> hashed_dictionary_;

  // These should be set to either "delta" or "target".  They are only
  // used in log messages such as "Error opening delta file..."
  const char* input_file_type_;
  const char* output_file_type_;

  // The filenames used for input and output.  Will be empty if stdin
  // or stdout is being used.
  string input_file_name_;
  string output_file_name_;

  // stdio-style file handles for the input and output files.
  // When encoding, input_file_ is the target file and output_file_ is the delta
  // file; when decoding, the reverse is true.  The dictionary is always read
  // from a file rather than from standard input, using a handle local to
  // OpenDictionary().
  FILE* input_file_;
  FILE* output_file_;

  // A memory buffer used to load the input file into memory.  If the input
  // comes from stdin because no input file was specified, then the size of
  // input_buffer_ will be the value specified by the --buffersize option.
  // If the input comes from a file, then the buffer will be allocated to match
  // the file size, if possible.  However, the buffer will not exceed
  // --buffersize bytes in length.
  std::vector<char> input_buffer_;

  // A memory buffer used to load the output file into memory for comparison
  // if "vcdiff test" is specified.  CompareOutput() grows it as needed.
  std::vector<char> compare_buffer_;

  // Making these private avoids implicit copy constructor & assignment operator
  VCDiffFileBasedCoder(const VCDiffFileBasedCoder&);  // NOLINT
  void operator=(const VCDiffFileBasedCoder&);
};
192
193inline VCDiffFileBasedCoder::VCDiffFileBasedCoder()
194    : input_file_type_(""),
195      output_file_type_(""),
196      input_file_(NULL),
197      output_file_(NULL) { }
198
199VCDiffFileBasedCoder::~VCDiffFileBasedCoder() {
200  if (input_file_ && (input_file_ != stdin)) {
201    fclose(input_file_);
202    input_file_ = NULL;
203  }
204  if (output_file_ && (output_file_ != stdout)) {
205    fclose(output_file_);
206    output_file_ = NULL;
207  }
208}
209
210bool VCDiffFileBasedCoder::FileSize(FILE* file, size_t* file_size) {
211  long initial_position = ftell(file);
212  if (fseek(file, 0, SEEK_END) != 0) {
213    return false;
214  }
215  *file_size = static_cast<size_t>(ftell(file));
216  if (fseek(file, initial_position, SEEK_SET) != 0) {
217    return false;
218  }
219  return true;
220}
221
222bool VCDiffFileBasedCoder::OpenDictionary() {
223  assert(dictionary_.empty());
224  assert(!FLAGS_dictionary.empty());
225  FILE* dictionary_file = fopen(FLAGS_dictionary.c_str(), "rb");
226  if (!dictionary_file) {
227    std::cerr << "Error opening dictionary file '" << FLAGS_dictionary
228              << "': " << strerror(errno) << std::endl;
229    return false;
230  }
231  size_t dictionary_size = 0U;
232  if (!FileSize(dictionary_file, &dictionary_size)) {
233    std::cerr << "Error finding size of dictionary file '" << FLAGS_dictionary
234              << "': " << strerror(errno) << std::endl;
235    return false;
236  }
237  dictionary_.resize(dictionary_size);
238  if (dictionary_size > 0) {
239    if (fread(&dictionary_[0], 1, dictionary_size, dictionary_file)
240            != dictionary_size) {
241      std::cerr << "Unable to read dictionary file '" << FLAGS_dictionary
242                << "': " << strerror(errno) << std::endl;
243      fclose(dictionary_file);
244      dictionary_.clear();
245      return false;
246    }
247  }
248  fclose(dictionary_file);
249  return true;
250}
251
252bool VCDiffFileBasedCoder::OpenFileForReading(const string& file_name,
253                                              const char* file_type,
254                                              FILE** file,
255                                              std::vector<char>* buffer) {
256  assert(buffer->empty());
257  size_t buffer_size = 0U;
258  if (!*file && file_name.empty()) {
259#ifdef WIN32
260    _setmode(_fileno(stdin), _O_BINARY);
261#endif
262    *file = stdin;
263    buffer_size = static_cast<size_t>(FLAGS_buffersize);
264  } else {
265    if (!*file) {
266      *file = fopen(file_name.c_str(), "rb");
267      if (!*file) {
268        std::cerr << "Error opening " << file_type << " file '"
269                  << file_name << "': " << strerror(errno) << std::endl;
270        return false;
271      }
272    }
273    size_t file_size = 0U;
274    if (!FileSize(*file, &file_size)) {
275      std::cerr << "Error finding size of " << file_type << " file '"
276                << file_name << "': " << strerror(errno) << std::endl;
277      return false;
278    }
279    buffer_size = static_cast<size_t>(FLAGS_buffersize);
280    if (file_size < buffer_size) {
281      // Allocate just enough memory to store the entire file
282      buffer_size = file_size;
283    }
284  }
285  buffer->resize(buffer_size);
286  return true;
287}
288
289// Opens the output file for streamed read operations using the
290// standard C I/O library, i.e., fopen(), fwrite(), fclose().
291// No output buffer is allocated because the encoded/decoded output
292// is constructed progressively using a std::string object
293// whose buffer is resized as needed.
294bool VCDiffFileBasedCoder::OpenOutputFile() {
295  if (output_file_name_.empty()) {
296#ifdef WIN32
297    _setmode(_fileno(stdout), _O_BINARY);
298#endif
299    output_file_ = stdout;
300  } else {
301    output_file_ = fopen(output_file_name_.c_str(), "wb");
302    if (!output_file_) {
303      std::cerr << "Error opening " << output_file_type_ << " file '"
304                << output_file_name_
305                << "': " << strerror(errno) << std::endl;
306      return false;
307    }
308  }
309  return true;
310}
311
312bool VCDiffFileBasedCoder::ReadInput(size_t* bytes_read) {
313  // Read from file or stdin
314  *bytes_read = fread(&input_buffer_[0], 1, input_buffer_.size(), input_file_);
315  if (ferror(input_file_)) {
316    std::cerr << "Error reading from " << input_file_type_ << " file '"
317              << input_file_name_
318              << "': " << strerror(errno) << std::endl;
319    return false;
320  }
321  return true;
322}
323
324bool VCDiffFileBasedCoder::WriteOutput(const string& output) {
325  if (!output.empty()) {
326    // Some new output has been generated and is ready to be written
327    // to the output file or to stdout.
328    fwrite(output.data(), 1, output.size(), output_file_);
329    if (ferror(output_file_)) {
330      std::cerr << "Error writing " << output.size() << " bytes to "
331                << output_file_type_ << " file '" << output_file_name_
332                << "': " << strerror(errno) << std::endl;
333      return false;
334    }
335  }
336  return true;
337}
338
339bool VCDiffFileBasedCoder::CompareOutput(const string& output) {
340  if (!output.empty()) {
341    size_t output_size = output.size();
342    // Some new output has been generated and is ready to be compared against
343    // the output file.
344    if (output_size > compare_buffer_.size()) {
345      compare_buffer_.resize(output_size);
346    }
347    size_t bytes_read = fread(&compare_buffer_[0],
348                              1,
349                              output_size,
350                              output_file_);
351    if (ferror(output_file_)) {
352      std::cerr << "Error reading from " << output_file_type_ << " file '"
353                << output_file_name_ << "': " << strerror(errno) << std::endl;
354      return false;
355    }
356    if (bytes_read < output_size) {
357      std::cerr << "Decoded target is longer than original target file"
358                << std::endl;
359      return false;
360    }
361    if (output.compare(0, output_size, &compare_buffer_[0], bytes_read) != 0) {
362      std::cerr << "Original target file does not match decoded target"
363                << std::endl;
364      return false;
365    }
366  }
367  return true;
368}
369
370bool VCDiffFileBasedCoder::Encode() {
371  input_file_type_ = "target";
372  input_file_name_ = FLAGS_target;
373  output_file_type_ = "delta";
374  output_file_name_ = FLAGS_delta;
375  if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) {
376    return false;
377  }
378  // Issue 6: Visual Studio STL produces a runtime exception
379  // if &dictionary_[0] is attempted for an empty dictionary.
380  if (dictionary_.empty()) {
381    hashed_dictionary_.reset(new open_vcdiff::HashedDictionary("", 0));
382  } else {
383    hashed_dictionary_.reset(
384        new open_vcdiff::HashedDictionary(&dictionary_[0],
385                                          dictionary_.size()));
386  }
387  if (!hashed_dictionary_->Init()) {
388    std::cerr << "Error initializing hashed dictionary" << std::endl;
389    return false;
390  }
391  VCDiffFormatExtensionFlags format_flags = open_vcdiff::VCD_STANDARD_FORMAT;
392  if (FLAGS_interleaved) {
393    format_flags |= open_vcdiff::VCD_FORMAT_INTERLEAVED;
394  }
395  if (FLAGS_checksum) {
396    format_flags |= open_vcdiff::VCD_FORMAT_CHECKSUM;
397  }
398  if (FLAGS_json) {
399    format_flags |= open_vcdiff::VCD_FORMAT_JSON;
400  }
401  open_vcdiff::VCDiffStreamingEncoder encoder(hashed_dictionary_.get(),
402                                              format_flags,
403                                              FLAGS_target_matches);
404  string output;
405  size_t input_size = 0;
406  size_t output_size = 0;
407  {
408    if (!encoder.StartEncoding(&output)) {
409      std::cerr << "Error during encoder initialization" << std::endl;
410      return false;
411    }
412  }
413  do {
414    size_t bytes_read = 0;
415    if (!WriteOutput(output) || !ReadInput(&bytes_read)) {
416      return false;
417    }
418    output_size += output.size();
419    output.clear();
420    if (bytes_read > 0) {
421      input_size += bytes_read;
422      if (!encoder.EncodeChunk(&input_buffer_[0], bytes_read, &output)) {
423        std::cerr << "Error trying to encode data chunk of length "
424                  << bytes_read << std::endl;
425        return false;
426      }
427    }
428  } while (!feof(input_file_));
429  encoder.FinishEncoding(&output);
430  if (!WriteOutput(output)) {
431    return false;
432  }
433  output_size += output.size();
434  output.clear();
435  if (FLAGS_stats && (input_size > 0)) {
436    std::cerr << "Original size: " << input_size
437              << "\tCompressed size: " << output_size << " ("
438              << ((static_cast<double>(output_size) / input_size) * 100)
439              << "% of original)" << std::endl;
440  }
441  return true;
442}
443
444bool VCDiffFileBasedCoder::Decode() {
445  input_file_type_ = "delta";
446  input_file_name_ = FLAGS_delta;
447  output_file_type_ = "target";
448  output_file_name_ = FLAGS_target;
449  if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) {
450    return false;
451  }
452
453  open_vcdiff::VCDiffStreamingDecoder decoder;
454  decoder.SetMaximumTargetFileSize(
455      static_cast<size_t>(FLAGS_max_target_file_size));
456  decoder.SetMaximumTargetWindowSize(
457      static_cast<size_t>(FLAGS_max_target_window_size));
458  decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target);
459  string output;
460  size_t input_size = 0;
461  size_t output_size = 0;
462  // Issue 6: Visual Studio STL produces a runtime exception
463  // if &dictionary_[0] is attempted for an empty dictionary.
464  if (dictionary_.empty()) {
465    decoder.StartDecoding("", 0);
466  } else {
467    decoder.StartDecoding(&dictionary_[0], dictionary_.size());
468  }
469
470  do {
471    size_t bytes_read = 0;
472    if (!ReadInput(&bytes_read)) {
473      return false;
474    }
475    if (bytes_read > 0) {
476      input_size += bytes_read;
477      if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) {
478        std::cerr << "Error trying to decode data chunk of length "
479                  << bytes_read << std::endl;
480        return false;
481      }
482    }
483    if (!WriteOutput(output)) {
484      return false;
485    }
486    output_size += output.size();
487    output.clear();
488  } while (!feof(input_file_));
489  if (!decoder.FinishDecoding()) {
490    std::cerr << "Decode error; '" << FLAGS_delta
491              << " may not be a valid VCDIFF delta file" << std::endl;
492    return false;
493  }
494  if (!WriteOutput(output)) {
495    return false;
496  }
497  output_size += output.size();
498  output.clear();
499  if (FLAGS_stats && (output_size > 0)) {
500    std::cerr << "Decompressed size: " << output_size
501              << "\tCompressed size: " << input_size << " ("
502              << ((static_cast<double>(input_size) / output_size) * 100)
503              << "% of original)" << std::endl;
504  }
505  return true;
506}
507
508bool VCDiffFileBasedCoder::DecodeAndCompare() {
509  input_file_type_ = "delta";
510  input_file_name_ = FLAGS_delta;
511  output_file_type_ = "target";
512  output_file_name_ = FLAGS_target;
513  if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFileForCompare()) {
514    return false;
515  }
516
517  open_vcdiff::VCDiffStreamingDecoder decoder;
518  decoder.SetMaximumTargetFileSize(
519      static_cast<size_t>(FLAGS_max_target_file_size));
520  decoder.SetMaximumTargetWindowSize(
521      static_cast<size_t>(FLAGS_max_target_window_size));
522  decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target);
523  string output;
524  size_t input_size = 0;
525  size_t output_size = 0;
526  // Issue 6: Visual Studio STL produces a runtime exception
527  // if &dictionary_[0] is attempted for an empty dictionary.
528  if (dictionary_.empty()) {
529    decoder.StartDecoding("", 0);
530  } else {
531    decoder.StartDecoding(&dictionary_[0], dictionary_.size());
532  }
533
534  do {
535    size_t bytes_read = 0;
536    if (!ReadInput(&bytes_read)) {
537      return false;
538    }
539    if (bytes_read > 0) {
540      input_size += bytes_read;
541      if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) {
542        std::cerr << "Error trying to decode data chunk of length "
543                  << bytes_read << std::endl;
544        return false;
545      }
546    }
547    if (!CompareOutput(output)) {
548      return false;
549    }
550    output_size += output.size();
551    output.clear();
552  } while (!feof(input_file_));
553  if (!decoder.FinishDecoding()) {
554    std::cerr << "Decode error; '" << FLAGS_delta
555              << " may not be a valid VCDIFF delta file" << std::endl;
556    return false;
557  }
558  if (!CompareOutput(output)) {
559    return false;
560  }
561  output_size += output.size();
562  output.clear();
563  if (fgetc(output_file_) != EOF) {
564    std::cerr << "Decoded target is shorter than original target file"
565              << std::endl;
566    return false;
567  }
568  if (ferror(output_file_)) {
569    std::cerr << "Error reading end-of-file indicator from target file"
570              << std::endl;
571    return false;
572  }
573  if (FLAGS_stats && (output_size > 0)) {
574    std::cerr << "Decompressed size: " << output_size
575              << "\tCompressed size: " << input_size << " ("
576              << ((static_cast<double>(input_size) / output_size) * 100)
577              << "% of original)" << std::endl;
578  }
579  return true;
580}
581
582}  // namespace open_vcdiff
583
584int main(int argc, char** argv) {
585  const char* const command_name = argv[0];
586  google::SetUsageMessage(kUsageString);
587  google::ParseCommandLineFlags(&argc, &argv, true);
588  if (argc != 2) {
589    std::cerr << command_name << ": Must specify exactly one command option"
590              << std::endl;
591    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
592    return 1;
593  }
594  const char* const command_option = argv[1];
595  if (FLAGS_dictionary.empty()) {
596    std::cerr << command_name << " " << command_option
597              << ": Must specify --dictionary <file-name>" << std::endl;
598    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
599    return 1;
600  }
601  if (!GetCommandLineFlagInfoOrDie("buffersize").is_default &&
602       (FLAGS_buffersize == 0)) {
603    std::cerr << command_name << ": Option --buffersize cannot be 0"
604              << std::endl;
605    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
606    return 1;
607  }
608  if ((strcmp(command_option, "encode") == 0) ||
609      (strcmp(command_option, "delta") == 0)) {
610    open_vcdiff::VCDiffFileBasedCoder coder;
611    if (!coder.Encode()) {
612      return 1;
613    }
614    // The destructor for VCDiffFileBasedCoder will clean up the open files
615    // and allocated memory.
616  } else if ((strcmp(command_option, "decode") == 0) ||
617             (strcmp(command_option, "patch") == 0)) {
618    open_vcdiff::VCDiffFileBasedCoder coder;
619    if (!coder.Decode()) {
620      return 1;
621    }
622  } else if ((strcmp(command_option, "test") == 0)) {
623    // "vcdiff test" does not appear in the usage string, but can be
624    // used for debugging.  It encodes, then decodes, then compares the result
625    // with the original target. It expects the same arguments as
626    // "vcdiff encode", with the additional requirement that the --target
627    // and --delta file arguments must be specified, rather than using stdin
628    // or stdout.  It produces a delta file just as for "vcdiff encode".
629    if (FLAGS_target.empty() || FLAGS_delta.empty()) {
630      std::cerr << command_name
631                << " test: Must specify both --target <file-name>"
632                   " and --delta <file-name>" << std::endl;
633      return 1;
634    }
635    const string original_target(FLAGS_target);
636    // Put coder into a separate scope.
637    {
638      open_vcdiff::VCDiffFileBasedCoder coder;
639      if (!coder.Encode()) {
640        return 1;
641      }
642    }
643    {
644      open_vcdiff::VCDiffFileBasedCoder coder;
645      if (!coder.DecodeAndCompare()) {
646        return 1;
647      }
648    }
649  } else {
650    std::cerr << command_name << ": Unrecognized command option "
651              << command_option << std::endl;
652    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
653    return 1;
654  }
655  return 0;
656}
657