1// Copyright 2008 Google Inc.
2// Author: Lincoln Smith
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//      http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15//
16// A command-line interface to the open-vcdiff library.
17
18#include <config.h>
19#include <assert.h>
20#include <errno.h>
21#ifdef WIN32
22#include <fcntl.h>
23#include <io.h>
24#endif  // WIN32
25#include <stdio.h>
26#include <string.h>  // strerror
27#include <iostream>
28#include <memory>
29#include <string>
30#include <vector>
31#include "gflags/gflags.h"
32#include "google/vcdecoder.h"
33#include "google/vcencoder.h"
34
35#ifndef HAS_GLOBAL_STRING
36using std::string;
37#endif  // !HAS_GLOBAL_STRING
38using google::GetCommandLineFlagInfoOrDie;
39using google::ShowUsageWithFlagsRestrict;
40
// Default value (64 MB, i.e. 2^26 bytes) for the --max_target_file_size and
// --max_target_window_size flags.
static const size_t kDefaultMaxTargetSize = 64 * 1024 * 1024;
42
43// Definitions of command-line flags
44DEFINE_string(dictionary, "",
45              "File containing dictionary data (required)");
46DEFINE_string(target, "",
47              "Target file (default is stdin for encode, stdout for decode");
48DEFINE_string(delta, "",
49              "Encoded delta file (default is stdout for encode, "
50              "stdin for decode");
51// --buffersize is the maximum allowable size of a target window.
52// This value may be increased if there is sufficient memory available.
53DEFINE_uint64(buffersize, 1 << 20,  // 1 MB
54              "Buffer size for reading input file");
55DEFINE_bool(allow_vcd_target, true,
56            "If false, the decoder issues an error when the VCD_TARGET flag "
57            "is encountered");
58DEFINE_bool(checksum, false,
59            "Include an Adler32 checksum of the target data when encoding");
60DEFINE_bool(interleaved, false, "Use interleaved format");
61DEFINE_bool(stats, false, "Report compression percentage");
62DEFINE_bool(target_matches, false, "Find duplicate strings in target data"
63                                   " as well as dictionary data");
64DEFINE_uint64(max_target_file_size, kDefaultMaxTargetSize,
65              "Maximum target file size allowed by decoder");
66DEFINE_uint64(max_target_window_size, kDefaultMaxTargetSize,
67              "Maximum target window size allowed by decoder");
68
// Usage text registered with gflags in main().  The first line lists the
// accepted command words; the remaining lines explain what each pair does.
static const char* const kUsageString =
    " {encode | delta | decode | patch }"
    "[ <options> ]\n"
    "encode or delta: "
    "create delta file from dictionary and target file\n"
    "decode or patch: "
    "reconstruct target file from dictionary and delta file";
73
74namespace open_vcdiff {
75
76class VCDiffFileBasedCoder {
77 public:
78  VCDiffFileBasedCoder();
79  ~VCDiffFileBasedCoder();
80
81  // Once the command-line arguments have been parsed, these functions
82  // will use the supplied options to carry out a file-based encode
83  // or decode operation.
84  bool Encode();
85  bool Decode();
86  bool DecodeAndCompare();  // for "vcdiff test"; compare target with original
87
88 private:
89  // Determines the size of the file.  The given file must be an input file
90  // opened for reading only, not an input stream such as stdin.  The function
91  // returns true and populates file_size if successful; otherwise, it returns
92  // false.
93  static bool FileSize(FILE* file, size_t* file_size);
94
95  // Opens a file for incremental reading.  file_name is the name of the file
96  // to be opened.  file_type should be a descriptive name (like "target") for
97  // use in log messages.  If successful, returns true and sets *file to a
98  // valid input file, *buffer to a region of memory allocated using malloc()
99  // (so the caller must release it using free()), and buffer_size to the size
100  // of the buffer, which will not be larger than the size of the file, and
101  // will not be smaller than the --buffersize option.  If the function fails,
102  // it outputs a log message and returns false.
103  bool OpenFileForReading(const string& file_name,
104                          const char* file_type,
105                          FILE** file,
106                          std::vector<char>* buffer);
107
108  // Opens the dictionary file and reads it into a newly allocated buffer.
109  // If successful, returns true and populates dictionary_ with the dictionary
110  // contents; otherwise, returns false.
111  bool OpenDictionary();
112
113  // Opens the input file (the delta or target file) for reading.
114  // Allocates space for the input buffer.  If successful,
115  // input_file_ will be valid and input_buffer_ will be allocated.
116  bool OpenInputFile() {
117    return OpenFileForReading(input_file_name_,
118                              input_file_type_,
119                              &input_file_,
120                              &input_buffer_);
121  }
122
123  // Opens the output file (the target or delta file) for writing.
124  // If successful, output_file_ will be valid.
125  bool OpenOutputFile();
126
127  // Opens the output file (the target file) for comparison against the decoded
128  // output when using "vcdiff test".
129  bool OpenOutputFileForCompare() {
130    return OpenFileForReading(output_file_name_,
131                              output_file_type_,
132                              &output_file_,
133                              &compare_buffer_);
134  }
135
136  // Reads as much input data as possible from the input file
137  // into input_buffer_.  If successful, returns true and sets *bytes_read
138  // to the number of bytes read into input_buffer_.  If an error occurs,
139  // writes an error log message and returns false.
140  bool ReadInput(size_t* bytes_read);
141
142  // Writes the contents of output to output_file_.  If successful, returns
143  // true.  If an error occurs, writes an error log message and returns false.
144  bool WriteOutput(const string& output);
145
146  // Reads a number of bytes from output_file_ equal to the size of output,
147  // and compares to make sure they match the contents of output.  If the bytes
148  // do not match, or if end of file is reached before the expected number of
149  // bytes have been read, or a read error occurs, the function returns false;
150  // otherwise, returns true.
151  bool CompareOutput(const string& output);
152
153  // Dictionary contents.  The entire dictionary file will be read into memory.
154  std::vector<char> dictionary_;
155
156  std::auto_ptr<open_vcdiff::HashedDictionary> hashed_dictionary_;
157
158  // These should be set to either "delta" or "target".  They are only
159  // used in log messages such as "Error opening delta file..."
160  const char* input_file_type_;
161  const char* output_file_type_;
162
163  // The filenames used for input and output.  Will be empty if stdin
164  // or stdout is being used.
165  string input_file_name_;
166  string output_file_name_;
167
168  // stdio-style file handles for the input and output files and the dictionary.
169  // When encoding, input_file_ is the target file and output_file_ is the delta
170  // file; when decoding, the reverse is true.  The dictionary is always read
171  // from a file rather than from standard input.
172  FILE* input_file_;
173  FILE* output_file_;
174
175  // A memory buffer used to load the input file into memory.  If the input
176  // comes from stdin because no input file was specified, then the size of
177  // input_buffer_ will be the value specified by the --buffersize option.
178  // If the input comes from a file, then the buffer will be allocated to match
179  // the file size, if possible.  However, the buffer will not exceed
180  // --buffersize bytes in length.
181  std::vector<char> input_buffer_;
182
183  // A memory buffer used to load the output file into memory for comparison
184  // if "vcdiff test" is specified.
185  std::vector<char> compare_buffer_;
186
187  // Making these private avoids implicit copy constructor & assignment operator
188  VCDiffFileBasedCoder(const VCDiffFileBasedCoder&);  // NOLINT
189  void operator=(const VCDiffFileBasedCoder&);
190};
191
192inline VCDiffFileBasedCoder::VCDiffFileBasedCoder()
193    : input_file_type_(""),
194      output_file_type_(""),
195      input_file_(NULL),
196      output_file_(NULL) { }
197
198VCDiffFileBasedCoder::~VCDiffFileBasedCoder() {
199  if (input_file_ && (input_file_ != stdin)) {
200    fclose(input_file_);
201    input_file_ = NULL;
202  }
203  if (output_file_ && (output_file_ != stdout)) {
204    fclose(output_file_);
205    output_file_ = NULL;
206  }
207}
208
209bool VCDiffFileBasedCoder::FileSize(FILE* file, size_t* file_size) {
210  long initial_position = ftell(file);
211  if (fseek(file, 0, SEEK_END) != 0) {
212    return false;
213  }
214  *file_size = static_cast<size_t>(ftell(file));
215  if (fseek(file, initial_position, SEEK_SET) != 0) {
216    return false;
217  }
218  return true;
219}
220
221bool VCDiffFileBasedCoder::OpenDictionary() {
222  assert(dictionary_.empty());
223  assert(!FLAGS_dictionary.empty());
224  FILE* dictionary_file = fopen(FLAGS_dictionary.c_str(), "rb");
225  if (!dictionary_file) {
226    std::cerr << "Error opening dictionary file '" << FLAGS_dictionary
227              << "': " << strerror(errno) << std::endl;
228    return false;
229  }
230  size_t dictionary_size = 0U;
231  if (!FileSize(dictionary_file, &dictionary_size)) {
232    std::cerr << "Error finding size of dictionary file '" << FLAGS_dictionary
233              << "': " << strerror(errno) << std::endl;
234    return false;
235  }
236  dictionary_.resize(dictionary_size);
237  if (dictionary_size > 0) {
238    if (fread(&dictionary_[0], 1, dictionary_size, dictionary_file)
239            != dictionary_size) {
240      std::cerr << "Unable to read dictionary file '" << FLAGS_dictionary
241                << "': " << strerror(errno) << std::endl;
242      fclose(dictionary_file);
243      dictionary_.clear();
244      return false;
245    }
246  }
247  fclose(dictionary_file);
248  return true;
249}
250
251bool VCDiffFileBasedCoder::OpenFileForReading(const string& file_name,
252                                              const char* file_type,
253                                              FILE** file,
254                                              std::vector<char>* buffer) {
255  assert(buffer->empty());
256  size_t buffer_size = 0U;
257  if (!*file && file_name.empty()) {
258#ifdef WIN32
259    _setmode(_fileno(stdin), _O_BINARY);
260#endif
261    *file = stdin;
262    buffer_size = static_cast<size_t>(FLAGS_buffersize);
263  } else {
264    if (!*file) {
265      *file = fopen(file_name.c_str(), "rb");
266      if (!*file) {
267        std::cerr << "Error opening " << file_type << " file '"
268                  << file_name << "': " << strerror(errno) << std::endl;
269        return false;
270      }
271    }
272    size_t file_size = 0U;
273    if (!FileSize(*file, &file_size)) {
274      std::cerr << "Error finding size of " << file_type << " file '"
275                << file_name << "': " << strerror(errno) << std::endl;
276      return false;
277    }
278    buffer_size = static_cast<size_t>(FLAGS_buffersize);
279    if (file_size < buffer_size) {
280      // Allocate just enough memory to store the entire file
281      buffer_size = file_size;
282    }
283  }
284  buffer->resize(buffer_size);
285  return true;
286}
287
288// Opens the output file for streamed read operations using the
289// standard C I/O library, i.e., fopen(), fwrite(), fclose().
290// No output buffer is allocated because the encoded/decoded output
291// is constructed progressively using a std::string object
292// whose buffer is resized as needed.
293bool VCDiffFileBasedCoder::OpenOutputFile() {
294  if (output_file_name_.empty()) {
295#ifdef WIN32
296    _setmode(_fileno(stdout), _O_BINARY);
297#endif
298    output_file_ = stdout;
299  } else {
300    output_file_ = fopen(output_file_name_.c_str(), "wb");
301    if (!output_file_) {
302      std::cerr << "Error opening " << output_file_type_ << " file '"
303                << output_file_name_
304                << "': " << strerror(errno) << std::endl;
305      return false;
306    }
307  }
308  return true;
309}
310
311bool VCDiffFileBasedCoder::ReadInput(size_t* bytes_read) {
312  // Read from file or stdin
313  *bytes_read = fread(&input_buffer_[0], 1, input_buffer_.size(), input_file_);
314  if (ferror(input_file_)) {
315    std::cerr << "Error reading from " << input_file_type_ << " file '"
316              << input_file_name_
317              << "': " << strerror(errno) << std::endl;
318    return false;
319  }
320  return true;
321}
322
323bool VCDiffFileBasedCoder::WriteOutput(const string& output) {
324  if (!output.empty()) {
325    // Some new output has been generated and is ready to be written
326    // to the output file or to stdout.
327    fwrite(output.data(), 1, output.size(), output_file_);
328    if (ferror(output_file_)) {
329      std::cerr << "Error writing " << output.size() << " bytes to "
330                << output_file_type_ << " file '" << output_file_name_
331                << "': " << strerror(errno) << std::endl;
332      return false;
333    }
334  }
335  return true;
336}
337
338bool VCDiffFileBasedCoder::CompareOutput(const string& output) {
339  if (!output.empty()) {
340    size_t output_size = output.size();
341    // Some new output has been generated and is ready to be compared against
342    // the output file.
343    if (output_size > compare_buffer_.size()) {
344      compare_buffer_.resize(output_size);
345    }
346    size_t bytes_read = fread(&compare_buffer_[0],
347                              1,
348                              output_size,
349                              output_file_);
350    if (ferror(output_file_)) {
351      std::cerr << "Error reading from " << output_file_type_ << " file '"
352                << output_file_name_ << "': " << strerror(errno) << std::endl;
353      return false;
354    }
355    if (bytes_read < output_size) {
356      std::cerr << "Decoded target is longer than original target file"
357                << std::endl;
358      return false;
359    }
360    if (output.compare(0, output_size, &compare_buffer_[0], bytes_read) != 0) {
361      std::cerr << "Original target file does not match decoded target"
362                << std::endl;
363      return false;
364    }
365  }
366  return true;
367}
368
369bool VCDiffFileBasedCoder::Encode() {
370  input_file_type_ = "target";
371  input_file_name_ = FLAGS_target;
372  output_file_type_ = "delta";
373  output_file_name_ = FLAGS_delta;
374  if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) {
375    return false;
376  }
377  // Issue 6: Visual Studio STL produces a runtime exception
378  // if &dictionary_[0] is attempted for an empty dictionary.
379  if (dictionary_.empty()) {
380    hashed_dictionary_.reset(new open_vcdiff::HashedDictionary("", 0));
381  } else {
382    hashed_dictionary_.reset(
383        new open_vcdiff::HashedDictionary(&dictionary_[0],
384                                          dictionary_.size()));
385  }
386  if (!hashed_dictionary_->Init()) {
387    std::cerr << "Error initializing hashed dictionary" << std::endl;
388    return false;
389  }
390  VCDiffFormatExtensionFlags format_flags = open_vcdiff::VCD_STANDARD_FORMAT;
391  if (FLAGS_interleaved) {
392    format_flags |= open_vcdiff::VCD_FORMAT_INTERLEAVED;
393  }
394  if (FLAGS_checksum) {
395    format_flags |= open_vcdiff::VCD_FORMAT_CHECKSUM;
396  }
397  open_vcdiff::VCDiffStreamingEncoder encoder(hashed_dictionary_.get(),
398                                              format_flags,
399                                              FLAGS_target_matches);
400  string output;
401  size_t input_size = 0;
402  size_t output_size = 0;
403  {
404    if (!encoder.StartEncoding(&output)) {
405      std::cerr << "Error during encoder initialization" << std::endl;
406      return false;
407    }
408  }
409  do {
410    size_t bytes_read = 0;
411    if (!WriteOutput(output) || !ReadInput(&bytes_read)) {
412      return false;
413    }
414    output_size += output.size();
415    output.clear();
416    if (bytes_read > 0) {
417      input_size += bytes_read;
418      if (!encoder.EncodeChunk(&input_buffer_[0], bytes_read, &output)) {
419        std::cerr << "Error trying to encode data chunk of length "
420                  << bytes_read << std::endl;
421        return false;
422      }
423    }
424  } while (!feof(input_file_));
425  encoder.FinishEncoding(&output);
426  if (!WriteOutput(output)) {
427    return false;
428  }
429  output_size += output.size();
430  output.clear();
431  if (FLAGS_stats && (input_size > 0)) {
432    std::cerr << "Original size: " << input_size
433              << "\tCompressed size: " << output_size << " ("
434              << ((static_cast<double>(output_size) / input_size) * 100)
435              << "% of original)" << std::endl;
436  }
437  return true;
438}
439
440bool VCDiffFileBasedCoder::Decode() {
441  input_file_type_ = "delta";
442  input_file_name_ = FLAGS_delta;
443  output_file_type_ = "target";
444  output_file_name_ = FLAGS_target;
445  if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) {
446    return false;
447  }
448
449  open_vcdiff::VCDiffStreamingDecoder decoder;
450  decoder.SetMaximumTargetFileSize(
451      static_cast<size_t>(FLAGS_max_target_file_size));
452  decoder.SetMaximumTargetWindowSize(
453      static_cast<size_t>(FLAGS_max_target_window_size));
454  decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target);
455  string output;
456  size_t input_size = 0;
457  size_t output_size = 0;
458  // Issue 6: Visual Studio STL produces a runtime exception
459  // if &dictionary_[0] is attempted for an empty dictionary.
460  if (dictionary_.empty()) {
461    decoder.StartDecoding("", 0);
462  } else {
463    decoder.StartDecoding(&dictionary_[0], dictionary_.size());
464  }
465
466  do {
467    size_t bytes_read = 0;
468    if (!ReadInput(&bytes_read)) {
469      return false;
470    }
471    if (bytes_read > 0) {
472      input_size += bytes_read;
473      if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) {
474        std::cerr << "Error trying to decode data chunk of length "
475                  << bytes_read << std::endl;
476        return false;
477      }
478    }
479    if (!WriteOutput(output)) {
480      return false;
481    }
482    output_size += output.size();
483    output.clear();
484  } while (!feof(input_file_));
485  if (!decoder.FinishDecoding()) {
486    std::cerr << "Decode error; '" << FLAGS_delta
487              << " may not be a valid VCDIFF delta file" << std::endl;
488    return false;
489  }
490  if (!WriteOutput(output)) {
491    return false;
492  }
493  output_size += output.size();
494  output.clear();
495  if (FLAGS_stats && (output_size > 0)) {
496    std::cerr << "Decompressed size: " << output_size
497              << "\tCompressed size: " << input_size << " ("
498              << ((static_cast<double>(input_size) / output_size) * 100)
499              << "% of original)" << std::endl;
500  }
501  return true;
502}
503
504bool VCDiffFileBasedCoder::DecodeAndCompare() {
505  input_file_type_ = "delta";
506  input_file_name_ = FLAGS_delta;
507  output_file_type_ = "target";
508  output_file_name_ = FLAGS_target;
509  if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFileForCompare()) {
510    return false;
511  }
512
513  open_vcdiff::VCDiffStreamingDecoder decoder;
514  decoder.SetMaximumTargetFileSize(
515      static_cast<size_t>(FLAGS_max_target_file_size));
516  decoder.SetMaximumTargetWindowSize(
517      static_cast<size_t>(FLAGS_max_target_window_size));
518  decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target);
519  string output;
520  size_t input_size = 0;
521  size_t output_size = 0;
522  // Issue 6: Visual Studio STL produces a runtime exception
523  // if &dictionary_[0] is attempted for an empty dictionary.
524  if (dictionary_.empty()) {
525    decoder.StartDecoding("", 0);
526  } else {
527    decoder.StartDecoding(&dictionary_[0], dictionary_.size());
528  }
529
530  do {
531    size_t bytes_read = 0;
532    if (!ReadInput(&bytes_read)) {
533      return false;
534    }
535    if (bytes_read > 0) {
536      input_size += bytes_read;
537      if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) {
538        std::cerr << "Error trying to decode data chunk of length "
539                  << bytes_read << std::endl;
540        return false;
541      }
542    }
543    if (!CompareOutput(output)) {
544      return false;
545    }
546    output_size += output.size();
547    output.clear();
548  } while (!feof(input_file_));
549  if (!decoder.FinishDecoding()) {
550    std::cerr << "Decode error; '" << FLAGS_delta
551              << " may not be a valid VCDIFF delta file" << std::endl;
552    return false;
553  }
554  if (!CompareOutput(output)) {
555    return false;
556  }
557  output_size += output.size();
558  output.clear();
559  if (fgetc(output_file_) != EOF) {
560    std::cerr << "Decoded target is shorter than original target file"
561              << std::endl;
562    return false;
563  }
564  if (ferror(output_file_)) {
565    std::cerr << "Error reading end-of-file indicator from target file"
566              << std::endl;
567    return false;
568  }
569  if (FLAGS_stats && (output_size > 0)) {
570    std::cerr << "Decompressed size: " << output_size
571              << "\tCompressed size: " << input_size << " ("
572              << ((static_cast<double>(input_size) / output_size) * 100)
573              << "% of original)" << std::endl;
574  }
575  return true;
576}
577
578}  // namespace open_vcdiff
579
580int main(int argc, char** argv) {
581  const char* const command_name = argv[0];
582  google::SetUsageMessage(kUsageString);
583  google::ParseCommandLineFlags(&argc, &argv, true);
584  if (argc != 2) {
585    std::cerr << command_name << ": Must specify exactly one command option"
586              << std::endl;
587    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
588    return 1;
589  }
590  const char* const command_option = argv[1];
591  if (FLAGS_dictionary.empty()) {
592    std::cerr << command_name << " " << command_option
593              << ": Must specify --dictionary <file-name>" << std::endl;
594    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
595    return 1;
596  }
597  if (!GetCommandLineFlagInfoOrDie("buffersize").is_default &&
598       (FLAGS_buffersize == 0)) {
599    std::cerr << command_name << ": Option --buffersize cannot be 0"
600              << std::endl;
601    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
602    return 1;
603  }
604  if ((strcmp(command_option, "encode") == 0) ||
605      (strcmp(command_option, "delta") == 0)) {
606    open_vcdiff::VCDiffFileBasedCoder coder;
607    if (!coder.Encode()) {
608      return 1;
609    }
610    // The destructor for VCDiffFileBasedCoder will clean up the open files
611    // and allocated memory.
612  } else if ((strcmp(command_option, "decode") == 0) ||
613             (strcmp(command_option, "patch") == 0)) {
614    open_vcdiff::VCDiffFileBasedCoder coder;
615    if (!coder.Decode()) {
616      return 1;
617    }
618  } else if ((strcmp(command_option, "test") == 0)) {
619    // "vcdiff test" does not appear in the usage string, but can be
620    // used for debugging.  It encodes, then decodes, then compares the result
621    // with the original target. It expects the same arguments as
622    // "vcdiff encode", with the additional requirement that the --target
623    // and --delta file arguments must be specified, rather than using stdin
624    // or stdout.  It produces a delta file just as for "vcdiff encode".
625    if (FLAGS_target.empty() || FLAGS_delta.empty()) {
626      std::cerr << command_name
627                << " test: Must specify both --target <file-name>"
628                   " and --delta <file-name>" << std::endl;
629      return 1;
630    }
631    const string original_target(FLAGS_target);
632    // Put coder into a separate scope.
633    {
634      open_vcdiff::VCDiffFileBasedCoder coder;
635      if (!coder.Encode()) {
636        return 1;
637      }
638    }
639    {
640      open_vcdiff::VCDiffFileBasedCoder coder;
641      if (!coder.DecodeAndCompare()) {
642        return 1;
643      }
644    }
645  } else {
646    std::cerr << command_name << ": Unrecognized command option "
647              << command_option << std::endl;
648    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
649    return 1;
650  }
651  return 0;
652}
653