1// Copyright 2007 Google Inc.
2// Author: Lincoln Smith
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//      http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#ifndef OPEN_VCDIFF_VCENCODER_H_
17#define OPEN_VCDIFF_VCENCODER_H_
18
19#include <stddef.h>  // size_t
20#include "google/format_extension_flags.h"
21#include "google/output_string.h"
22
23namespace open_vcdiff {
24
25class VCDiffEngine;
26class VCDiffStreamingEncoderImpl;
27
28// A HashedDictionary must be constructed from the dictionary data
29// in order to use VCDiffStreamingEncoder.  If the same dictionary will
30// be used to perform several encoding operations, then the caller should
31// create the HashedDictionary once and cache it for reuse.  This object
32// is thread-safe: the same const HashedDictionary can be used
33// by several threads simultaneously, each with its own VCDiffStreamingEncoder.
34//
35// dictionary_contents is copied into the HashedDictionary, so the
36// caller may free that string, if desired, after the constructor returns.
37//
38class HashedDictionary {
39 public:
40  HashedDictionary(const char* dictionary_contents,
41                   size_t dictionary_size);
42  ~HashedDictionary();
43
44  // Init() must be called before using the HashedDictionary as an argument
45  // to the VCDiffStreamingEncoder, or for any other purpose except
46  // destruction.  It returns true if initialization succeeded, or false
47  // if an error occurred, in which case the caller should destroy the object
48  // without using it.
49  bool Init();
50
51  const VCDiffEngine* engine() const { return engine_; }
52
53 private:
54  const VCDiffEngine* engine_;
55
56  // Make the copy constructor and assignment operator private
57  // so that they don't inadvertently get used.
58  HashedDictionary(const HashedDictionary&);  // NOLINT
59  void operator=(const HashedDictionary&);
60};
61
62// The standard streaming interface to the VCDIFF (RFC 3284) encoder.
63// "Streaming" in this context means that, even though the entire set of
64// input data to be encoded may not be available at once, the encoder
65// can produce partial output based on what is available.  Of course,
66// the caller should try to maximize the sizes of the data chunks passed
67// to the encoder.
68class VCDiffStreamingEncoder {
69 public:
70  // The HashedDictionary object passed to the constructor must remain valid,
71  // without being deleted, for the lifetime of the VCDiffStreamingEncoder
72  // object.
73  //
74  // format_extensions allows certain open-vcdiff extensions to the VCDIFF
75  // format to be included in the encoded output.  These extensions are not
76  // part of the RFC 3284 draft standard, so specifying any extension flags
77  // will make the output compatible only with open-vcdiff, or with other
78  // VCDIFF implementations that accept these extensions.  See above for an
79  // explanation of each possible flag value.
80  //
81  // *** look_for_target_matches:
82  // The VCDIFF format allows COPY instruction addresses to reference data from
83  // the source (dictionary), or from previously encoded target data.
84  //
85  // If look_for_target_matches is false, then the encoder will only
86  // produce COPY instructions that reference source data from the dictionary,
87  // never from previously encoded target data.  This will speed up the encoding
88  // process, but the encoded data will not be as compact.
89  //
90  // If this value is true, then the encoder will produce COPY instructions
91  // that reference either source data or target data.  A COPY instruction from
92  // the previously encoded target data may even extend into the range of the
93  // data being produced by that same COPY instruction; for example, if the
94  // previously encoded target data is "LA", then a single COPY instruction of
95  // length 10 can produce the additional target data "LALALALALA".
96  //
97  // There is a third type of COPY instruction that starts within
98  // the source data and extends from the end of the source data
99  // into the beginning of the target data.  This VCDIFF encoder will never
100  // produce a COPY instruction of this third type (regardless of the value of
101  // look_for_target_matches) because the cost of checking for matches
102  // across the source-target boundary would not justify its benefits.
103  //
104  VCDiffStreamingEncoder(const HashedDictionary* dictionary,
105                         VCDiffFormatExtensionFlags format_extensions,
106                         bool look_for_target_matches);
107  ~VCDiffStreamingEncoder();
108
109  // The client should use these routines as follows:
110  //    HashedDictionary hd(dictionary, dictionary_size);
111  //    if (!hd.Init()) {
112  //      HandleError();
113  //      return;
114  //    }
115  //    string output_string;
116  //    VCDiffStreamingEncoder v(hd, false, false);
117  //    if (!v.StartEncoding(&output_string)) {
118  //      HandleError();
119  //      return;  // No need to call FinishEncoding()
120  //    }
121  //    Process(output_string.data(), output_string.size());
122  //    output_string.clear();
123  //    while (get data_buf) {
124  //      if (!v.EncodeChunk(data_buf, data_len, &output_string)) {
125  //        HandleError();
126  //        return;  // No need to call FinishEncoding()
127  //      }
128  //      // The encoding is appended to output_string at each call,
129  //      // so clear output_string once its contents have been processed.
130  //      Process(output_string.data(), output_string.size());
131  //      output_string.clear();
132  //    }
133  //    if (!v.FinishEncoding(&output_string)) {
134  //      HandleError();
135  //      return;
136  //    }
137  //    Process(output_string.data(), output_string.size());
138  //    output_string.clear();
139  //
140  // I.e., the allowed pattern of calls is
141  //    StartEncoding EncodeChunk* FinishEncoding
142  //
143  // The size of the encoded output depends on the sizes of the chunks
144  // passed in (i.e. the chunking boundary affects compression).
145  // However the decoded output is independent of chunk boundaries.
146
147  // Sets up the data structures for encoding.
148  // Writes a VCDIFF delta file header (as defined in RFC section 4.1)
149  // to *output_string.
150  //
151  // Note: we *append*, so the old contents of *output_string stick around.
152  // This convention differs from the non-streaming Encode/Decode
153  // interfaces in VCDiffEncoder.
154  //
155  // If an error occurs, this function returns false; otherwise it returns true.
156  // If this function returns false, the caller does not need to call
157  // FinishEncoding or to do any cleanup except destroying the
158  // VCDiffStreamingEncoder object.
159  template<class OutputType>
160  bool StartEncoding(OutputType* output) {
161    OutputString<OutputType> output_string(output);
162    return StartEncodingToInterface(&output_string);
163  }
164
165  bool StartEncodingToInterface(OutputStringInterface* output_string);
166
167  // Appends compressed encoding for "data" (one complete VCDIFF delta window)
168  // to *output_string.
169  // If an error occurs (for example, if StartEncoding was not called
170  // earlier or StartEncoding returned false), this function returns false;
171  // otherwise it returns true.  The caller does not need to call FinishEncoding
172  // or do any cleanup except destroying the VCDiffStreamingEncoder
173  // if this function returns false.
174  template<class OutputType>
175  bool EncodeChunk(const char* data, size_t len, OutputType* output) {
176    OutputString<OutputType> output_string(output);
177    return EncodeChunkToInterface(data, len, &output_string);
178  }
179
180  bool EncodeChunkToInterface(const char* data, size_t len,
181                              OutputStringInterface* output_string);
182
183  // Finishes encoding and appends any leftover encoded data to *output_string.
184  // If an error occurs (for example, if StartEncoding was not called
185  // earlier or StartEncoding returned false), this function returns false;
186  // otherwise it returns true.  The caller does not need to
187  // do any cleanup except destroying the VCDiffStreamingEncoder
188  // if this function returns false.
189  template<class OutputType>
190  bool FinishEncoding(OutputType* output) {
191    OutputString<OutputType> output_string(output);
192    return FinishEncodingToInterface(&output_string);
193  }
194
195  bool FinishEncodingToInterface(OutputStringInterface* output_string);
196
197 private:
198  VCDiffStreamingEncoderImpl* const impl_;
199
200  // Make the copy constructor and assignment operator private
201  // so that they don't inadvertently get used.
202  VCDiffStreamingEncoder(const VCDiffStreamingEncoder&);  // NOLINT
203  void operator=(const VCDiffStreamingEncoder&);
204};
205
206// A simpler (non-streaming) interface to the VCDIFF encoder that can be used
207// if the entire target data string is available.
208//
209class VCDiffEncoder {
210 public:
211  VCDiffEncoder(const char* dictionary_contents, size_t dictionary_size)
212      : dictionary_(dictionary_contents, dictionary_size),
213        encoder_(NULL),
214        flags_(VCD_STANDARD_FORMAT),
215        look_for_target_matches_(true) { }
216
217  ~VCDiffEncoder() {
218    delete encoder_;
219  }
220
221  // By default, VCDiffEncoder uses standard VCDIFF format.  This function
222  // can be used before calling Encode(), to specify that interleaved format
223  // and/or checksum format should be used.
224  void SetFormatFlags(VCDiffFormatExtensionFlags flags) { flags_ = flags; }
225
226  // By default, VCDiffEncoder looks for matches in the dictionary and also in
227  // the previously encoded target data.  This function can be used before
228  // calling Encode(), to specify whether or not target matching should be
229  // enabled.
230  void SetTargetMatching(bool look_for_target_matches) {
231    look_for_target_matches_ = look_for_target_matches;
232  }
233
234  // Replaces old contents of output_string with the encoded form of
235  // target_data.
236  template<class OutputType>
237  bool Encode(const char* target_data,
238              size_t target_len,
239              OutputType* output) {
240    OutputString<OutputType> output_string(output);
241    return EncodeToInterface(target_data, target_len, &output_string);
242  }
243
244 private:
245  bool EncodeToInterface(const char* target_data,
246                         size_t target_len,
247                         OutputStringInterface* output_string);
248
249  HashedDictionary dictionary_;
250  VCDiffStreamingEncoder* encoder_;
251  VCDiffFormatExtensionFlags flags_;
252  bool look_for_target_matches_;
253
254  // Make the copy constructor and assignment operator private
255  // so that they don't inadvertently get used.
256  VCDiffEncoder(const VCDiffEncoder&);  // NOLINT
257  void operator=(const VCDiffEncoder&);
258};
259
260}  // namespace open_vcdiff
261
262#endif  // OPEN_VCDIFF_VCENCODER_H_
263