1// Copyright 2007 Google Inc.
2// Author: Lincoln Smith
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//      http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#ifndef OPEN_VCDIFF_VCENCODER_H_
17#define OPEN_VCDIFF_VCENCODER_H_
18
19#include <stddef.h>  // size_t
20#include <vector>
21#include "google/output_string.h"
22
23namespace open_vcdiff {
24
25class VCDiffEngine;
26class VCDiffStreamingEncoderImpl;
27
28// These flags are passed to the constructor of VCDiffStreamingEncoder
29// to determine whether certain open-vcdiff format extensions
30// (which are not part of the RFC 3284 draft standard for VCDIFF)
31// are employed.
32//
33// Because these extensions are not part of the VCDIFF standard, if
34// any of these flags except VCD_STANDARD_FORMAT is specified, then the caller
35// must be certain that the receiver of the data will be using open-vcdiff
36// to decode the delta file, or at least that the receiver can interpret
37// these extensions.  The encoder will use an 'S' as the fourth character
38// in the delta file to indicate that non-standard extensions are being used.
39//
enum VCDiffFormatExtensionFlagValues {
  // Baseline: no extension bits set.  The output follows the RFC 3284
  // draft standard for VCDIFF.
  VCD_STANDARD_FORMAT = 0,
  // Bit 0: write each delta window with instructions and sizes interleaved
  // with their addresses and data, instead of using three separate
  // sections.  This makes it possible to produce partially decoded results
  // when only part of a delta window has arrived (e.g. when HTTP over TCP
  // is the transmission protocol.)
  VCD_FORMAT_INTERLEAVED = 1 << 0,
  // Bit 1: include an Adler32 checksum of the target window data
  // in each delta window.
  VCD_FORMAT_CHECKSUM = 1 << 1
};

// Integer type used to pass VCDiffFormatExtensionFlagValues around.  The
// non-zero values occupy distinct bits, so the interleaved and checksum
// extensions can be requested together with bitwise OR.
typedef int VCDiffFormatExtensionFlags;
57
58// A HashedDictionary must be constructed from the dictionary data
59// in order to use VCDiffStreamingEncoder.  If the same dictionary will
60// be used to perform several encoding operations, then the caller should
61// create the HashedDictionary once and cache it for reuse.  This object
62// is thread-safe: the same const HashedDictionary can be used
63// by several threads simultaneously, each with its own VCDiffStreamingEncoder.
64//
65// dictionary_contents is copied into the HashedDictionary, so the
66// caller may free that string, if desired, after the constructor returns.
67//
68class HashedDictionary {
69 public:
70  HashedDictionary(const char* dictionary_contents,
71                   size_t dictionary_size);
72  ~HashedDictionary();
73
74  // Init() must be called before using the HashedDictionary as an argument
75  // to the VCDiffStreamingEncoder, or for any other purpose except
76  // destruction.  It returns true if initialization succeeded, or false
77  // if an error occurred, in which case the caller should destroy the object
78  // without using it.
79  bool Init();
80
81  const VCDiffEngine* engine() const { return engine_; }
82
83 private:
84  const VCDiffEngine* engine_;
85
86  // Make the copy constructor and assignment operator private
87  // so that they don't inadvertently get used.
88  HashedDictionary(const HashedDictionary&);  // NOLINT
89  void operator=(const HashedDictionary&);
90};
91
92// The standard streaming interface to the VCDIFF (RFC 3284) encoder.
93// "Streaming" in this context means that, even though the entire set of
94// input data to be encoded may not be available at once, the encoder
95// can produce partial output based on what is available.  Of course,
96// the caller should try to maximize the sizes of the data chunks passed
97// to the encoder.
98class VCDiffStreamingEncoder {
99 public:
100  // The HashedDictionary object passed to the constructor must remain valid,
101  // without being deleted, for the lifetime of the VCDiffStreamingEncoder
102  // object.
103  //
104  // format_extensions allows certain open-vcdiff extensions to the VCDIFF
105  // format to be included in the encoded output.  These extensions are not
106  // part of the RFC 3284 draft standard, so specifying any extension flags
107  // will make the output compatible only with open-vcdiff, or with other
108  // VCDIFF implementations that accept these extensions.  See above for an
109  // explanation of each possible flag value.
110  //
111  // *** look_for_target_matches:
112  // The VCDIFF format allows COPY instruction addresses to reference data from
113  // the source (dictionary), or from previously encoded target data.
114  //
115  // If look_for_target_matches is false, then the encoder will only
116  // produce COPY instructions that reference source data from the dictionary,
117  // never from previously encoded target data.  This will speed up the encoding
118  // process, but the encoded data will not be as compact.
119  //
120  // If this value is true, then the encoder will produce COPY instructions
121  // that reference either source data or target data.  A COPY instruction from
122  // the previously encoded target data may even extend into the range of the
123  // data being produced by that same COPY instruction; for example, if the
124  // previously encoded target data is "LA", then a single COPY instruction of
125  // length 10 can produce the additional target data "LALALALALA".
126  //
127  // There is a third type of COPY instruction that starts within
128  // the source data and extends from the end of the source data
129  // into the beginning of the target data.  This VCDIFF encoder will never
130  // produce a COPY instruction of this third type (regardless of the value of
131  // look_for_target_matches) because the cost of checking for matches
132  // across the source-target boundary would not justify its benefits.
133  //
134  VCDiffStreamingEncoder(const HashedDictionary* dictionary,
135                         VCDiffFormatExtensionFlags format_extensions,
136                         bool look_for_target_matches);
137  ~VCDiffStreamingEncoder();
138
139  // The client should use these routines as follows:
140  //    HashedDictionary hd(dictionary, dictionary_size);
141  //    if (!hd.Init()) {
142  //      HandleError();
143  //      return;
144  //    }
145  //    string output_string;
146  //    VCDiffStreamingEncoder v(hd, false, false);
147  //    if (!v.StartEncoding(&output_string)) {
148  //      HandleError();
149  //      return;  // No need to call FinishEncoding()
150  //    }
151  //    Process(output_string.data(), output_string.size());
152  //    output_string.clear();
153  //    while (get data_buf) {
154  //      if (!v.EncodeChunk(data_buf, data_len, &output_string)) {
155  //        HandleError();
156  //        return;  // No need to call FinishEncoding()
157  //      }
158  //      // The encoding is appended to output_string at each call,
159  //      // so clear output_string once its contents have been processed.
160  //      Process(output_string.data(), output_string.size());
161  //      output_string.clear();
162  //    }
163  //    if (!v.FinishEncoding(&output_string)) {
164  //      HandleError();
165  //      return;
166  //    }
167  //    Process(output_string.data(), output_string.size());
168  //    output_string.clear();
169  //
170  // I.e., the allowed pattern of calls is
171  //    StartEncoding EncodeChunk* FinishEncoding
172  //
173  // The size of the encoded output depends on the sizes of the chunks
174  // passed in (i.e. the chunking boundary affects compression).
175  // However the decoded output is independent of chunk boundaries.
176
177  // Sets up the data structures for encoding.
178  // Writes a VCDIFF delta file header (as defined in RFC section 4.1)
179  // to *output_string.
180  //
181  // Note: we *append*, so the old contents of *output_string stick around.
182  // This convention differs from the non-streaming Encode/Decode
183  // interfaces in VCDiffEncoder.
184  //
185  // If an error occurs, this function returns false; otherwise it returns true.
186  // If this function returns false, the caller does not need to call
187  // FinishEncoding or to do any cleanup except destroying the
188  // VCDiffStreamingEncoder object.
189  template<class OutputType>
190  bool StartEncoding(OutputType* output) {
191    OutputString<OutputType> output_string(output);
192    return StartEncodingToInterface(&output_string);
193  }
194
195  bool StartEncodingToInterface(OutputStringInterface* output_string);
196
197  // Appends compressed encoding for "data" (one complete VCDIFF delta window)
198  // to *output_string.
199  // If an error occurs (for example, if StartEncoding was not called
200  // earlier or StartEncoding returned false), this function returns false;
201  // otherwise it returns true.  The caller does not need to call FinishEncoding
202  // or do any cleanup except destroying the VCDiffStreamingEncoder
203  // if this function returns false.
204  template<class OutputType>
205  bool EncodeChunk(const char* data, size_t len, OutputType* output) {
206    OutputString<OutputType> output_string(output);
207    return EncodeChunkToInterface(data, len, &output_string);
208  }
209
210  bool EncodeChunkToInterface(const char* data, size_t len,
211                              OutputStringInterface* output_string);
212
213  // Finishes encoding and appends any leftover encoded data to *output_string.
214  // If an error occurs (for example, if StartEncoding was not called
215  // earlier or StartEncoding returned false), this function returns false;
216  // otherwise it returns true.  The caller does not need to
217  // do any cleanup except destroying the VCDiffStreamingEncoder
218  // if this function returns false.
219  template<class OutputType>
220  bool FinishEncoding(OutputType* output) {
221    OutputString<OutputType> output_string(output);
222    return FinishEncodingToInterface(&output_string);
223  }
224
225  bool FinishEncodingToInterface(OutputStringInterface* output_string);
226
227  // Replaces the contents of match_counts with a vector of integers,
228  // one for each possible match length.  The value of match_counts[n]
229  // is equal to the number of matches of length n found so far
230  // for this VCDiffStreamingEncoder object.
231  void GetMatchCounts(std::vector<int>* match_counts) const;
232
233 private:
234  VCDiffStreamingEncoderImpl* const impl_;
235
236  // Make the copy constructor and assignment operator private
237  // so that they don't inadvertently get used.
238  VCDiffStreamingEncoder(const VCDiffStreamingEncoder&);  // NOLINT
239  void operator=(const VCDiffStreamingEncoder&);
240};
241
242// A simpler (non-streaming) interface to the VCDIFF encoder that can be used
243// if the entire target data string is available.
244//
245class VCDiffEncoder {
246 public:
247  VCDiffEncoder(const char* dictionary_contents, size_t dictionary_size)
248      : dictionary_(dictionary_contents, dictionary_size),
249        encoder_(NULL),
250        flags_(VCD_STANDARD_FORMAT),
251        look_for_target_matches_(true) { }
252
253  ~VCDiffEncoder() {
254    delete encoder_;
255  }
256
257  // By default, VCDiffEncoder uses standard VCDIFF format.  This function
258  // can be used before calling Encode(), to specify that interleaved format
259  // and/or checksum format should be used.
260  void SetFormatFlags(VCDiffFormatExtensionFlags flags) { flags_ = flags; }
261
262  // By default, VCDiffEncoder looks for matches in the dictionary and also in
263  // the previously encoded target data.  This function can be used before
264  // calling Encode(), to specify whether or not target matching should be
265  // enabled.
266  void SetTargetMatching(bool look_for_target_matches) {
267    look_for_target_matches_ = look_for_target_matches;
268  }
269
270  // Replaces old contents of output_string with the encoded form of
271  // target_data.
272  template<class OutputType>
273  bool Encode(const char* target_data,
274              size_t target_len,
275              OutputType* output) {
276    OutputString<OutputType> output_string(output);
277    return EncodeToInterface(target_data, target_len, &output_string);
278  }
279
280 private:
281  bool EncodeToInterface(const char* target_data,
282                         size_t target_len,
283                         OutputStringInterface* output_string);
284
285  HashedDictionary dictionary_;
286  VCDiffStreamingEncoder* encoder_;
287  VCDiffFormatExtensionFlags flags_;
288  bool look_for_target_matches_;
289
290  // Make the copy constructor and assignment operator private
291  // so that they don't inadvertently get used.
292  VCDiffEncoder(const VCDiffEncoder&);  // NOLINT
293  void operator=(const VCDiffEncoder&);
294};
295
296}  // namespace open_vcdiff
297
298#endif  // OPEN_VCDIFF_VCENCODER_H_
299