1// Copyright 2008 Google Inc.
2// Author: Lincoln Smith
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//      http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#ifndef OPEN_VCDIFF_ENCODETABLE_H_
17#define OPEN_VCDIFF_ENCODETABLE_H_
18
19#include <config.h>
20#include <stddef.h>  // size_t
21#include <stdint.h>  // int32_t
22#include <string>
23#include <vector>
24#include "addrcache.h"
25#include "checksum.h"
26#include "codetable.h"
27#include "codetablewriter_interface.h"
28
29namespace open_vcdiff {
30
31class OutputStringInterface;
32class VCDiffInstructionMap;
33
34// The method calls after construction *must* conform
35// to the following pattern:
36//    {{Add|Copy|Run}* [AddChecksum] Output}*
37//
38// When Output has been called in this sequence, a complete target window
39// (as defined in RFC 3284 section 4.3) will have been appended to
40// out (unless no calls to Add, Run, or Copy were made, in which
41// case Output will do nothing.)  The output will not be available for use
42// until after each call to Output().
43//
44// NOT threadsafe.
45//
46class VCDiffCodeTableWriter : public CodeTableWriterInterface {
47 public:
48  // This constructor uses the default code table.
49  // If interleaved is true, the encoder writes each delta file window
50  // by interleaving instructions and sizes with their corresponding
51  // addresses and data, rather than placing these elements into three
52  // separate sections.  This facilitates providing partially
53  // decoded results when only a portion of a delta file window
54  // is received (e.g. when HTTP over TCP is used as the
55  // transmission protocol.)  The interleaved format is
56  // not consistent with the VCDIFF draft standard.
57  //
58  explicit VCDiffCodeTableWriter(bool interleaved);
59
60  // Uses a non-standard code table and non-standard cache sizes.  The caller
61  // must guarantee that code_table_data remains allocated for the lifetime of
62  // the VCDiffCodeTableWriter object.  Note that this is different from how
63  // VCDiffCodeTableReader::UseCodeTable works.  It is assumed that a given
64  // encoder will use either the default code table or a statically-defined
65  // non-standard code table, whereas the decoder must have the ability to read
66  // an arbitrary non-standard code table from a delta file and discard it once
67  // the file has been decoded.
68  //
69  VCDiffCodeTableWriter(bool interleaved,
70                        int near_cache_size,
71                        int same_cache_size,
72                        const VCDiffCodeTableData& code_table_data,
73                        unsigned char max_mode);
74
75  virtual ~VCDiffCodeTableWriter();
76
77  // Initializes the constructed object for use.
78  // This method must be called after a VCDiffCodeTableWriter is constructed
79  // and before any of its other methods can be called.  It will return
80  // false if there was an error initializing the object, or true if it
81  // was successful.  After the object has been initialized and used,
82  // Init() can be called again to restore the initial state of the object.
83  //
84  bool Init(size_t dictionary_size);
85
86  virtual size_t target_length() const { return target_length_; }
87
88  // Encode an ADD opcode with the "size" bytes starting at data
89  virtual void Add(const char* data, size_t size);
90
91  // Encode a COPY opcode with args "offset" (into dictionary) and "size" bytes.
92  virtual void Copy(int32_t offset, size_t size);
93
94  // Encode a RUN opcode for "size" copies of the value "byte".
95  virtual void Run(size_t size, unsigned char byte);
96
97  void AddChecksum(VCDChecksum checksum) {
98    add_checksum_ = true;
99    checksum_ = checksum;
100  }
101
102  // Finishes encoding and appends the encoded delta window to the output
103  // string.  The output string is not null-terminated and may contain embedded
104  // '\0' characters.
105  virtual void Output(OutputStringInterface* out);
106
107  const std::vector<int>& match_counts() const { return match_counts_; }
108
109 private:
110  typedef std::string string;
111
112  // This is an estimate of the longest match size the encoder expects to find.
113  // It is used to determine the initial size of the vector match_counts_.
114  // If it is too large, then some space will be wasted on vector elements
115  // that are not used.  If it is too small, then some time will be wasted
116  // expanding match_counts_ to accommodate larger match sizes.
117  static const size_t kMaxMatchSize = 2000;
118
119  // The maximum value for the mode of a COPY instruction.
120  const unsigned char max_mode_;
121
122  // If interleaved is true, sets data_for_add_and_run_ and
123  // addresses_for_copy_ to point at instructions_and_sizes_,
124  // so that instructions, sizes, addresses and data will be
125  // combined into a single interleaved stream.
126  // If interleaved is false, sets data_for_add_and_run_ and
127  // addresses_for_copy_ to point at their corresponding
128  // separate_... strings, so that the three sections will
129  // be generated separately from one another.
130  //
131  void InitSectionPointers(bool interleaved);
132
133  // Determines the best opcode to encode an instruction, and appends
134  // or substitutes that opcode and its size into the
135  // instructions_and_sizes_ string.
136  //
137  void EncodeInstruction(VCDiffInstructionType inst,
138                         size_t size,
139                         unsigned char mode);
140
141  void EncodeInstruction(VCDiffInstructionType inst, size_t size) {
142    return EncodeInstruction(inst, size, 0);
143  }
144
145  // Calculates the number of bytes needed to store the given size value as a
146  // variable-length integer (VarintBE).
147  static size_t CalculateLengthOfSizeAsVarint(size_t size);
148
149  // Appends the size value to the string as a variable-length integer.
150  static void AppendSizeToString(size_t size, string* out);
151
152  // Appends the size value to the output string as a variable-length integer.
153  static void AppendSizeToOutputString(size_t size, OutputStringInterface* out);
154
155  // Calculates the "Length of the delta encoding" field for the delta window
156  // header, based on the sizes of the sections and of the other header
157  // elements.
158  size_t CalculateLengthOfTheDeltaEncoding() const;
159
160  // None of the following 'string' objects are null-terminated.
161
162  // A series of instruction opcodes, each of which may be followed
163  // by one or two Varint values representing the size parameters
164  // of the first and second instruction in the opcode.
165  string instructions_and_sizes_;
166
167  // A series of data arguments (byte values) used for ADD and RUN
168  // instructions.  Depending on whether interleaved output is used
169  // for streaming or not, the pointer may point to
170  // separate_data_for_add_and_run_ or to instructions_and_sizes_.
171  string *data_for_add_and_run_;
172  string separate_data_for_add_and_run_;
173
174  // A series of Varint addresses used for COPY instructions.
175  // For the SAME mode, a byte value is stored instead of a Varint.
176  // Depending on whether interleaved output is used
177  // for streaming or not, the pointer may point to
178  // separate_addresses_for_copy_ or to instructions_and_sizes_.
179  string *addresses_for_copy_;
180  string separate_addresses_for_copy_;
181
182  VCDiffAddressCache address_cache_;
183
184  size_t dictionary_size_;
185
186  // The number of bytes of target data that has been encoded so far.
187  // Each time Add(), Copy(), or Run() is called, this will be incremented.
188  // The target length is used to compute HERE mode addresses
189  // for COPY instructions, and is also written into the header
190  // of the delta window when Output() is called.
191  //
192  size_t target_length_;
193
194  const VCDiffCodeTableData* code_table_data_;
195
196  // The instruction map facilitates finding an opcode quickly given an
197  // instruction inst, size, and mode.  This is an alternate representation
198  // of the same information that is found in code_table_data_.
199  //
200  const VCDiffInstructionMap* instruction_map_;
201
202  // The zero-based index within instructions_and_sizes_ of the byte
203  // that contains the last single-instruction opcode generated by
204  // EncodeInstruction().  (See that function for exhaustive details.)
205  // It is necessary to use an index rather than a pointer for this value
206  // because instructions_and_sizes_ may be resized, which would invalidate
207  // any pointers into its data buffer.  The value -1 is reserved to mean that
208  // either no opcodes have been generated yet, or else the last opcode
209  // generated was a double-instruction opcode.
210  //
211  int last_opcode_index_;
212
213  // If true, an Adler32 checksum of the target window data will be written as
214  // a variable-length integer, just after the size of the addresses section.
215  //
216  bool add_checksum_;
217
218  // The checksum to be written to the current target window,
219  // if add_checksum_ is true.
220  // This will not be calculated based on the individual calls to Add(), Run(),
221  // and Copy(), which would be unnecessarily expensive.  Instead, the code
222  // that uses the VCDiffCodeTableWriter object is expected to calculate
223  // the checksum all at once and to call AddChecksum() with that value.
224  // Must be called sometime before calling Output(), though it can be called
225  // either before or after the calls to Add(), Run(), and Copy().
226  //
227  VCDChecksum checksum_;
228
229  // The value of match_counts_[n] is equal to the number of matches
230  // of length n (that is, COPY instructions of size n) found so far.
231  std::vector<int> match_counts_;
232
233  // Making these private avoids implicit copy constructor & assignment operator
234  VCDiffCodeTableWriter(const VCDiffCodeTableWriter&);  // NOLINT
235  void operator=(const VCDiffCodeTableWriter&);
236};
237
238};  // namespace open_vcdiff
239
240#endif  // OPEN_VCDIFF_ENCODETABLE_H_
241