1// Copyright 2017 The Chromium OS Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "puffin/src/include/puffin/utils.h"
6
7#include <inttypes.h>
8
9#include <string>
10#include <vector>
11
12#include <zlib.h>
13
14#include "puffin/src/bit_reader.h"
15#include "puffin/src/file_stream.h"
16#include "puffin/src/include/puffin/common.h"
17#include "puffin/src/include/puffin/errors.h"
18#include "puffin/src/include/puffin/puffer.h"
19#include "puffin/src/memory_stream.h"
20#include "puffin/src/puff_writer.h"
21#include "puffin/src/set_errors.h"
22
23namespace {
24// Use memcpy to access the unaligned data of type |T|.
25template <typename T>
26inline T get_unaligned(const void* address) {
27  T result;
28  memcpy(&result, address, sizeof(T));
29  return result;
30}
31
32// Calculate both the compressed size and uncompressed size of the deflate
33// block that starts from the offset |start| of buffer |data|.
34bool CalculateSizeOfDeflateBlock(const puffin::Buffer& data,
35                                 uint64_t start,
36                                 uint64_t* compressed_size,
37                                 uint64_t* uncompressed_size) {
38  TEST_AND_RETURN_FALSE(compressed_size != nullptr &&
39                        uncompressed_size != nullptr);
40
41  TEST_AND_RETURN_FALSE(start < data.size());
42
43  z_stream strm = {};
44  strm.avail_in = data.size() - start;
45  strm.next_in = data.data() + start;
46
47  // -15 means we are decoding a 'raw' stream without zlib headers.
48  if (inflateInit2(&strm, -15)) {
49    LOG(ERROR) << "Failed to initialize inflate: " << strm.msg;
50    return false;
51  }
52
53  const unsigned int kBufferSize = 32768;
54  std::vector<uint8_t> uncompressed_data(kBufferSize);
55  *uncompressed_size = 0;
56  int status = Z_OK;
57  do {
58    // Overwrite the same buffer since we don't need the uncompressed data.
59    strm.avail_out = kBufferSize;
60    strm.next_out = uncompressed_data.data();
61    status = inflate(&strm, Z_NO_FLUSH);
62    if (status < 0) {
63      LOG(ERROR) << "Inflate failed: " << strm.msg << ", has decompressed "
64                 << *uncompressed_size << " bytes.";
65      return false;
66    }
67    *uncompressed_size += kBufferSize - strm.avail_out;
68  } while (status != Z_STREAM_END);
69
70  *compressed_size = data.size() - start - strm.avail_in;
71  TEST_AND_RETURN_FALSE(inflateEnd(&strm) == Z_OK);
72  return true;
73}
74
75}  // namespace
76
77namespace puffin {
78
79using std::string;
80using std::vector;
81
82uint64_t BytesInByteExtents(const vector<ByteExtent>& extents) {
83  uint64_t bytes = 0;
84  for (const auto& extent : extents) {
85    bytes += extent.length;
86  }
87  return bytes;
88}
89
90// This function uses RFC1950 (https://www.ietf.org/rfc/rfc1950.txt) for the
91// definition of a zlib stream.  For finding the deflate blocks, we relying on
92// the proper size of the zlib stream in |data|. Basically the size of the zlib
93// stream should be known before hand. Otherwise we need to parse the stream and
94// find the location of compressed blocks using CalculateSizeOfDeflateBlock().
95bool LocateDeflatesInZlib(const Buffer& data,
96                          std::vector<ByteExtent>* deflate_blocks) {
97  // A zlib stream has the following format:
98  // 0           1     compression method and flag
99  // 1           1     flag
100  // 2           4     preset dictionary (optional)
101  // 2 or 6      n     compressed data
102  // n+(2 or 6)  4     Adler-32 checksum
103  TEST_AND_RETURN_FALSE(data.size() >= 6 + 4);  // Header + Footer
104  uint16_t cmf = data[0];
105  auto compression_method = cmf & 0x0F;
106  // For deflate compression_method should be 8.
107  TEST_AND_RETURN_FALSE(compression_method == 8);
108
109  auto cinfo = (cmf & 0xF0) >> 4;
110  // Value greater than 7 is not allowed in deflate.
111  TEST_AND_RETURN_FALSE(cinfo <= 7);
112
113  auto flag = data[1];
114  TEST_AND_RETURN_FALSE(((cmf << 8) + flag) % 31 == 0);
115
116  uint64_t header_len = 2;
117  if (flag & 0x20) {
118    header_len += 4;  // 4 bytes for the preset dictionary.
119  }
120
121  // 4 is for ADLER32.
122  deflate_blocks->emplace_back(header_len, data.size() - header_len - 4);
123  return true;
124}
125
126bool FindDeflateSubBlocks(const UniqueStreamPtr& src,
127                          const vector<ByteExtent>& deflates,
128                          vector<BitExtent>* subblock_deflates) {
129  Puffer puffer;
130  Buffer deflate_buffer;
131  for (const auto& deflate : deflates) {
132    TEST_AND_RETURN_FALSE(src->Seek(deflate.offset));
133    // Read from src into deflate_buffer.
134    deflate_buffer.resize(deflate.length);
135    TEST_AND_RETURN_FALSE(src->Read(deflate_buffer.data(), deflate.length));
136
137    // Find all the subblocks.
138    BufferBitReader bit_reader(deflate_buffer.data(), deflate.length);
139    BufferPuffWriter puff_writer(nullptr, 0);
140    Error error;
141    vector<BitExtent> subblocks;
142    TEST_AND_RETURN_FALSE(
143        puffer.PuffDeflate(&bit_reader, &puff_writer, &subblocks, &error));
144    TEST_AND_RETURN_FALSE(deflate.length == bit_reader.Offset());
145    for (const auto& subblock : subblocks) {
146      subblock_deflates->emplace_back(subblock.offset + deflate.offset * 8,
147                                      subblock.length);
148    }
149  }
150  return true;
151}
152
153bool LocateDeflatesInZlibBlocks(const string& file_path,
154                                const vector<ByteExtent>& zlibs,
155                                vector<BitExtent>* deflates) {
156  auto src = FileStream::Open(file_path, true, false);
157  TEST_AND_RETURN_FALSE(src);
158
159  Buffer buffer;
160  for (auto& zlib : zlibs) {
161    buffer.resize(zlib.length);
162    TEST_AND_RETURN_FALSE(src->Seek(zlib.offset));
163    TEST_AND_RETURN_FALSE(src->Read(buffer.data(), buffer.size()));
164
165    vector<ByteExtent> deflate_blocks;
166    TEST_AND_RETURN_FALSE(LocateDeflatesInZlib(buffer, &deflate_blocks));
167
168    vector<BitExtent> deflate_subblocks;
169    auto zlib_blc_src = MemoryStream::CreateForRead(buffer);
170    TEST_AND_RETURN_FALSE(
171        FindDeflateSubBlocks(zlib_blc_src, deflate_blocks, &deflate_subblocks));
172
173    // Relocated based on the offset of the zlib.
174    for (const auto& def : deflate_subblocks) {
175      deflates->emplace_back(zlib.offset * 8 + def.offset, def.length);
176    }
177  }
178  return true;
179}
180
181// For more information about gzip format, refer to RFC 1952 located at:
182// https://www.ietf.org/rfc/rfc1952.txt
183bool LocateDeflatesInGzip(const Buffer& data,
184                          vector<ByteExtent>* deflate_blocks) {
185  uint64_t member_start = 0;
186  while (member_start < data.size()) {
187    // Each member entry has the following format
188    // 0      1     0x1F
189    // 1      1     0x8B
190    // 2      1     compression method (8 denotes deflate)
191    // 3      1     set of flags
192    // 4      4     modification time
193    // 8      1     extra flags
194    // 9      1     operating system
195    TEST_AND_RETURN_FALSE(member_start + 10 <= data.size());
196    TEST_AND_RETURN_FALSE(data[member_start + 0] == 0x1F);
197    TEST_AND_RETURN_FALSE(data[member_start + 1] == 0x8B);
198    TEST_AND_RETURN_FALSE(data[member_start + 2] == 8);
199
200    uint64_t offset = member_start + 10;
201    int flag = data[member_start + 3];
202    // Extra field
203    if (flag & 4) {
204      TEST_AND_RETURN_FALSE(offset + 2 <= data.size());
205      uint16_t extra_length = data[offset++];
206      extra_length |= static_cast<uint16_t>(data[offset++]) << 8;
207      TEST_AND_RETURN_FALSE(offset + extra_length <= data.size());
208      offset += extra_length;
209    }
210    // File name field
211    if (flag & 8) {
212      while (true) {
213        TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
214        if (data[offset++] == 0) {
215          break;
216        }
217      }
218    }
219    // File comment field
220    if (flag & 16) {
221      while (true) {
222        TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
223        if (data[offset++] == 0) {
224          break;
225        }
226      }
227    }
228    // CRC16 field
229    if (flag & 2) {
230      offset += 2;
231    }
232
233    uint64_t compressed_size, uncompressed_size;
234    TEST_AND_RETURN_FALSE(CalculateSizeOfDeflateBlock(
235        data, offset, &compressed_size, &uncompressed_size));
236    TEST_AND_RETURN_FALSE(offset + compressed_size <= data.size());
237    deflate_blocks->push_back(ByteExtent(offset, compressed_size));
238    offset += compressed_size;
239
240    // Ignore CRC32;
241    TEST_AND_RETURN_FALSE(offset + 8 <= data.size());
242    offset += 4;
243    uint32_t u_size = 0;
244    for (size_t i = 0; i < 4; i++) {
245      u_size |= static_cast<uint32_t>(data[offset++]) << (i * 8);
246    }
247    TEST_AND_RETURN_FALSE(uncompressed_size % (1 << 31) == u_size);
248    member_start = offset;
249  }
250  return true;
251}
252
253// For more information about the zip format, refer to
254// https://support.pkware.com/display/PKZIP/APPNOTE
255bool LocateDeflatesInZipArchive(const Buffer& data,
256                                vector<ByteExtent>* deflate_blocks) {
257  uint64_t pos = 0;
258  while (pos <= data.size() - 30) {
259    // TODO(xunchang) add support for big endian system when searching for
260    // magic numbers.
261    if (get_unaligned<uint32_t>(data.data() + pos) != 0x04034b50) {
262      pos++;
263      continue;
264    }
265
266    // local file header format
267    // 0      4     0x04034b50
268    // 4      2     minimum version needed to extract
269    // 6      2     general purpose bit flag
270    // 8      2     compression method
271    // 10     4     file last modification date & time
272    // 14     4     CRC-32
273    // 18     4     compressed size
274    // 22     4     uncompressed size
275    // 26     2     file name length
276    // 28     2     extra field length
277    // 30     n     file name
278    // 30+n   m     extra field
279    auto compression_method = get_unaligned<uint16_t>(data.data() + pos + 8);
280    if (compression_method != 8) {  // non-deflate type
281      pos += 4;
282      continue;
283    }
284
285    auto compressed_size = get_unaligned<uint32_t>(data.data() + pos + 18);
286    auto uncompressed_size = get_unaligned<uint32_t>(data.data() + pos + 22);
287    auto file_name_length = get_unaligned<uint16_t>(data.data() + pos + 26);
288    auto extra_field_length = get_unaligned<uint16_t>(data.data() + pos + 28);
289    uint64_t header_size = 30 + file_name_length + extra_field_length;
290
291    // sanity check
292    if (static_cast<uint64_t>(header_size) + compressed_size > data.size() ||
293        pos > data.size() - header_size - compressed_size) {
294      pos += 4;
295      continue;
296    }
297
298    uint64_t calculated_compressed_size;
299    uint64_t calculated_uncompressed_size;
300    if (!CalculateSizeOfDeflateBlock(data, pos + header_size,
301                                     &calculated_compressed_size,
302                                     &calculated_uncompressed_size)) {
303      LOG(ERROR) << "Failed to decompress the zip entry starting from: " << pos
304                 << ", skip adding deflates for this entry.";
305      pos += 4;
306      continue;
307    }
308
309    // Double check the compressed size and uncompressed size if they are
310    // available in the file header.
311    if (compressed_size > 0 && compressed_size != calculated_compressed_size) {
312      LOG(WARNING) << "Compressed size in the file header: " << compressed_size
313                   << " doesn't equal the real size: "
314                   << calculated_compressed_size;
315    }
316
317    if (uncompressed_size > 0 &&
318        uncompressed_size != calculated_uncompressed_size) {
319      LOG(WARNING) << "Uncompressed size in the file header: "
320                   << uncompressed_size << " doesn't equal the real size: "
321                   << calculated_uncompressed_size;
322    }
323
324    deflate_blocks->emplace_back(pos + header_size, calculated_compressed_size);
325    pos += header_size + calculated_compressed_size;
326  }
327
328  return true;
329}
330
331bool LocateDeflateSubBlocksInZipArchive(const Buffer& data,
332                                        vector<BitExtent>* deflates) {
333  vector<ByteExtent> deflate_blocks;
334  if (!LocateDeflatesInZipArchive(data, &deflate_blocks)) {
335    return false;
336  }
337
338  auto src = MemoryStream::CreateForRead(data);
339  return FindDeflateSubBlocks(src, deflate_blocks, deflates);
340}
341
342bool FindPuffLocations(const UniqueStreamPtr& src,
343                       const vector<BitExtent>& deflates,
344                       vector<ByteExtent>* puffs,
345                       uint64_t* out_puff_size) {
346  Puffer puffer;
347  Buffer deflate_buffer;
348
349  // Here accumulate the size difference between each corresponding deflate and
350  // puff. At the end we add this cummulative size difference to the size of the
351  // deflate stream to get the size of the puff stream. We use signed size
352  // because puff size could be smaller than deflate size.
353  int64_t total_size_difference = 0;
354  for (auto deflate = deflates.begin(); deflate != deflates.end(); ++deflate) {
355    // Read from src into deflate_buffer.
356    auto start_byte = deflate->offset / 8;
357    auto end_byte = (deflate->offset + deflate->length + 7) / 8;
358    deflate_buffer.resize(end_byte - start_byte);
359    TEST_AND_RETURN_FALSE(src->Seek(start_byte));
360    TEST_AND_RETURN_FALSE(
361        src->Read(deflate_buffer.data(), deflate_buffer.size()));
362    // Find the size of the puff.
363    BufferBitReader bit_reader(deflate_buffer.data(), deflate_buffer.size());
364    uint64_t bits_to_skip = deflate->offset % 8;
365    TEST_AND_RETURN_FALSE(bit_reader.CacheBits(bits_to_skip));
366    bit_reader.DropBits(bits_to_skip);
367
368    BufferPuffWriter puff_writer(nullptr, 0);
369    Error error;
370    TEST_AND_RETURN_FALSE(
371        puffer.PuffDeflate(&bit_reader, &puff_writer, nullptr, &error));
372    TEST_AND_RETURN_FALSE(deflate_buffer.size() == bit_reader.Offset());
373
374    // 1 if a deflate ends at the same byte that the next deflate starts and
375    // there is a few bits gap between them. In practice this may never happen,
376    // but it is a good idea to support it anyways. If there is a gap, the value
377    // of the gap will be saved as an integer byte to the puff stream. The parts
378    // of the byte that belogs to the deflates are shifted out.
379    int gap = 0;
380    if (deflate != deflates.begin()) {
381      auto prev_deflate = std::prev(deflate);
382      if ((prev_deflate->offset + prev_deflate->length == deflate->offset)
383          // If deflates are on byte boundary the gap will not be counted later,
384          // so we won't worry about it.
385          && (deflate->offset % 8 != 0)) {
386        gap = 1;
387      }
388    }
389
390    start_byte = ((deflate->offset + 7) / 8);
391    end_byte = (deflate->offset + deflate->length) / 8;
392    int64_t deflate_length_in_bytes = end_byte - start_byte;
393
394    // If there was no gap bits between the current and previous deflates, there
395    // will be no extra gap byte, so the offset will be shifted one byte back.
396    auto puff_offset = start_byte - gap + total_size_difference;
397    auto puff_size = puff_writer.Size();
398    // Add the location into puff.
399    puffs->emplace_back(puff_offset, puff_size);
400    total_size_difference +=
401        static_cast<int64_t>(puff_size) - deflate_length_in_bytes - gap;
402  }
403
404  uint64_t src_size;
405  TEST_AND_RETURN_FALSE(src->GetSize(&src_size));
406  auto final_size = static_cast<int64_t>(src_size) + total_size_difference;
407  TEST_AND_RETURN_FALSE(final_size >= 0);
408  *out_puff_size = final_size;
409  return true;
410}
411
412}  // namespace puffin
413