1// Copyright 2017 The Chromium OS Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "puffin/src/include/puffin/utils.h" 6 7#include <inttypes.h> 8 9#include <string> 10#include <vector> 11 12#include <zlib.h> 13 14#include "puffin/src/bit_reader.h" 15#include "puffin/src/file_stream.h" 16#include "puffin/src/include/puffin/common.h" 17#include "puffin/src/include/puffin/errors.h" 18#include "puffin/src/include/puffin/puffer.h" 19#include "puffin/src/memory_stream.h" 20#include "puffin/src/puff_writer.h" 21#include "puffin/src/set_errors.h" 22 23namespace { 24// Use memcpy to access the unaligned data of type |T|. 25template <typename T> 26inline T get_unaligned(const void* address) { 27 T result; 28 memcpy(&result, address, sizeof(T)); 29 return result; 30} 31 32// Calculate both the compressed size and uncompressed size of the deflate 33// block that starts from the offset |start| of buffer |data|. 34bool CalculateSizeOfDeflateBlock(const puffin::Buffer& data, 35 uint64_t start, 36 uint64_t* compressed_size, 37 uint64_t* uncompressed_size) { 38 TEST_AND_RETURN_FALSE(compressed_size != nullptr && 39 uncompressed_size != nullptr); 40 41 TEST_AND_RETURN_FALSE(start < data.size()); 42 43 z_stream strm = {}; 44 strm.avail_in = data.size() - start; 45 strm.next_in = data.data() + start; 46 47 // -15 means we are decoding a 'raw' stream without zlib headers. 48 if (inflateInit2(&strm, -15)) { 49 LOG(ERROR) << "Failed to initialize inflate: " << strm.msg; 50 return false; 51 } 52 53 const unsigned int kBufferSize = 32768; 54 std::vector<uint8_t> uncompressed_data(kBufferSize); 55 *uncompressed_size = 0; 56 int status = Z_OK; 57 do { 58 // Overwrite the same buffer since we don't need the uncompressed data. 59 strm.avail_out = kBufferSize; 60 strm.next_out = uncompressed_data.data(); 61 status = inflate(&strm, Z_NO_FLUSH); 62 if (status < 0) { 63 LOG(ERROR) << "Inflate failed: " << strm.msg << ", has decompressed " 64 << *uncompressed_size << " bytes."; 65 return false; 66 } 67 *uncompressed_size += kBufferSize - strm.avail_out; 68 } while (status != Z_STREAM_END); 69 70 *compressed_size = data.size() - start - strm.avail_in; 71 TEST_AND_RETURN_FALSE(inflateEnd(&strm) == Z_OK); 72 return true; 73} 74 75} // namespace 76 77namespace puffin { 78 79using std::string; 80using std::vector; 81 82uint64_t BytesInByteExtents(const vector<ByteExtent>& extents) { 83 uint64_t bytes = 0; 84 for (const auto& extent : extents) { 85 bytes += extent.length; 86 } 87 return bytes; 88} 89 90// This function uses RFC1950 (https://www.ietf.org/rfc/rfc1950.txt) for the 91// definition of a zlib stream. For finding the deflate blocks, we relying on 92// the proper size of the zlib stream in |data|. Basically the size of the zlib 93// stream should be known before hand. Otherwise we need to parse the stream and 94// find the location of compressed blocks using CalculateSizeOfDeflateBlock(). 95bool LocateDeflatesInZlib(const Buffer& data, 96 std::vector<ByteExtent>* deflate_blocks) { 97 // A zlib stream has the following format: 98 // 0 1 compression method and flag 99 // 1 1 flag 100 // 2 4 preset dictionary (optional) 101 // 2 or 6 n compressed data 102 // n+(2 or 6) 4 Adler-32 checksum 103 TEST_AND_RETURN_FALSE(data.size() >= 6 + 4); // Header + Footer 104 uint16_t cmf = data[0]; 105 auto compression_method = cmf & 0x0F; 106 // For deflate compression_method should be 8. 107 TEST_AND_RETURN_FALSE(compression_method == 8); 108 109 auto cinfo = (cmf & 0xF0) >> 4; 110 // Value greater than 7 is not allowed in deflate. 111 TEST_AND_RETURN_FALSE(cinfo <= 7); 112 113 auto flag = data[1]; 114 TEST_AND_RETURN_FALSE(((cmf << 8) + flag) % 31 == 0); 115 116 uint64_t header_len = 2; 117 if (flag & 0x20) { 118 header_len += 4; // 4 bytes for the preset dictionary. 119 } 120 121 // 4 is for ADLER32. 122 deflate_blocks->emplace_back(header_len, data.size() - header_len - 4); 123 return true; 124} 125 126bool FindDeflateSubBlocks(const UniqueStreamPtr& src, 127 const vector<ByteExtent>& deflates, 128 vector<BitExtent>* subblock_deflates) { 129 Puffer puffer; 130 Buffer deflate_buffer; 131 for (const auto& deflate : deflates) { 132 TEST_AND_RETURN_FALSE(src->Seek(deflate.offset)); 133 // Read from src into deflate_buffer. 134 deflate_buffer.resize(deflate.length); 135 TEST_AND_RETURN_FALSE(src->Read(deflate_buffer.data(), deflate.length)); 136 137 // Find all the subblocks. 138 BufferBitReader bit_reader(deflate_buffer.data(), deflate.length); 139 BufferPuffWriter puff_writer(nullptr, 0); 140 Error error; 141 vector<BitExtent> subblocks; 142 TEST_AND_RETURN_FALSE( 143 puffer.PuffDeflate(&bit_reader, &puff_writer, &subblocks, &error)); 144 TEST_AND_RETURN_FALSE(deflate.length == bit_reader.Offset()); 145 for (const auto& subblock : subblocks) { 146 subblock_deflates->emplace_back(subblock.offset + deflate.offset * 8, 147 subblock.length); 148 } 149 } 150 return true; 151} 152 153bool LocateDeflatesInZlibBlocks(const string& file_path, 154 const vector<ByteExtent>& zlibs, 155 vector<BitExtent>* deflates) { 156 auto src = FileStream::Open(file_path, true, false); 157 TEST_AND_RETURN_FALSE(src); 158 159 Buffer buffer; 160 for (auto& zlib : zlibs) { 161 buffer.resize(zlib.length); 162 TEST_AND_RETURN_FALSE(src->Seek(zlib.offset)); 163 TEST_AND_RETURN_FALSE(src->Read(buffer.data(), buffer.size())); 164 165 vector<ByteExtent> deflate_blocks; 166 TEST_AND_RETURN_FALSE(LocateDeflatesInZlib(buffer, &deflate_blocks)); 167 168 vector<BitExtent> deflate_subblocks; 169 auto zlib_blc_src = MemoryStream::CreateForRead(buffer); 170 TEST_AND_RETURN_FALSE( 171 FindDeflateSubBlocks(zlib_blc_src, deflate_blocks, &deflate_subblocks)); 172 173 // Relocated based on the offset of the zlib. 174 for (const auto& def : deflate_subblocks) { 175 deflates->emplace_back(zlib.offset * 8 + def.offset, def.length); 176 } 177 } 178 return true; 179} 180 181// For more information about gzip format, refer to RFC 1952 located at: 182// https://www.ietf.org/rfc/rfc1952.txt 183bool LocateDeflatesInGzip(const Buffer& data, 184 vector<ByteExtent>* deflate_blocks) { 185 uint64_t member_start = 0; 186 while (member_start < data.size()) { 187 // Each member entry has the following format 188 // 0 1 0x1F 189 // 1 1 0x8B 190 // 2 1 compression method (8 denotes deflate) 191 // 3 1 set of flags 192 // 4 4 modification time 193 // 8 1 extra flags 194 // 9 1 operating system 195 TEST_AND_RETURN_FALSE(member_start + 10 <= data.size()); 196 TEST_AND_RETURN_FALSE(data[member_start + 0] == 0x1F); 197 TEST_AND_RETURN_FALSE(data[member_start + 1] == 0x8B); 198 TEST_AND_RETURN_FALSE(data[member_start + 2] == 8); 199 200 uint64_t offset = member_start + 10; 201 int flag = data[member_start + 3]; 202 // Extra field 203 if (flag & 4) { 204 TEST_AND_RETURN_FALSE(offset + 2 <= data.size()); 205 uint16_t extra_length = data[offset++]; 206 extra_length |= static_cast<uint16_t>(data[offset++]) << 8; 207 TEST_AND_RETURN_FALSE(offset + extra_length <= data.size()); 208 offset += extra_length; 209 } 210 // File name field 211 if (flag & 8) { 212 while (true) { 213 TEST_AND_RETURN_FALSE(offset + 1 <= data.size()); 214 if (data[offset++] == 0) { 215 break; 216 } 217 } 218 } 219 // File comment field 220 if (flag & 16) { 221 while (true) { 222 TEST_AND_RETURN_FALSE(offset + 1 <= data.size()); 223 if (data[offset++] == 0) { 224 break; 225 } 226 } 227 } 228 // CRC16 field 229 if (flag & 2) { 230 offset += 2; 231 } 232 233 uint64_t compressed_size, uncompressed_size; 234 TEST_AND_RETURN_FALSE(CalculateSizeOfDeflateBlock( 235 data, offset, &compressed_size, &uncompressed_size)); 236 TEST_AND_RETURN_FALSE(offset + compressed_size <= data.size()); 237 deflate_blocks->push_back(ByteExtent(offset, compressed_size)); 238 offset += compressed_size; 239 240 // Ignore CRC32; 241 TEST_AND_RETURN_FALSE(offset + 8 <= data.size()); 242 offset += 4; 243 uint32_t u_size = 0; 244 for (size_t i = 0; i < 4; i++) { 245 u_size |= static_cast<uint32_t>(data[offset++]) << (i * 8); 246 } 247 TEST_AND_RETURN_FALSE(uncompressed_size % (1 << 31) == u_size); 248 member_start = offset; 249 } 250 return true; 251} 252 253// For more information about the zip format, refer to 254// https://support.pkware.com/display/PKZIP/APPNOTE 255bool LocateDeflatesInZipArchive(const Buffer& data, 256 vector<ByteExtent>* deflate_blocks) { 257 uint64_t pos = 0; 258 while (pos <= data.size() - 30) { 259 // TODO(xunchang) add support for big endian system when searching for 260 // magic numbers. 261 if (get_unaligned<uint32_t>(data.data() + pos) != 0x04034b50) { 262 pos++; 263 continue; 264 } 265 266 // local file header format 267 // 0 4 0x04034b50 268 // 4 2 minimum version needed to extract 269 // 6 2 general purpose bit flag 270 // 8 2 compression method 271 // 10 4 file last modification date & time 272 // 14 4 CRC-32 273 // 18 4 compressed size 274 // 22 4 uncompressed size 275 // 26 2 file name length 276 // 28 2 extra field length 277 // 30 n file name 278 // 30+n m extra field 279 auto compression_method = get_unaligned<uint16_t>(data.data() + pos + 8); 280 if (compression_method != 8) { // non-deflate type 281 pos += 4; 282 continue; 283 } 284 285 auto compressed_size = get_unaligned<uint32_t>(data.data() + pos + 18); 286 auto uncompressed_size = get_unaligned<uint32_t>(data.data() + pos + 22); 287 auto file_name_length = get_unaligned<uint16_t>(data.data() + pos + 26); 288 auto extra_field_length = get_unaligned<uint16_t>(data.data() + pos + 28); 289 uint64_t header_size = 30 + file_name_length + extra_field_length; 290 291 // sanity check 292 if (static_cast<uint64_t>(header_size) + compressed_size > data.size() || 293 pos > data.size() - header_size - compressed_size) { 294 pos += 4; 295 continue; 296 } 297 298 uint64_t calculated_compressed_size; 299 uint64_t calculated_uncompressed_size; 300 if (!CalculateSizeOfDeflateBlock(data, pos + header_size, 301 &calculated_compressed_size, 302 &calculated_uncompressed_size)) { 303 LOG(ERROR) << "Failed to decompress the zip entry starting from: " << pos 304 << ", skip adding deflates for this entry."; 305 pos += 4; 306 continue; 307 } 308 309 // Double check the compressed size and uncompressed size if they are 310 // available in the file header. 311 if (compressed_size > 0 && compressed_size != calculated_compressed_size) { 312 LOG(WARNING) << "Compressed size in the file header: " << compressed_size 313 << " doesn't equal the real size: " 314 << calculated_compressed_size; 315 } 316 317 if (uncompressed_size > 0 && 318 uncompressed_size != calculated_uncompressed_size) { 319 LOG(WARNING) << "Uncompressed size in the file header: " 320 << uncompressed_size << " doesn't equal the real size: " 321 << calculated_uncompressed_size; 322 } 323 324 deflate_blocks->emplace_back(pos + header_size, calculated_compressed_size); 325 pos += header_size + calculated_compressed_size; 326 } 327 328 return true; 329} 330 331bool LocateDeflateSubBlocksInZipArchive(const Buffer& data, 332 vector<BitExtent>* deflates) { 333 vector<ByteExtent> deflate_blocks; 334 if (!LocateDeflatesInZipArchive(data, &deflate_blocks)) { 335 return false; 336 } 337 338 auto src = MemoryStream::CreateForRead(data); 339 return FindDeflateSubBlocks(src, deflate_blocks, deflates); 340} 341 342bool FindPuffLocations(const UniqueStreamPtr& src, 343 const vector<BitExtent>& deflates, 344 vector<ByteExtent>* puffs, 345 uint64_t* out_puff_size) { 346 Puffer puffer; 347 Buffer deflate_buffer; 348 349 // Here accumulate the size difference between each corresponding deflate and 350 // puff. At the end we add this cummulative size difference to the size of the 351 // deflate stream to get the size of the puff stream. We use signed size 352 // because puff size could be smaller than deflate size. 353 int64_t total_size_difference = 0; 354 for (auto deflate = deflates.begin(); deflate != deflates.end(); ++deflate) { 355 // Read from src into deflate_buffer. 356 auto start_byte = deflate->offset / 8; 357 auto end_byte = (deflate->offset + deflate->length + 7) / 8; 358 deflate_buffer.resize(end_byte - start_byte); 359 TEST_AND_RETURN_FALSE(src->Seek(start_byte)); 360 TEST_AND_RETURN_FALSE( 361 src->Read(deflate_buffer.data(), deflate_buffer.size())); 362 // Find the size of the puff. 363 BufferBitReader bit_reader(deflate_buffer.data(), deflate_buffer.size()); 364 uint64_t bits_to_skip = deflate->offset % 8; 365 TEST_AND_RETURN_FALSE(bit_reader.CacheBits(bits_to_skip)); 366 bit_reader.DropBits(bits_to_skip); 367 368 BufferPuffWriter puff_writer(nullptr, 0); 369 Error error; 370 TEST_AND_RETURN_FALSE( 371 puffer.PuffDeflate(&bit_reader, &puff_writer, nullptr, &error)); 372 TEST_AND_RETURN_FALSE(deflate_buffer.size() == bit_reader.Offset()); 373 374 // 1 if a deflate ends at the same byte that the next deflate starts and 375 // there is a few bits gap between them. In practice this may never happen, 376 // but it is a good idea to support it anyways. If there is a gap, the value 377 // of the gap will be saved as an integer byte to the puff stream. The parts 378 // of the byte that belogs to the deflates are shifted out. 379 int gap = 0; 380 if (deflate != deflates.begin()) { 381 auto prev_deflate = std::prev(deflate); 382 if ((prev_deflate->offset + prev_deflate->length == deflate->offset) 383 // If deflates are on byte boundary the gap will not be counted later, 384 // so we won't worry about it. 385 && (deflate->offset % 8 != 0)) { 386 gap = 1; 387 } 388 } 389 390 start_byte = ((deflate->offset + 7) / 8); 391 end_byte = (deflate->offset + deflate->length) / 8; 392 int64_t deflate_length_in_bytes = end_byte - start_byte; 393 394 // If there was no gap bits between the current and previous deflates, there 395 // will be no extra gap byte, so the offset will be shifted one byte back. 396 auto puff_offset = start_byte - gap + total_size_difference; 397 auto puff_size = puff_writer.Size(); 398 // Add the location into puff. 399 puffs->emplace_back(puff_offset, puff_size); 400 total_size_difference += 401 static_cast<int64_t>(puff_size) - deflate_length_in_bytes - gap; 402 } 403 404 uint64_t src_size; 405 TEST_AND_RETURN_FALSE(src->GetSize(&src_size)); 406 auto final_size = static_cast<int64_t>(src_size) + total_size_difference; 407 TEST_AND_RETURN_FALSE(final_size >= 0); 408 *out_puff_size = final_size; 409 return true; 410} 411 412} // namespace puffin 413