ZipFileRO.cpp revision 7c1b96a165f970a09ed239bb4fb3f1b0d8f2a407
1/*
2 * Copyright (C) 2007 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17//
18// Read-only access to Zip archives, with minimal heap allocation.
19//
20#define LOG_TAG "zipro"
21//#define LOG_NDEBUG 0
22#include "utils/ZipFileRO.h"
23#include "utils/Log.h"
24#include "utils/misc.h"
25
26#include <zlib.h>
27
28#include <string.h>
29#include <fcntl.h>
30#include <errno.h>
31#include <assert.h>
32
33using namespace android;
34
35/*
36 * Zip file constants.
37 */
38#define kEOCDSignature      0x06054b50
39#define kEOCDLen            22
40#define kEOCDNumEntries     8               // offset to #of entries in file
41#define kEOCDFileOffset     16              // offset to central directory
42
43#define kMaxCommentLen      65535           // longest possible in ushort
44#define kMaxEOCDSearch      (kMaxCommentLen + kEOCDLen)
45
46#define kLFHSignature       0x04034b50
47#define kLFHLen             30              // excluding variable-len fields
48#define kLFHNameLen         26              // offset to filename length
49#define kLFHExtraLen        28              // offset to extra length
50
51#define kCDESignature       0x02014b50
52#define kCDELen             46              // excluding variable-len fields
53#define kCDEMethod          10              // offset to compression method
54#define kCDEModWhen         12              // offset to modification timestamp
55#define kCDECRC             16              // offset to entry CRC
56#define kCDECompLen         20              // offset to compressed length
57#define kCDEUncompLen       24              // offset to uncompressed length
58#define kCDENameLen         28              // offset to filename length
59#define kCDEExtraLen        30              // offset to extra length
60#define kCDECommentLen      32              // offset to comment length
61#define kCDELocalOffset     42              // offset to local hdr
62
63/*
64 * The values we return for ZipEntryRO use 0 as an invalid value, so we
65 * want to adjust the hash table index by a fixed amount.  Using a large
66 * value helps insure that people don't mix & match arguments, e.g. to
67 * findEntryByIndex().
68 */
69#define kZipEntryAdj        10000
70
71/*
72 * Convert a ZipEntryRO to a hash table index, verifying that it's in a
73 * valid range.
74 */
75int ZipFileRO::entryToIndex(const ZipEntryRO entry) const
76{
77    long ent = ((long) entry) - kZipEntryAdj;
78    if (ent < 0 || ent >= mHashTableSize || mHashTable[ent].name == NULL) {
79        LOGW("Invalid ZipEntryRO %p (%ld)\n", entry, ent);
80        return -1;
81    }
82    return ent;
83}
84
85
86/*
87 * Open the specified file read-only.  We memory-map the entire thing and
88 * close the file before returning.
89 */
90status_t ZipFileRO::open(const char* zipFileName)
91{
92    int fd = -1;
93    off_t length;
94
95    assert(mFileMap == NULL);
96
97    /*
98     * Open and map the specified file.
99     */
100    fd = ::open(zipFileName, O_RDONLY);
101    if (fd < 0) {
102        LOGW("Unable to open zip '%s': %s\n", zipFileName, strerror(errno));
103        return NAME_NOT_FOUND;
104    }
105
106    length = lseek(fd, 0, SEEK_END);
107    if (length < 0) {
108        close(fd);
109        return UNKNOWN_ERROR;
110    }
111
112    mFileMap = new FileMap();
113    if (mFileMap == NULL) {
114        close(fd);
115        return NO_MEMORY;
116    }
117    if (!mFileMap->create(zipFileName, fd, 0, length, true)) {
118        LOGW("Unable to map '%s': %s\n", zipFileName, strerror(errno));
119        close(fd);
120        return UNKNOWN_ERROR;
121    }
122
123    mFd = fd;
124
125    /*
126     * Got it mapped, verify it and create data structures for fast access.
127     */
128    if (!parseZipArchive()) {
129        mFileMap->release();
130        mFileMap = NULL;
131        return UNKNOWN_ERROR;
132    }
133
134    return OK;
135}
136
137/*
138 * Parse the Zip archive, verifying its contents and initializing internal
139 * data structures.
140 */
141bool ZipFileRO::parseZipArchive(void)
142{
143#define CHECK_OFFSET(_off) {                                                \
144        if ((unsigned int) (_off) >= maxOffset) {                           \
145            LOGE("ERROR: bad offset %u (max %d): %s\n",                     \
146                (unsigned int) (_off), maxOffset, #_off);                   \
147            goto bail;                                                      \
148        }                                                                   \
149    }
150    const unsigned char* basePtr = (const unsigned char*)mFileMap->getDataPtr();
151    const unsigned char* ptr;
152    size_t length = mFileMap->getDataLength();
153    bool result = false;
154    unsigned int i, numEntries, cdOffset;
155    unsigned int val;
156
157    /*
158     * The first 4 bytes of the file will either be the local header
159     * signature for the first file (kLFHSignature) or, if the archive doesn't
160     * have any files in it, the end-of-central-directory signature
161     * (kEOCDSignature).
162     */
163    val = get4LE(basePtr);
164    if (val == kEOCDSignature) {
165        LOGI("Found Zip archive, but it looks empty\n");
166        goto bail;
167    } else if (val != kLFHSignature) {
168        LOGV("Not a Zip archive (found 0x%08x)\n", val);
169        goto bail;
170    }
171
172    /*
173     * Find the EOCD.  We'll find it immediately unless they have a file
174     * comment.
175     */
176    ptr = basePtr + length - kEOCDLen;
177
178    while (ptr >= basePtr) {
179        if (*ptr == (kEOCDSignature & 0xff) && get4LE(ptr) == kEOCDSignature)
180            break;
181        ptr--;
182    }
183    if (ptr < basePtr) {
184        LOGI("Could not find end-of-central-directory in Zip\n");
185        goto bail;
186    }
187
188    /*
189     * There are two interesting items in the EOCD block: the number of
190     * entries in the file, and the file offset of the start of the
191     * central directory.
192     *
193     * (There's actually a count of the #of entries in this file, and for
194     * all files which comprise a spanned archive, but for our purposes
195     * we're only interested in the current file.  Besides, we expect the
196     * two to be equivalent for our stuff.)
197     */
198    numEntries = get2LE(ptr + kEOCDNumEntries);
199    cdOffset = get4LE(ptr + kEOCDFileOffset);
200
201    /* valid offsets are [0,EOCD] */
202    unsigned int maxOffset;
203    maxOffset = (ptr - basePtr) +1;
204
205    LOGV("+++ numEntries=%d cdOffset=%d\n", numEntries, cdOffset);
206    if (numEntries == 0 || cdOffset >= length) {
207        LOGW("Invalid entries=%d offset=%d (len=%zd)\n",
208            numEntries, cdOffset, length);
209        goto bail;
210    }
211
212    /*
213     * Create hash table.  We have a minimum 75% load factor, possibly as
214     * low as 50% after we round off to a power of 2.
215     */
216    mNumEntries = numEntries;
217    mHashTableSize = roundUpPower2(1 + ((numEntries * 4) / 3));
218    mHashTable = (HashEntry*) calloc(1, sizeof(HashEntry) * mHashTableSize);
219
220    /*
221     * Walk through the central directory, adding entries to the hash
222     * table.
223     */
224    ptr = basePtr + cdOffset;
225    for (i = 0; i < numEntries; i++) {
226        unsigned int fileNameLen, extraLen, commentLen, localHdrOffset;
227        const unsigned char* localHdr;
228        unsigned int hash;
229
230        if (get4LE(ptr) != kCDESignature) {
231            LOGW("Missed a central dir sig (at %d)\n", i);
232            goto bail;
233        }
234        if (ptr + kCDELen > basePtr + length) {
235            LOGW("Ran off the end (at %d)\n", i);
236            goto bail;
237        }
238
239        localHdrOffset = get4LE(ptr + kCDELocalOffset);
240        CHECK_OFFSET(localHdrOffset);
241        fileNameLen = get2LE(ptr + kCDENameLen);
242        extraLen = get2LE(ptr + kCDEExtraLen);
243        commentLen = get2LE(ptr + kCDECommentLen);
244
245        //LOGV("+++ %d: localHdr=%d fnl=%d el=%d cl=%d\n",
246        //    i, localHdrOffset, fileNameLen, extraLen, commentLen);
247        //LOGV(" '%.*s'\n", fileNameLen, ptr + kCDELen);
248
249        /* add the CDE filename to the hash table */
250        hash = computeHash((const char*)ptr + kCDELen, fileNameLen);
251        addToHash((const char*)ptr + kCDELen, fileNameLen, hash);
252
253        localHdr = basePtr + localHdrOffset;
254        if (get4LE(localHdr) != kLFHSignature) {
255            LOGW("Bad offset to local header: %d (at %d)\n",
256                localHdrOffset, i);
257            goto bail;
258        }
259
260        ptr += kCDELen + fileNameLen + extraLen + commentLen;
261        CHECK_OFFSET(ptr - basePtr);
262    }
263
264    result = true;
265
266bail:
267    return result;
268#undef CHECK_OFFSET
269}
270
271
272/*
273 * Simple string hash function for non-null-terminated strings.
274 */
275/*static*/ unsigned int ZipFileRO::computeHash(const char* str, int len)
276{
277    unsigned int hash = 0;
278
279    while (len--)
280        hash = hash * 31 + *str++;
281
282    return hash;
283}
284
285/*
286 * Add a new entry to the hash table.
287 */
288void ZipFileRO::addToHash(const char* str, int strLen, unsigned int hash)
289{
290    int ent = hash & (mHashTableSize-1);
291
292    /*
293     * We over-allocate the table, so we're guaranteed to find an empty slot.
294     */
295    while (mHashTable[ent].name != NULL)
296        ent = (ent + 1) & (mHashTableSize-1);
297
298    mHashTable[ent].name = str;
299    mHashTable[ent].nameLen = strLen;
300}
301
302/*
303 * Find a matching entry.
304 *
305 * Returns 0 if not found.
306 */
307ZipEntryRO ZipFileRO::findEntryByName(const char* fileName) const
308{
309    int nameLen = strlen(fileName);
310    unsigned int hash = computeHash(fileName, nameLen);
311    int ent = hash & (mHashTableSize-1);
312
313    while (mHashTable[ent].name != NULL) {
314        if (mHashTable[ent].nameLen == nameLen &&
315            memcmp(mHashTable[ent].name, fileName, nameLen) == 0)
316        {
317            /* match */
318            return (ZipEntryRO) (ent + kZipEntryAdj);
319        }
320
321        ent = (ent + 1) & (mHashTableSize-1);
322    }
323
324    return NULL;
325}
326
327/*
328 * Find the Nth entry.
329 *
330 * This currently involves walking through the sparse hash table, counting
331 * non-empty entries.  If we need to speed this up we can either allocate
332 * a parallel lookup table or (perhaps better) provide an iterator interface.
333 */
334ZipEntryRO ZipFileRO::findEntryByIndex(int idx) const
335{
336    if (idx < 0 || idx >= mNumEntries) {
337        LOGW("Invalid index %d\n", idx);
338        return NULL;
339    }
340
341    for (int ent = 0; ent < mHashTableSize; ent++) {
342        if (mHashTable[ent].name != NULL) {
343            if (idx-- == 0)
344                return (ZipEntryRO) (ent + kZipEntryAdj);
345        }
346    }
347
348    return NULL;
349}
350
351/*
352 * Get the useful fields from the zip entry.
353 *
354 * Returns "false" if the offsets to the fields or the contents of the fields
355 * appear to be bogus.
356 */
357bool ZipFileRO::getEntryInfo(ZipEntryRO entry, int* pMethod, long* pUncompLen,
358    long* pCompLen, off_t* pOffset, long* pModWhen, long* pCrc32) const
359{
360    int ent = entryToIndex(entry);
361    if (ent < 0)
362        return false;
363
364    /*
365     * Recover the start of the central directory entry from the filename
366     * pointer.
367     */
368    const unsigned char* basePtr = (const unsigned char*)mFileMap->getDataPtr();
369    const unsigned char* ptr = (const unsigned char*) mHashTable[ent].name;
370    size_t zipLength = mFileMap->getDataLength();
371
372    ptr -= kCDELen;
373
374    int method = get2LE(ptr + kCDEMethod);
375    if (pMethod != NULL)
376        *pMethod = method;
377
378    if (pModWhen != NULL)
379        *pModWhen = get4LE(ptr + kCDEModWhen);
380    if (pCrc32 != NULL)
381        *pCrc32 = get4LE(ptr + kCDECRC);
382
383    /*
384     * We need to make sure that the lengths are not so large that somebody
385     * trying to map the compressed or uncompressed data runs off the end
386     * of the mapped region.
387     */
388    unsigned long localHdrOffset = get4LE(ptr + kCDELocalOffset);
389    if (localHdrOffset + kLFHLen >= zipLength) {
390        LOGE("ERROR: bad local hdr offset in zip\n");
391        return false;
392    }
393    const unsigned char* localHdr = basePtr + localHdrOffset;
394    off_t dataOffset = localHdrOffset + kLFHLen
395        + get2LE(localHdr + kLFHNameLen) + get2LE(localHdr + kLFHExtraLen);
396    if ((unsigned long) dataOffset >= zipLength) {
397        LOGE("ERROR: bad data offset in zip\n");
398        return false;
399    }
400
401    if (pCompLen != NULL) {
402        *pCompLen = get4LE(ptr + kCDECompLen);
403        if (*pCompLen < 0 || (size_t)(dataOffset + *pCompLen) >= zipLength) {
404            LOGE("ERROR: bad compressed length in zip\n");
405            return false;
406        }
407    }
408    if (pUncompLen != NULL) {
409        *pUncompLen = get4LE(ptr + kCDEUncompLen);
410        if (*pUncompLen < 0) {
411            LOGE("ERROR: negative uncompressed length in zip\n");
412            return false;
413        }
414        if (method == kCompressStored &&
415            (size_t)(dataOffset + *pUncompLen) >= zipLength)
416        {
417            LOGE("ERROR: bad uncompressed length in zip\n");
418            return false;
419        }
420    }
421
422    if (pOffset != NULL) {
423        *pOffset = dataOffset;
424    }
425    return true;
426}
427
428/*
429 * Copy the entry's filename to the buffer.
430 */
431int ZipFileRO::getEntryFileName(ZipEntryRO entry, char* buffer, int bufLen)
432    const
433{
434    int ent = entryToIndex(entry);
435    if (ent < 0)
436        return -1;
437
438    int nameLen = mHashTable[ent].nameLen;
439    if (bufLen < nameLen+1)
440        return nameLen+1;
441
442    memcpy(buffer, mHashTable[ent].name, nameLen);
443    buffer[nameLen] = '\0';
444    return 0;
445}
446
447/*
448 * Create a new FileMap object that spans the data in "entry".
449 */
450FileMap* ZipFileRO::createEntryFileMap(ZipEntryRO entry) const
451{
452    /*
453     * TODO: the efficient way to do this is to modify FileMap to allow
454     * sub-regions of a file to be mapped.  A reference-counting scheme
455     * can manage the base memory mapping.  For now, we just create a brand
456     * new mapping off of the Zip archive file descriptor.
457     */
458
459    FileMap* newMap;
460    long compLen;
461    off_t offset;
462
463    if (!getEntryInfo(entry, NULL, NULL, &compLen, &offset, NULL, NULL))
464        return NULL;
465
466    newMap = new FileMap();
467    if (!newMap->create(mFileMap->getFileName(), mFd, offset, compLen, true)) {
468        newMap->release();
469        return NULL;
470    }
471
472    return newMap;
473}
474
475/*
476 * Uncompress an entry, in its entirety, into the provided output buffer.
477 *
478 * This doesn't verify the data's CRC, which might be useful for
479 * uncompressed data.  The caller should be able to manage it.
480 */
481bool ZipFileRO::uncompressEntry(ZipEntryRO entry, void* buffer) const
482{
483    const int kSequentialMin = 32768;
484    bool result = false;
485    int ent = entryToIndex(entry);
486    if (ent < 0)
487        return -1;
488
489    const unsigned char* basePtr = (const unsigned char*)mFileMap->getDataPtr();
490    int method;
491    long uncompLen, compLen;
492    off_t offset;
493
494    getEntryInfo(entry, &method, &uncompLen, &compLen, &offset, NULL, NULL);
495
496    /*
497     * Experiment with madvise hint.  When we want to uncompress a file,
498     * we pull some stuff out of the central dir entry and then hit a
499     * bunch of compressed or uncompressed data sequentially.  The CDE
500     * visit will cause a limited amount of read-ahead because it's at
501     * the end of the file.  We could end up doing lots of extra disk
502     * access if the file we're prying open is small.  Bottom line is we
503     * probably don't want to turn MADV_SEQUENTIAL on and leave it on.
504     *
505     * So, if the compressed size of the file is above a certain minimum
506     * size, temporarily boost the read-ahead in the hope that the extra
507     * pair of system calls are negated by a reduction in page faults.
508     */
509    if (compLen > kSequentialMin)
510        mFileMap->advise(FileMap::SEQUENTIAL);
511
512    if (method == kCompressStored) {
513        memcpy(buffer, basePtr + offset, uncompLen);
514    } else {
515        if (!inflateBuffer(buffer, basePtr + offset, uncompLen, compLen))
516            goto bail;
517    }
518
519    if (compLen > kSequentialMin)
520        mFileMap->advise(FileMap::NORMAL);
521
522    result = true;
523
524bail:
525    return result;
526}
527
528/*
529 * Uncompress an entry, in its entirety, to an open file descriptor.
530 *
531 * This doesn't verify the data's CRC, but probably should.
532 */
533bool ZipFileRO::uncompressEntry(ZipEntryRO entry, int fd) const
534{
535    bool result = false;
536    int ent = entryToIndex(entry);
537    if (ent < 0)
538        return -1;
539
540    const unsigned char* basePtr = (const unsigned char*)mFileMap->getDataPtr();
541    int method;
542    long uncompLen, compLen;
543    off_t offset;
544
545    getEntryInfo(entry, &method, &uncompLen, &compLen, &offset, NULL, NULL);
546
547    if (method == kCompressStored) {
548        ssize_t actual;
549
550        actual = write(fd, basePtr + offset, uncompLen);
551        if (actual < 0) {
552            LOGE("Write failed: %s\n", strerror(errno));
553            goto bail;
554        } else if (actual != uncompLen) {
555            LOGE("Partial write during uncompress (%d of %ld)\n",
556                (int)actual, uncompLen);
557            goto bail;
558        } else {
559            LOGI("+++ successful write\n");
560        }
561    } else {
562        if (!inflateBuffer(fd, basePtr+offset, uncompLen, compLen))
563            goto bail;
564    }
565
566    result = true;
567
568bail:
569    return result;
570}
571
572/*
573 * Uncompress "deflate" data from one buffer to another.
574 */
575/*static*/ bool ZipFileRO::inflateBuffer(void* outBuf, const void* inBuf,
576    long uncompLen, long compLen)
577{
578    bool result = false;
579    z_stream zstream;
580    int zerr;
581
582    /*
583     * Initialize the zlib stream struct.
584     */
585	memset(&zstream, 0, sizeof(zstream));
586    zstream.zalloc = Z_NULL;
587    zstream.zfree = Z_NULL;
588    zstream.opaque = Z_NULL;
589    zstream.next_in = (Bytef*)inBuf;
590    zstream.avail_in = compLen;
591    zstream.next_out = (Bytef*) outBuf;
592    zstream.avail_out = uncompLen;
593    zstream.data_type = Z_UNKNOWN;
594
595	/*
596	 * Use the undocumented "negative window bits" feature to tell zlib
597	 * that there's no zlib header waiting for it.
598	 */
599    zerr = inflateInit2(&zstream, -MAX_WBITS);
600    if (zerr != Z_OK) {
601        if (zerr == Z_VERSION_ERROR) {
602            LOGE("Installed zlib is not compatible with linked version (%s)\n",
603                ZLIB_VERSION);
604        } else {
605            LOGE("Call to inflateInit2 failed (zerr=%d)\n", zerr);
606        }
607        goto bail;
608    }
609
610    /*
611     * Expand data.
612     */
613    zerr = inflate(&zstream, Z_FINISH);
614    if (zerr != Z_STREAM_END) {
615        LOGW("Zip inflate failed, zerr=%d (nIn=%p aIn=%u nOut=%p aOut=%u)\n",
616            zerr, zstream.next_in, zstream.avail_in,
617            zstream.next_out, zstream.avail_out);
618        goto z_bail;
619    }
620
621    /* paranoia */
622    if ((long) zstream.total_out != uncompLen) {
623        LOGW("Size mismatch on inflated file (%ld vs %ld)\n",
624            zstream.total_out, uncompLen);
625        goto z_bail;
626    }
627
628    result = true;
629
630z_bail:
631    inflateEnd(&zstream);        /* free up any allocated structures */
632
633bail:
634    return result;
635}
636
637/*
638 * Uncompress "deflate" data from one buffer to an open file descriptor.
639 */
640/*static*/ bool ZipFileRO::inflateBuffer(int fd, const void* inBuf,
641    long uncompLen, long compLen)
642{
643    bool result = false;
644    const int kWriteBufSize = 32768;
645    unsigned char writeBuf[kWriteBufSize];
646    z_stream zstream;
647    int zerr;
648
649    /*
650     * Initialize the zlib stream struct.
651     */
652	memset(&zstream, 0, sizeof(zstream));
653    zstream.zalloc = Z_NULL;
654    zstream.zfree = Z_NULL;
655    zstream.opaque = Z_NULL;
656    zstream.next_in = (Bytef*)inBuf;
657    zstream.avail_in = compLen;
658    zstream.next_out = (Bytef*) writeBuf;
659    zstream.avail_out = sizeof(writeBuf);
660    zstream.data_type = Z_UNKNOWN;
661
662	/*
663	 * Use the undocumented "negative window bits" feature to tell zlib
664	 * that there's no zlib header waiting for it.
665	 */
666    zerr = inflateInit2(&zstream, -MAX_WBITS);
667    if (zerr != Z_OK) {
668        if (zerr == Z_VERSION_ERROR) {
669            LOGE("Installed zlib is not compatible with linked version (%s)\n",
670                ZLIB_VERSION);
671        } else {
672            LOGE("Call to inflateInit2 failed (zerr=%d)\n", zerr);
673        }
674        goto bail;
675    }
676
677    /*
678     * Loop while we have more to do.
679     */
680    do {
681        /*
682         * Expand data.
683         */
684        zerr = inflate(&zstream, Z_NO_FLUSH);
685        if (zerr != Z_OK && zerr != Z_STREAM_END) {
686            LOGW("zlib inflate: zerr=%d (nIn=%p aIn=%u nOut=%p aOut=%u)\n",
687                zerr, zstream.next_in, zstream.avail_in,
688                zstream.next_out, zstream.avail_out);
689            goto z_bail;
690        }
691
692        /* write when we're full or when we're done */
693        if (zstream.avail_out == 0 ||
694            (zerr == Z_STREAM_END && zstream.avail_out != sizeof(writeBuf)))
695        {
696            long writeSize = zstream.next_out - writeBuf;
697            int cc = write(fd, writeBuf, writeSize);
698            if (cc != (int) writeSize) {
699                LOGW("write failed in inflate (%d vs %ld)\n", cc, writeSize);
700                goto z_bail;
701            }
702
703            zstream.next_out = writeBuf;
704            zstream.avail_out = sizeof(writeBuf);
705        }
706    } while (zerr == Z_OK);
707
708    assert(zerr == Z_STREAM_END);       /* other errors should've been caught */
709
710    /* paranoia */
711    if ((long) zstream.total_out != uncompLen) {
712        LOGW("Size mismatch on inflated file (%ld vs %ld)\n",
713            zstream.total_out, uncompLen);
714        goto z_bail;
715    }
716
717    result = true;
718
719z_bail:
720    inflateEnd(&zstream);        /* free up any allocated structures */
721
722bail:
723    return result;
724}
725