libs/utils/ZipFileRO.cpp

/*
 * Copyright (C) 2007 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//
// Read-only access to Zip archives, with minimal heap allocation.
//
#define LOG_TAG "zipro"
//#define LOG_NDEBUG 0
#include "utils/ZipFileRO.h"
#include "utils/Log.h"
#include "utils/misc.h"

#include <zlib.h>

#include <string.h>
#include <fcntl.h>
#include <errno.h>
#include <assert.h>

using namespace android;

/*
 * Zip file constants.
 */
#define kEOCDSignature      0x06054b50
#define kEOCDLen            22
#define kEOCDNumEntries     8               // offset to #of entries in file
#define kEOCDFileOffset     16              // offset to central directory

#define kMaxCommentLen      65535           // longest possible in ushort
#define kMaxEOCDSearch      (kMaxCommentLen + kEOCDLen)

#define kLFHSignature       0x04034b50
#define kLFHLen             30              // excluding variable-len fields
#define kLFHNameLen         26              // offset to filename length
#define kLFHExtraLen        28              // offset to extra length

#define kCDESignature       0x02014b50
#define kCDELen             46              // excluding variable-len fields
#define kCDEMethod          10              // offset to compression method
#define kCDEModWhen         12              // offset to modification timestamp
#define kCDECRC             16              // offset to entry CRC
#define kCDECompLen         20              // offset to compressed length
#define kCDEUncompLen       24              // offset to uncompressed length
#define kCDENameLen         28              // offset to filename length
#define kCDEExtraLen        30              // offset to extra length
#define kCDECommentLen      32              // offset to comment length
#define kCDELocalOffset     42              // offset to local hdr

/*
 * The values we return for ZipEntryRO use 0 as an invalid value, so we
 * want to adjust the hash table index by a fixed amount.  Using a large
 * value helps insure that people don't mix & match arguments, e.g. to
 * findEntryByIndex().
 */
#define kZipEntryAdj        10000

/*
 * Convert a ZipEntryRO to a hash table index, verifying that it's in a
 * valid range.
 */
int ZipFileRO::entryToIndex(const ZipEntryRO entry) const
{
    long ent = ((long) entry) - kZipEntryAdj;
    if (ent < 0 || ent >= mHashTableSize || mHashTable[ent].name == NULL) {
        LOGW("Invalid ZipEntryRO %p (%ld)\n", entry, ent);
        return -1;
    }
    return ent;
}


/*
 * Open the specified file read-only.  We memory-map the entire thing and
 * close the file before returning.
 */
status_t ZipFileRO::open(const char* zipFileName)
{
    int fd = -1;
    off_t length;

    assert(mFileMap == NULL);

    /*
     * Open and map the specified file.
     */
    fd = ::open(zipFileName, O_RDONLY);
    if (fd < 0) {
        LOGW("Unable to open zip '%s': %s\n", zipFileName, strerror(errno));
        return NAME_NOT_FOUND;
    }

    length = lseek(fd, 0, SEEK_END);
    if (length < 0) {
        close(fd);
        return UNKNOWN_ERROR;
    }

    mFileMap = new FileMap();
    if (mFileMap == NULL) {
        close(fd);
        return NO_MEMORY;
    }
    if (!mFileMap->create(zipFileName, fd, 0, length, true)) {
        LOGW("Unable to map '%s': %s\n", zipFileName, strerror(errno));
        close(fd);
        return UNKNOWN_ERROR;
    }

    mFd = fd;

    /*
     * Got it mapped, verify it and create data structures for fast access.
     */
    if (!parseZipArchive()) {
        mFileMap->release();
        mFileMap = NULL;
        return UNKNOWN_ERROR;
    }

    return OK;
}

/*
 * Parse the Zip archive, verifying its contents and initializing internal
 * data structures.
 */
bool ZipFileRO::parseZipArchive(void)
{
#define CHECK_OFFSET(_off) {                                                \
        if ((unsigned int) (_off) >= maxOffset) {                           \
            LOGE("ERROR: bad offset %u (max %d): %s\n",                     \
                (unsigned int) (_off), maxOffset, #_off);                   \
            goto bail;                                                      \
        }                                                                   \
    }
    const unsigned char* basePtr = (const unsigned char*)mFileMap->getDataPtr();
    const unsigned char* ptr;
    size_t length = mFileMap->getDataLength();
    bool result = false;
    unsigned int i, numEntries, cdOffset;
    unsigned int val;

    /*
     * The first 4 bytes of the file will either be the local header
     * signature for the first file (kLFHSignature) or, if the archive doesn't
     * have any files in it, the end-of-central-directory signature
     * (kEOCDSignature).
     */
    val = get4LE(basePtr);
    if (val == kEOCDSignature) {
        LOGI("Found Zip archive, but it looks empty\n");
        goto bail;
    } else if (val != kLFHSignature) {
        LOGV("Not a Zip archive (found 0x%08x)\n", val);
        goto bail;
    }

    /*
     * Find the EOCD.  We'll find it immediately unless they have a file
     * comment.
     */
    ptr = basePtr + length - kEOCDLen;

    while (ptr >= basePtr) {
        if (*ptr == (kEOCDSignature & 0xff) && get4LE(ptr) == kEOCDSignature)
            break;
        ptr--;
    }
    if (ptr < basePtr) {
        LOGI("Could not find end-of-central-directory in Zip\n");
        goto bail;
    }

    /*
     * There are two interesting items in the EOCD block: the number of
     * entries in the file, and the file offset of the start of the
     * central directory.
     *
     * (There's actually a count of the #of entries in this file, and for
     * all files which comprise a spanned archive, but for our purposes
     * we're only interested in the current file.  Besides, we expect the
     * two to be equivalent for our stuff.)
     */
    numEntries = get2LE(ptr + kEOCDNumEntries);
    cdOffset = get4LE(ptr + kEOCDFileOffset);

    /* valid offsets are [0,EOCD] */
    unsigned int maxOffset;
    maxOffset = (ptr - basePtr) +1;

    LOGV("+++ numEntries=%d cdOffset=%d\n", numEntries, cdOffset);
    if (numEntries == 0 || cdOffset >= length) {
        LOGW("Invalid entries=%d offset=%d (len=%zd)\n",
            numEntries, cdOffset, length);
        goto bail;
    }

    /*
     * Create hash table.  We have a minimum 75% load factor, possibly as
     * low as 50% after we round off to a power of 2.
     */
    mNumEntries = numEntries;
    mHashTableSize = roundUpPower2(1 + ((numEntries * 4) / 3));
    mHashTable = (HashEntry*) calloc(1, sizeof(HashEntry) * mHashTableSize);

    /*
     * Walk through the central directory, adding entries to the hash
     * table.
     */
    ptr = basePtr + cdOffset;
    for (i = 0; i < numEntries; i++) {
        unsigned int fileNameLen, extraLen, commentLen, localHdrOffset;
        const unsigned char* localHdr;
        unsigned int hash;

        if (get4LE(ptr) != kCDESignature) {
            LOGW("Missed a central dir sig (at %d)\n", i);
            goto bail;
        }
        if (ptr + kCDELen > basePtr + length) {
            LOGW("Ran off the end (at %d)\n", i);
            goto bail;
        }

        localHdrOffset = get4LE(ptr + kCDELocalOffset);
        CHECK_OFFSET(localHdrOffset);
        fileNameLen = get2LE(ptr + kCDENameLen);
        extraLen = get2LE(ptr + kCDEExtraLen);
        commentLen = get2LE(ptr + kCDECommentLen);

        //LOGV("+++ %d: localHdr=%d fnl=%d el=%d cl=%d\n",
        //    i, localHdrOffset, fileNameLen, extraLen, commentLen);
        //LOGV(" '%.*s'\n", fileNameLen, ptr + kCDELen);

        /* add the CDE filename to the hash table */
        hash = computeHash((const char*)ptr + kCDELen, fileNameLen);
        addToHash((const char*)ptr + kCDELen, fileNameLen, hash);

        localHdr = basePtr + localHdrOffset;
        if (get4LE(localHdr) != kLFHSignature) {
            LOGW("Bad offset to local header: %d (at %d)\n",
                localHdrOffset, i);
            goto bail;
        }

        ptr += kCDELen + fileNameLen + extraLen + commentLen;
        CHECK_OFFSET(ptr - basePtr);
    }

    result = true;

bail:
    return result;
#undef CHECK_OFFSET
}


/*
 * Simple string hash function for non-null-terminated strings.
 */
/*static*/ unsigned int ZipFileRO::computeHash(const char* str, int len)
{
    unsigned int hash = 0;

    while (len--)
        hash = hash * 31 + *str++;

    return hash;
}

/*
 * Add a new entry to the hash table.
 */
void ZipFileRO::addToHash(const char* str, int strLen, unsigned int hash)
{
    int ent = hash & (mHashTableSize-1);

    /*
     * We over-allocate the table, so we're guaranteed to find an empty slot.
     */
    while (mHashTable[ent].name != NULL)
        ent = (ent + 1) & (mHashTableSize-1);

    mHashTable[ent].name = str;
    mHashTable[ent].nameLen = strLen;
}

/*
 * Find a matching entry.
 *
 * Returns 0 if not found.
 */
ZipEntryRO ZipFileRO::findEntryByName(const char* fileName) const
{
    int nameLen = strlen(fileName);
    unsigned int hash = computeHash(fileName, nameLen);
    int ent = hash & (mHashTableSize-1);

    while (mHashTable[ent].name != NULL) {
        if (mHashTable[ent].nameLen == nameLen &&
            memcmp(mHashTable[ent].name, fileName, nameLen) == 0)
        {
            /* match */
            return (ZipEntryRO) (ent + kZipEntryAdj);
        }

        ent = (ent + 1) & (mHashTableSize-1);
    }

    return NULL;
}

/*
 * Find the Nth entry.
 *
 * This currently involves walking through the sparse hash table, counting
 * non-empty entries.  If we need to speed this up we can either allocate
 * a parallel lookup table or (perhaps better) provide an iterator interface.
 */
ZipEntryRO ZipFileRO::findEntryByIndex(int idx) const
{
    if (idx < 0 || idx >= mNumEntries) {
        LOGW("Invalid index %d\n", idx);
        return NULL;
    }

    for (int ent = 0; ent < mHashTableSize; ent++) {
        if (mHashTable[ent].name != NULL) {
            if (idx-- == 0)
                return (ZipEntryRO) (ent + kZipEntryAdj);
        }
    }

    return NULL;
}

/*
 * Get the useful fields from the zip entry.
 *
 * Returns "false" if the offsets to the fields or the contents of the fields
 * appear to be bogus.
 */
bool ZipFileRO::getEntryInfo(ZipEntryRO entry, int* pMethod, long* pUncompLen,
    long* pCompLen, off_t* pOffset, long* pModWhen, long* pCrc32) const
{
    int ent = entryToIndex(entry);
    if (ent < 0)
        return false;

    /*
     * Recover the start of the central directory entry from the filename
     * pointer.
     */
    const unsigned char* basePtr = (const unsigned char*)mFileMap->getDataPtr();
    const unsigned char* ptr = (const unsigned char*) mHashTable[ent].name;
    size_t zipLength = mFileMap->getDataLength();

    ptr -= kCDELen;

    int method = get2LE(ptr + kCDEMethod);
    if (pMethod != NULL)
        *pMethod = method;

    if (pModWhen != NULL)
        *pModWhen = get4LE(ptr + kCDEModWhen);
    if (pCrc32 != NULL)
        *pCrc32 = get4LE(ptr + kCDECRC);

    /*
     * We need to make sure that the lengths are not so large that somebody
     * trying to map the compressed or uncompressed data runs off the end
     * of the mapped region.
     */
    unsigned long localHdrOffset = get4LE(ptr + kCDELocalOffset);
    if (localHdrOffset + kLFHLen >= zipLength) {
        LOGE("ERROR: bad local hdr offset in zip\n");
        return false;
    }
    const unsigned char* localHdr = basePtr + localHdrOffset;
    off_t dataOffset = localHdrOffset + kLFHLen
        + get2LE(localHdr + kLFHNameLen) + get2LE(localHdr + kLFHExtraLen);
    if ((unsigned long) dataOffset >= zipLength) {
        LOGE("ERROR: bad data offset in zip\n");
        return false;
    }

    if (pCompLen != NULL) {
        *pCompLen = get4LE(ptr + kCDECompLen);
        if (*pCompLen < 0 || (size_t)(dataOffset + *pCompLen) >= zipLength) {
            LOGE("ERROR: bad compressed length in zip\n");
            return false;
        }
    }
    if (pUncompLen != NULL) {
        *pUncompLen = get4LE(ptr + kCDEUncompLen);
        if (*pUncompLen < 0) {
            LOGE("ERROR: negative uncompressed length in zip\n");
            return false;
        }
        if (method == kCompressStored &&
            (size_t)(dataOffset + *pUncompLen) >= zipLength)
        {
            LOGE("ERROR: bad uncompressed length in zip\n");
            return false;
        }
    }

    if (pOffset != NULL) {
        *pOffset = dataOffset;
    }
    return true;
}

/*
 * Copy the entry's filename to the buffer.
 */
int ZipFileRO::getEntryFileName(ZipEntryRO entry, char* buffer, int bufLen)
    const
{
    int ent = entryToIndex(entry);
    if (ent < 0)
        return -1;

    int nameLen = mHashTable[ent].nameLen;
    if (bufLen < nameLen+1)
        return nameLen+1;

    memcpy(buffer, mHashTable[ent].name, nameLen);
    buffer[nameLen] = '\0';
    return 0;
}

/*
 * Create a new FileMap object that spans the data in "entry".
 */
FileMap* ZipFileRO::createEntryFileMap(ZipEntryRO entry) const
{
    /*
     * TODO: the efficient way to do this is to modify FileMap to allow
     * sub-regions of a file to be mapped.  A reference-counting scheme
     * can manage the base memory mapping.  For now, we just create a brand
     * new mapping off of the Zip archive file descriptor.
     */

    FileMap* newMap;
    long compLen;
    off_t offset;

    if (!getEntryInfo(entry, NULL, NULL, &compLen, &offset, NULL, NULL))
        return NULL;

    newMap = new FileMap();
    if (!newMap->create(mFileMap->getFileName(), mFd, offset, compLen, true)) {
        newMap->release();
        return NULL;
    }

    return newMap;
}

/*
 * Uncompress an entry, in its entirety, into the provided output buffer.
 *
 * This doesn't verify the data's CRC, which might be useful for
 * uncompressed data.  The caller should be able to manage it.
 */
bool ZipFileRO::uncompressEntry(ZipEntryRO entry, void* buffer) const
{
    const int kSequentialMin = 32768;
    bool result = false;
    int ent = entryToIndex(entry);
    if (ent < 0)
        return -1;

    const unsigned char* basePtr = (const unsigned char*)mFileMap->getDataPtr();
    int method;
    long uncompLen, compLen;
    off_t offset;

    getEntryInfo(entry, &method, &uncompLen, &compLen, &offset, NULL, NULL);

    /*
     * Experiment with madvise hint.  When we want to uncompress a file,
     * we pull some stuff out of the central dir entry and then hit a
     * bunch of compressed or uncompressed data sequentially.  The CDE
     * visit will cause a limited amount of read-ahead because it's at
     * the end of the file.  We could end up doing lots of extra disk
     * access if the file we're prying open is small.  Bottom line is we
     * probably don't want to turn MADV_SEQUENTIAL on and leave it on.
     *
     * So, if the compressed size of the file is above a certain minimum
     * size, temporarily boost the read-ahead in the hope that the extra
     * pair of system calls are negated by a reduction in page faults.
     */
    if (compLen > kSequentialMin)
        mFileMap->advise(FileMap::SEQUENTIAL);

    if (method == kCompressStored) {
        memcpy(buffer, basePtr + offset, uncompLen);
    } else {
        if (!inflateBuffer(buffer, basePtr + offset, uncompLen, compLen))
            goto bail;
    }

    if (compLen > kSequentialMin)
        mFileMap->advise(FileMap::NORMAL);

    result = true;

bail:
    return result;
}

/*
 * Uncompress an entry, in its entirety, to an open file descriptor.
 *
 * This doesn't verify the data's CRC, but probably should.
 */
bool ZipFileRO::uncompressEntry(ZipEntryRO entry, int fd) const
{
    bool result = false;
    int ent = entryToIndex(entry);
    if (ent < 0)
        return -1;

    const unsigned char* basePtr = (const unsigned char*)mFileMap->getDataPtr();
    int method;
    long uncompLen, compLen;
    off_t offset;

    getEntryInfo(entry, &method, &uncompLen, &compLen, &offset, NULL, NULL);

    if (method == kCompressStored) {
        ssize_t actual;

        actual = write(fd, basePtr + offset, uncompLen);
        if (actual < 0) {
            LOGE("Write failed: %s\n", strerror(errno));
            goto bail;
        } else if (actual != uncompLen) {
            LOGE("Partial write during uncompress (%d of %ld)\n",
                (int)actual, uncompLen);
            goto bail;
        } else {
            LOGI("+++ successful write\n");
        }
    } else {
        if (!inflateBuffer(fd, basePtr+offset, uncompLen, compLen))
            goto bail;
    }

    result = true;

bail:
    return result;
}

/*
 * Uncompress "deflate" data from one buffer to another.
 */
/*static*/ bool ZipFileRO::inflateBuffer(void* outBuf, const void* inBuf,
    long uncompLen, long compLen)
{
    bool result = false;
    z_stream zstream;
    int zerr;

    /*
     * Initialize the zlib stream struct.
     */
	memset(&zstream, 0, sizeof(zstream));
    zstream.zalloc = Z_NULL;
    zstream.zfree = Z_NULL;
    zstream.opaque = Z_NULL;
    zstream.next_in = (Bytef*)inBuf;
    zstream.avail_in = compLen;
    zstream.next_out = (Bytef*) outBuf;
    zstream.avail_out = uncompLen;
    zstream.data_type = Z_UNKNOWN;

	/*
	 * Use the undocumented "negative window bits" feature to tell zlib
	 * that there's no zlib header waiting for it.
	 */
    zerr = inflateInit2(&zstream, -MAX_WBITS);
    if (zerr != Z_OK) {
        if (zerr == Z_VERSION_ERROR) {
            LOGE("Installed zlib is not compatible with linked version (%s)\n",
                ZLIB_VERSION);
        } else {
            LOGE("Call to inflateInit2 failed (zerr=%d)\n", zerr);
        }
        goto bail;
    }

    /*
     * Expand data.
     */
    zerr = inflate(&zstream, Z_FINISH);
    if (zerr != Z_STREAM_END) {
        LOGW("Zip inflate failed, zerr=%d (nIn=%p aIn=%u nOut=%p aOut=%u)\n",
            zerr, zstream.next_in, zstream.avail_in,
            zstream.next_out, zstream.avail_out);
        goto z_bail;
    }

    /* paranoia */
    if ((long) zstream.total_out != uncompLen) {
        LOGW("Size mismatch on inflated file (%ld vs %ld)\n",
            zstream.total_out, uncompLen);
        goto z_bail;
    }

    result = true;

z_bail:
    inflateEnd(&zstream);        /* free up any allocated structures */

bail:
    return result;
}

/*
 * Uncompress "deflate" data from one buffer to an open file descriptor.
 */
/*static*/ bool ZipFileRO::inflateBuffer(int fd, const void* inBuf,
    long uncompLen, long compLen)
{
    bool result = false;
    const int kWriteBufSize = 32768;
    unsigned char writeBuf[kWriteBufSize];
    z_stream zstream;
    int zerr;

    /*
     * Initialize the zlib stream struct.
     */
	memset(&zstream, 0, sizeof(zstream));
    zstream.zalloc = Z_NULL;
    zstream.zfree = Z_NULL;
    zstream.opaque = Z_NULL;
    zstream.next_in = (Bytef*)inBuf;
    zstream.avail_in = compLen;
    zstream.next_out = (Bytef*) writeBuf;
    zstream.avail_out = sizeof(writeBuf);
    zstream.data_type = Z_UNKNOWN;

	/*
	 * Use the undocumented "negative window bits" feature to tell zlib
	 * that there's no zlib header waiting for it.
	 */
    zerr = inflateInit2(&zstream, -MAX_WBITS);
    if (zerr != Z_OK) {
        if (zerr == Z_VERSION_ERROR) {
            LOGE("Installed zlib is not compatible with linked version (%s)\n",
                ZLIB_VERSION);
        } else {
            LOGE("Call to inflateInit2 failed (zerr=%d)\n", zerr);
        }
        goto bail;
    }

    /*
     * Loop while we have more to do.
     */
    do {
        /*
         * Expand data.
         */
        zerr = inflate(&zstream, Z_NO_FLUSH);
        if (zerr != Z_OK && zerr != Z_STREAM_END) {
            LOGW("zlib inflate: zerr=%d (nIn=%p aIn=%u nOut=%p aOut=%u)\n",
                zerr, zstream.next_in, zstream.avail_in,
                zstream.next_out, zstream.avail_out);
            goto z_bail;
        }

        /* write when we're full or when we're done */
        if (zstream.avail_out == 0 ||
            (zerr == Z_STREAM_END && zstream.avail_out != sizeof(writeBuf)))
        {
            long writeSize = zstream.next_out - writeBuf;
            int cc = write(fd, writeBuf, writeSize);
            if (cc != (int) writeSize) {
                LOGW("write failed in inflate (%d vs %ld)\n", cc, writeSize);
                goto z_bail;
            }

            zstream.next_out = writeBuf;
            zstream.avail_out = sizeof(writeBuf);
        }
    } while (zerr == Z_OK);

    assert(zerr == Z_STREAM_END);       /* other errors should've been caught */

    /* paranoia */
    if ((long) zstream.total_out != uncompLen) {
        LOGW("Size mismatch on inflated file (%ld vs %ld)\n",
            zstream.total_out, uncompLen);
        goto z_bail;
    }

    result = true;

z_bail:
    inflateEnd(&zstream);        /* free up any allocated structures */

bail:
    return result;
}