/* inffas8664.c is a hand tuned assembler version of inffast.c - fast decoding
 * version for AMD64 on Windows using Microsoft C compiler
 *
 * Copyright (C) 1995-2003 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Copyright (C) 2003 Chris Anderson <christop@charm.net>
 * Please use the copyright conditions above.
 *
 * 2005 - Adaptation to Microsoft C Compiler for AMD64 by Gilles Vollant
 *
 * inffas8664.c calls function inffas8664fnc in inffasx64.asm
 *  inffasx64.asm is automatically converted from the AMD64 portion of
 *  inffas86.c
 *
 * Dec-29-2003 -- I added AMD64 inflate asm support.  This version is also
 * slightly quicker on x86 systems because, instead of using rep movsb to copy
 * data, it uses rep movsw, which moves data in 2-byte chunks instead of single
 * bytes.  I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates
 * from http://fedora.linux.duke.edu/fc1_x86_64
 * which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with
 * 1GB ram.  The 64-bit version is about 4% faster than the 32-bit version,
 * when decompressing mozilla-source-1.3.tar.gz.
 *
 * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
 * the gcc -S output of zlib-1.2.0/inffast.c.  Zlib-1.2.0 is in beta release at
 * the moment.  I have successfully compiled and tested this code with gcc2.96,
 * gcc3.2, icc5.0, msvc6.0.  It is very close to the speed of inffast.S
 * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
 * enabled.  I will attempt to merge the MMX code into this version.  Newer
 * versions of this and inffast.S can be found at
 * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
 *
 */

#include <stddef.h>
#include <stdio.h>
#include "zutil.h"
#include "inftrees.h"
#include "inflate.h"
#include "inffast.h"

/* Mark Adler's comments from inffast.c: */

/*
   Decode literal, length, and distance codes and write out the resulting
   literal and match bytes until either not enough input or output is
   available, an end-of-block is encountered, or a data error is encountered.
   When large enough input and output buffers are supplied to inflate(), for
   example, a 16K input buffer and a 64K output buffer, more than 95% of the
   inflate execution time is spent in this routine.

   Entry assumptions:

        state->mode == LEN
        strm->avail_in >= 6
        strm->avail_out >= 258
        start >= strm->avail_out
        state->bits < 8

   On return, state->mode is one of:

        LEN -- ran out of enough output space or enough available input
        TYPE -- reached end of block code, inflate() to interpret next block
        BAD -- error in block data

   Notes:

    - The maximum input bits used by a length/distance pair is 15 bits for the
      length code, 5 bits for the length extra, 15 bits for the distance code,
      and 13 bits for the distance extra.  This totals 48 bits, or six bytes.
      Therefore if strm->avail_in >= 6, then there is enough input to avoid
      checking for available input while decoding.

    - The maximum bytes that a single length/distance pair can output is 258
      bytes, which is the maximum length that can be coded.  inflate_fast()
      requires strm->avail_out >= 258 for each loop to avoid checking for
      output space.
 */



811f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher    typedef struct inffast_ar {
821f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/* 64   32                               x86  x86_64 */
831f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/* ar offset                              register */
841f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/*  0    0 */ void *esp;                /* esp save */
851f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/*  8    4 */ void *ebp;                /* ebp save */
861f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/* 16    8 */ unsigned char FAR *in;    /* esi rsi  local strm->next_in */
871f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/* 24   12 */ unsigned char FAR *last;  /*     r9   while in < last */
881f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/* 32   16 */ unsigned char FAR *out;   /* edi rdi  local strm->next_out */
890e45365737bf5283627e32253f2279c4d9fa32d0plougher/* 40   20 */ unsigned char FAR *beg;   /*          inflate()'s init next_out */
900e45365737bf5283627e32253f2279c4d9fa32d0plougher/* 48   24 */ unsigned char FAR *end;   /*     r10  while out < end */
910e45365737bf5283627e32253f2279c4d9fa32d0plougher/* 56   28 */ unsigned char FAR *window;/*          size of window, wsize!=0 */
92ae9d58cc7aba3f8d40ef0fcef36e992ec34ac1bbplougher/* 64   32 */ code const FAR *lcode;    /* ebp rbp  local strm->lencode */
93ae9d58cc7aba3f8d40ef0fcef36e992ec34ac1bbplougher/* 72   36 */ code const FAR *dcode;    /*     r11  local strm->distcode */
94ae9d58cc7aba3f8d40ef0fcef36e992ec34ac1bbplougher/* 80   40 */ size_t /*unsigned long */hold;       /* edx rdx  local strm->hold */
95ae9d58cc7aba3f8d40ef0fcef36e992ec34ac1bbplougher/* 88   44 */ unsigned bits;            /* ebx rbx  local strm->bits */
96ae9d58cc7aba3f8d40ef0fcef36e992ec34ac1bbplougher/* 92   48 */ unsigned wsize;           /*          window size */
97ae9d58cc7aba3f8d40ef0fcef36e992ec34ac1bbplougher/* 96   52 */ unsigned write;           /*          window write index */
98a017a9770707e1b0afb81db40c4923ae56caf898plougher/*100   56 */ unsigned lmask;           /*     r12  mask for lcode */
99a017a9770707e1b0afb81db40c4923ae56caf898plougher/*104   60 */ unsigned dmask;           /*     r13  mask for dcode */
100a017a9770707e1b0afb81db40c4923ae56caf898plougher/*108   64 */ unsigned len;             /*     r14  match length */
1013b75d2fa82ec06cc4f8716643d538d2db662e1bdplougher/*112   68 */ unsigned dist;            /*     r15  match distance */
102a017a9770707e1b0afb81db40c4923ae56caf898plougher/*116   72 */ unsigned status;          /*          set when state chng*/
103a017a9770707e1b0afb81db40c4923ae56caf898plougher    } type_ar;
#ifdef ASMINF

/* Margins kept between the working pointers and the true ends of the input
   and output buffers: ar.last and ar.end are set this many bytes short of
   the real limits, and the same constants are added back when avail_in and
   avail_out are recomputed on return. */
#if (defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )) || (defined(_MSC_VER) && defined(_M_AMD64))
#define PAD_AVAIL_IN 6
#define PAD_AVAIL_OUT 258
#else
#define PAD_AVAIL_IN 5
#define PAD_AVAIL_OUT 257
#endif

/* Assembler decoding loop; per the file header it lives in inffasx64.asm. */
void inffas8664fnc(struct inffast_ar * par);

/*
   Fast inflate entry point: copy the decoding state into a type_ar record,
   run the assembler loop on it, then write the results back.

   strm   stream being inflated; next_in, avail_in, next_out, avail_out and
          msg are updated, as are the mode, hold and bits members of the
          attached inflate_state.
   start  inflate()'s starting value for strm->avail_out.

   The caller must satisfy the entry assumptions listed in the inffast.c
   commentary in this file (avail_in >= 6, avail_out >= 258, bits < 8, ...).

   On return state->mode is unchanged when the assembler reports status 0,
   TYPE for status 1, and BAD (with strm->msg set) for any larger status.
 */
void inflate_fast(z_streamp strm, unsigned start)
{
    struct inflate_state FAR *state;
    type_ar ar;

    /* copy state to local variables */
    state = (struct inflate_state FAR *)strm->state;

    ar.in = strm->next_in;
    ar.last = ar.in + (strm->avail_in - PAD_AVAIL_IN);
    ar.out = strm->next_out;
    ar.beg = ar.out - (start - strm->avail_out);
    ar.end = ar.out + (strm->avail_out - PAD_AVAIL_OUT);
    ar.wsize = state->wsize;
    ar.write = state->wnext;
    ar.window = state->window;
    ar.hold = state->hold;
    ar.bits = state->bits;
    ar.lcode = state->lencode;
    ar.dcode = state->distcode;
    ar.lmask = (1U << state->lenbits) - 1;
    ar.dmask = (1U << state->distbits) - 1;

    /* Defensive: status is read below.  Whether inffas8664fnc stores it on
       every exit path is not visible from this file, so give it a defined
       "no state change" value first. */
    ar.status = 0;

    /* decode literals and length/distances until end-of-block or not enough
       input data or output space */

    /* align in on 1/2 hold size boundary, moving the skipped bytes into the
       bit accumulator; widen through size_t so the shift is carried out in
       the accumulator's own type */
    while (((size_t)(void *)ar.in & (sizeof(ar.hold) / 2 - 1)) != 0) {
        ar.hold += (size_t)*ar.in++ << ar.bits;
        ar.bits += 8;
    }

    inffas8664fnc(&ar);

    if (ar.status > 1) {
        if (ar.status == 2) {
            strm->msg = "invalid literal/length code";
        }
        else if (ar.status == 3) {
            strm->msg = "invalid distance code";
        }
        else {
            strm->msg = "invalid distance too far back";
        }
        state->mode = BAD;
    }
    else if (ar.status == 1) {
        state->mode = TYPE;
    }

    /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
    ar.len = ar.bits >> 3;
    ar.in -= ar.len;
    ar.bits -= ar.len << 3;
    ar.hold &= (1U << ar.bits) - 1;

    /* update state and return; the PAD_AVAIL_* margins subtracted when
       ar.last and ar.end were set are added back here */
    strm->next_in = ar.in;
    strm->next_out = ar.out;
    strm->avail_in = (unsigned)(ar.in < ar.last ?
                                PAD_AVAIL_IN + (ar.last - ar.in) :
                                PAD_AVAIL_IN - (ar.in - ar.last));
    strm->avail_out = (unsigned)(ar.out < ar.end ?
                                 PAD_AVAIL_OUT + (ar.end - ar.out) :
                                 PAD_AVAIL_OUT - (ar.out - ar.end));
    state->hold = (unsigned long)ar.hold;
    state->bits = ar.bits;
    return;
}

#endif
