11f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/* inffas8664.c is a hand tuned assembler version of inffast.c - fast decoding 21f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * version for AMD64 on Windows using Microsoft C compiler 31f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * 41f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * Copyright (C) 1995-2003 Mark Adler 51f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * For conditions of distribution and use, see copyright notice in zlib.h 6037726ca44408bf897791f5197b652defd45f5bePhillip Lougher * 793fce13e13cbbbaa652028c627ce37e96a9679e8Phillip Lougher * Copyright (C) 2003 Chris Anderson <christop@charm.net> 883d42a3fc898962aa1f1e8387f2ccb1114e0d294Phillip Lougher * Please use the copyright conditions above. 91f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * 101f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * 2005 - Adaptation to Microsoft C Compiler for AMD64 by Gilles Vollant 111f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * 121f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * inffas8664.c call function inffas8664fnc in inffasx64.asm 131f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * inffasx64.asm is automatically convert from AMD64 portion of inffas86.c 141f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * 151f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * Dec-29-2003 -- I added AMD64 inflate asm support. This version is also 161f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * slightly quicker on x86 systems because, instead of using rep movsb to copy 171f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * data, it uses rep movsw, which moves data in 2-byte chunks instead of single 181f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates 191f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * from http://fedora.linux.duke.edu/fc1_x86_64 201f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with 211f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * 1GB ram. The 64-bit version is about 4% faster than the 32-bit version, 221f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * when decompressing mozilla-source-1.3.tar.gz. 231f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * 241f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from 251f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at 261f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * the moment. I have successfully compiled and tested this code with gcc2.96, 270e45365737bf5283627e32253f2279c4d9fa32d0plougher * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S 281b899fc316f7eba7a31da12dc0c9b69ada441059plougher * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX 291b899fc316f7eba7a31da12dc0c9b69ada441059plougher * enabled. I will attempt to merge the MMX code into this version. Newer 301f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * versions of this and inffast.S can be found at 311f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/ 321f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher * 331f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher */ 341f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 351f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher#include <stdio.h> 361f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher#include "zutil.h" 371f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher#include "inftrees.h" 381f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher#include "inflate.h" 394c99cb7f458d8e1c598f1c80793daf3696c9b528plougher#include "inffast.h" 401f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 414c99cb7f458d8e1c598f1c80793daf3696c9b528plougher/* Mark Adler's comments from inffast.c: */ 422b1aa06131c7d8d4361d79172afb9594e66a7280Phillip Lougher 431f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/* 441f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher Decode literal, length, and distance codes and write out the resulting 451b899fc316f7eba7a31da12dc0c9b69ada441059plougher literal and match bytes until either not enough input or output is 461f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher available, an end-of-block is encountered, or a data error is encountered. 471f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher When large enough input and output buffers are supplied to inflate(), for 481f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher example, a 16K input buffer and a 64K output buffer, more than 95% of the 491f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher inflate execution time is spent in this routine. 501f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 511f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher Entry assumptions: 52e6e0e1bdf98ad6faa63527e5bbdd3bd5e7e97a9eplougher 531f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher state->mode == LEN 541f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher strm->avail_in >= 6 551f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher strm->avail_out >= 258 561f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher start >= strm->avail_out 571f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher state->bits < 8 581f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 591f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher On return, state->mode is one of: 601f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 611f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher LEN -- ran out of enough output space or enough available input 621f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher TYPE -- reached end of block code, inflate() to interpret next block 631f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher BAD -- error in block data 640e45365737bf5283627e32253f2279c4d9fa32d0plougher 653b75d2fa82ec06cc4f8716643d538d2db662e1bdplougher Notes: 66ae9d58cc7aba3f8d40ef0fcef36e992ec34ac1bbplougher 67a017a9770707e1b0afb81db40c4923ae56caf898plougher - The maximum input bits used by a length/distance pair is 15 bits for the 681f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher length code, 5 bits for the length extra, 15 bits for the distance code, 691f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher and 13 bits for the distance extra. This totals 48 bits, or six bytes. 701f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher Therefore if strm->avail_in >= 6, then there is enough input to avoid 711f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher checking for available input while decoding. 721f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 731f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher - The maximum bytes that a single length/distance pair can output is 258 741f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher bytes, which is the maximum length that can be coded. inflate_fast() 751f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher requires strm->avail_out >= 258 for each loop to avoid checking for 761f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher output space. 771f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher */ 781f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 791f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 801f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 811f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher typedef struct inffast_ar { 821f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/* 64 32 x86 x86_64 */ 831f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/* ar offset register */ 841f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/* 0 0 */ void *esp; /* esp save */ 851f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/* 8 4 */ void *ebp; /* ebp save */ 861f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/* 16 8 */ unsigned char FAR *in; /* esi rsi local strm->next_in */ 871f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/* 24 12 */ unsigned char FAR *last; /* r9 while in < last */ 881f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher/* 32 16 */ unsigned char FAR *out; /* edi rdi local strm->next_out */ 890e45365737bf5283627e32253f2279c4d9fa32d0plougher/* 40 20 */ unsigned char FAR *beg; /* inflate()'s init next_out */ 900e45365737bf5283627e32253f2279c4d9fa32d0plougher/* 48 24 */ unsigned char FAR *end; /* r10 while out < end */ 910e45365737bf5283627e32253f2279c4d9fa32d0plougher/* 56 28 */ unsigned char FAR *window;/* size of window, wsize!=0 */ 92ae9d58cc7aba3f8d40ef0fcef36e992ec34ac1bbplougher/* 64 32 */ code const FAR *lcode; /* ebp rbp local strm->lencode */ 93ae9d58cc7aba3f8d40ef0fcef36e992ec34ac1bbplougher/* 72 36 */ code const FAR *dcode; /* r11 local strm->distcode */ 94ae9d58cc7aba3f8d40ef0fcef36e992ec34ac1bbplougher/* 80 40 */ size_t /*unsigned long */hold; /* edx rdx local strm->hold */ 95ae9d58cc7aba3f8d40ef0fcef36e992ec34ac1bbplougher/* 88 44 */ unsigned bits; /* ebx rbx local strm->bits */ 96ae9d58cc7aba3f8d40ef0fcef36e992ec34ac1bbplougher/* 92 48 */ unsigned wsize; /* window size */ 97ae9d58cc7aba3f8d40ef0fcef36e992ec34ac1bbplougher/* 96 52 */ unsigned write; /* window write index */ 98a017a9770707e1b0afb81db40c4923ae56caf898plougher/*100 56 */ unsigned lmask; /* r12 mask for lcode */ 99a017a9770707e1b0afb81db40c4923ae56caf898plougher/*104 60 */ unsigned dmask; /* r13 mask for dcode */ 100a017a9770707e1b0afb81db40c4923ae56caf898plougher/*108 64 */ unsigned len; /* r14 match length */ 1013b75d2fa82ec06cc4f8716643d538d2db662e1bdplougher/*112 68 */ unsigned dist; /* r15 match distance */ 102a017a9770707e1b0afb81db40c4923ae56caf898plougher/*116 72 */ unsigned status; /* set when state chng*/ 103a017a9770707e1b0afb81db40c4923ae56caf898plougher } type_ar; 104a017a9770707e1b0afb81db40c4923ae56caf898plougher#ifdef ASMINF 105a017a9770707e1b0afb81db40c4923ae56caf898plougher 106a017a9770707e1b0afb81db40c4923ae56caf898ploughervoid inflate_fast(strm, start) 1071f413c84d736495fd61ff05ebe52c3a01a4d95c2plougherz_streamp strm; 1081f413c84d736495fd61ff05ebe52c3a01a4d95c2plougherunsigned start; /* inflate()'s starting value for strm->avail_out */ 1091f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher{ 1101f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher struct inflate_state FAR *state; 1111f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher type_ar ar; 1121f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher void inffas8664fnc(struct inffast_ar * par); 1131f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 1141f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 1151f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 1161f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher#if (defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )) || (defined(_MSC_VER) && defined(_M_AMD64)) 1171f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher#define PAD_AVAIL_IN 6 1183f23833eafaba866c1d35b10e67dc6945786e7f6plougher#define PAD_AVAIL_OUT 258 1193f23833eafaba866c1d35b10e67dc6945786e7f6plougher#else 1203f23833eafaba866c1d35b10e67dc6945786e7f6plougher#define PAD_AVAIL_IN 5 1213f23833eafaba866c1d35b10e67dc6945786e7f6plougher#define PAD_AVAIL_OUT 257 1223f23833eafaba866c1d35b10e67dc6945786e7f6plougher#endif 1231f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 124e6e0e1bdf98ad6faa63527e5bbdd3bd5e7e97a9eplougher /* copy state to local variables */ 125e6e0e1bdf98ad6faa63527e5bbdd3bd5e7e97a9eplougher state = (struct inflate_state FAR *)strm->state; 126e6e0e1bdf98ad6faa63527e5bbdd3bd5e7e97a9eplougher 127e6e0e1bdf98ad6faa63527e5bbdd3bd5e7e97a9eplougher ar.in = strm->next_in; 128e6e0e1bdf98ad6faa63527e5bbdd3bd5e7e97a9eplougher ar.last = ar.in + (strm->avail_in - PAD_AVAIL_IN); 129e6e0e1bdf98ad6faa63527e5bbdd3bd5e7e97a9eplougher ar.out = strm->next_out; 130e6e0e1bdf98ad6faa63527e5bbdd3bd5e7e97a9eplougher ar.beg = ar.out - (start - strm->avail_out); 1311f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher ar.end = ar.out + (strm->avail_out - PAD_AVAIL_OUT); 1321f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher ar.wsize = state->wsize; 1331f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher ar.write = state->wnext; 1341f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher ar.window = state->window; 1351f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher ar.hold = state->hold; 1361f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher ar.bits = state->bits; 1371f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher ar.lcode = state->lencode; 1381f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher ar.dcode = state->distcode; 1391f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher ar.lmask = (1U << state->lenbits) - 1; 1401f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher ar.dmask = (1U << state->distbits) - 1; 1411f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 142f03d02e0d5c5f7d97d63adc665dd9f524e134c23plougher /* decode literals and length/distances until end-of-block or not enough 143f03d02e0d5c5f7d97d63adc665dd9f524e134c23plougher input data or output space */ 1441f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 1451f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher /* align in on 1/2 hold size boundary */ 1461f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher while (((size_t)(void *)ar.in & (sizeof(ar.hold) / 2 - 1)) != 0) { 1471f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher ar.hold += (unsigned long)*ar.in++ << ar.bits; 1481f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher ar.bits += 8; 1491f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher } 1501f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 1511f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher inffas8664fnc(&ar); 1521f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 1531f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher if (ar.status > 1) { 1541f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher if (ar.status == 2) 15511fb64f595e7c989993ba0c3a7d86fa1b0249e12plougher strm->msg = "invalid literal/length code"; 1561f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher else if (ar.status == 3) 1571f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher strm->msg = "invalid distance code"; 1581f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher else 1591f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher strm->msg = "invalid distance too far back"; 1601f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher state->mode = BAD; 1611f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher } 1621f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher else if ( ar.status == 1 ) { 1631f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher state->mode = TYPE; 1641f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher } 1651f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 1661b81edc57975521caed67438f8b1af9ff3c8a25aPhillip Lougher /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ 1671b81edc57975521caed67438f8b1af9ff3c8a25aPhillip Lougher ar.len = ar.bits >> 3; 1681f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher ar.in -= ar.len; 1691f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher ar.bits -= ar.len << 3; 1701f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher ar.hold &= (1U << ar.bits) - 1; 1711f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher 1721f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher /* update state and return */ 1731f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher strm->next_in = ar.in; 1741f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher strm->next_out = ar.out; 1751f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher strm->avail_in = (unsigned)(ar.in < ar.last ? 1761f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher PAD_AVAIL_IN + (ar.last - ar.in) : 1771f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher PAD_AVAIL_IN - (ar.in - ar.last)); 1781f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher strm->avail_out = (unsigned)(ar.out < ar.end ? 1791f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher PAD_AVAIL_OUT + (ar.end - ar.out) : 1801f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher PAD_AVAIL_OUT - (ar.out - ar.end)); 1811f413c84d736495fd61ff05ebe52c3a01a4d95c2plougher state->hold = (unsigned long)ar.hold; 1820e45365737bf5283627e32253f2279c4d9fa32d0plougher state->bits = ar.bits; 18311fb64f595e7c989993ba0c3a7d86fa1b0249e12plougher return; 1840e45365737bf5283627e32253f2279c4d9fa32d0plougher} 1850e45365737bf5283627e32253f2279c4d9fa32d0plougher 1860e45365737bf5283627e32253f2279c4d9fa32d0plougher#endif 1870e45365737bf5283627e32253f2279c4d9fa32d0plougher