1/* match.s -- Pentium-optimized version of longest_match()
2 * Written for zlib 1.1.2
3 * Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
4 *
5 * This is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License.
7 */
8
9#ifndef NO_UNDERLINE
10#define	match_init	_match_init
11#define	longest_match	_longest_match
12#endif
13
14#define	MAX_MATCH	(258)
15#define	MIN_MATCH	(3)
16#define	MIN_LOOKAHEAD	(MAX_MATCH + MIN_MATCH + 1)
17#define	MAX_MATCH_8	((MAX_MATCH + 7) & ~7)
18
19/* stack frame offsets */
20
21#define	wmask			0	/* local copy of s->wmask	*/
22#define	window			4	/* local copy of s->window	*/
23#define	windowbestlen		8	/* s->window + bestlen		*/
24#define	chainlenscanend		12	/* high word: current chain len	*/
25					/* low word: last bytes sought	*/
26#define	scanstart		16	/* first two bytes of string	*/
27#define	scanalign		20	/* dword-misalignment of string	*/
28#define	nicematch		24	/* a good enough match size	*/
29#define	bestlen			28	/* size of best match so far	*/
30#define	scan			32	/* ptr to string wanting match	*/
31
32#define	LocalVarsSize		(36)
33/*	saved ebx		36 */
34/*	saved edi		40 */
35/*	saved esi		44 */
36/*	saved ebp		48 */
37/*	return address		52 */
38#define	deflatestate		56	/* the function arguments	*/
39#define	curmatch		60
40
41/* Offsets for fields in the deflate_state structure. These numbers
42 * are calculated from the definition of deflate_state, with the
43 * assumption that the compiler will dword-align the fields. (Thus,
44 * changing the definition of deflate_state could easily cause this
45 * program to crash horribly, without so much as a warning at
46 * compile time. Sigh.)
47 */
48
49/* All the +zlib1222add offsets are due to the addition of fields
50 *  in zlib in the deflate_state structure since the asm code was first written
51 * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
52 * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
53 * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
54 */
55
56#define zlib1222add		(8)
57
58#define	dsWSize			(36+zlib1222add)
59#define	dsWMask			(44+zlib1222add)
60#define	dsWindow		(48+zlib1222add)
61#define	dsPrev			(56+zlib1222add)
62#define	dsMatchLen		(88+zlib1222add)
63#define	dsPrevMatch		(92+zlib1222add)
64#define	dsStrStart		(100+zlib1222add)
65#define	dsMatchStart		(104+zlib1222add)
66#define	dsLookahead		(108+zlib1222add)
67#define	dsPrevLen		(112+zlib1222add)
68#define	dsMaxChainLen		(116+zlib1222add)
69#define	dsGoodMatch		(132+zlib1222add)
70#define	dsNiceMatch		(136+zlib1222add)
71
72
73.file "match.S"
74
75.globl	match_init, longest_match
76
77.text
78
79/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
80
81longest_match:
82
83/* Save registers that the compiler may be using, and adjust %esp to	*/
84/* make room for our stack frame.					*/
85
86		pushl	%ebp
87		pushl	%edi
88		pushl	%esi
89		pushl	%ebx
90		subl	$LocalVarsSize, %esp
91
92/* Retrieve the function arguments. %ecx will hold cur_match		*/
93/* throughout the entire function. %edx will hold the pointer to the	*/
94/* deflate_state structure during the function's setup (before		*/
95/* entering the main loop).						*/
96
97		movl	deflatestate(%esp), %edx
98		movl	curmatch(%esp), %ecx
99
100/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;	*/
101
102		movl	dsNiceMatch(%edx), %eax
103		movl	dsLookahead(%edx), %ebx
104		cmpl	%eax, %ebx
105		jl	LookaheadLess
106		movl	%eax, %ebx
107LookaheadLess:	movl	%ebx, nicematch(%esp)
108
109/* register Bytef *scan = s->window + s->strstart;			*/
110
111		movl	dsWindow(%edx), %esi
112		movl	%esi, window(%esp)
113		movl	dsStrStart(%edx), %ebp
114		lea	(%esi,%ebp), %edi
115		movl	%edi, scan(%esp)
116
117/* Determine how many bytes the scan ptr is off from being		*/
118/* dword-aligned.							*/
119
120		movl	%edi, %eax
121		negl	%eax
122		andl	$3, %eax
123		movl	%eax, scanalign(%esp)
124
125/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ?			*/
126/*     s->strstart - (IPos)MAX_DIST(s) : NIL;				*/
127
128		movl	dsWSize(%edx), %eax
129		subl	$MIN_LOOKAHEAD, %eax
130		subl	%eax, %ebp
131		jg	LimitPositive
132		xorl	%ebp, %ebp
133LimitPositive:
134
135/* unsigned chain_length = s->max_chain_length;				*/
136/* if (s->prev_length >= s->good_match) {				*/
137/*     chain_length >>= 2;						*/
138/* }									*/
139
140		movl	dsPrevLen(%edx), %eax
141		movl	dsGoodMatch(%edx), %ebx
142		cmpl	%ebx, %eax
143		movl	dsMaxChainLen(%edx), %ebx
144		jl	LastMatchGood
145		shrl	$2, %ebx
146LastMatchGood:
147
148/* chainlen is decremented once beforehand so that the function can	*/
149/* use the sign flag instead of the zero flag for the exit test.	*/
150/* It is then shifted into the high word, to make room for the scanend	*/
151/* scanend value, which it will always accompany.			*/
152
153		decl	%ebx
154		shll	$16, %ebx
155
156/* int best_len = s->prev_length;					*/
157
158		movl	dsPrevLen(%edx), %eax
159		movl	%eax, bestlen(%esp)
160
161/* Store the sum of s->window + best_len in %esi locally, and in %esi.	*/
162
163		addl	%eax, %esi
164		movl	%esi, windowbestlen(%esp)
165
166/* register ush scan_start = *(ushf*)scan;				*/
167/* register ush scan_end   = *(ushf*)(scan+best_len-1);			*/
168
169		movw	(%edi), %bx
170		movw	%bx, scanstart(%esp)
171		movw	-1(%edi,%eax), %bx
172		movl	%ebx, chainlenscanend(%esp)
173
174/* Posf *prev = s->prev;						*/
175/* uInt wmask = s->w_mask;						*/
176
177		movl	dsPrev(%edx), %edi
178		movl	dsWMask(%edx), %edx
179		mov	%edx, wmask(%esp)
180
181/* Jump into the main loop.						*/
182
183		jmp	LoopEntry
184
185.balign 16
186
187/* do {
188 *     match = s->window + cur_match;
189 *     if (*(ushf*)(match+best_len-1) != scan_end ||
190 *         *(ushf*)match != scan_start) continue;
191 *     [...]
192 * } while ((cur_match = prev[cur_match & wmask]) > limit
193 *          && --chain_length != 0);
194 *
195 * Here is the inner loop of the function. The function will spend the
196 * majority of its time in this loop, and majority of that time will
197 * be spent in the first ten instructions.
198 *
199 * Within this loop:
200 * %ebx = chainlenscanend - i.e., ((chainlen << 16) | scanend)
201 * %ecx = curmatch
202 * %edx = curmatch & wmask
203 * %esi = windowbestlen - i.e., (window + bestlen)
204 * %edi = prev
205 * %ebp = limit
206 *
207 * Two optimization notes on the choice of instructions:
208 *
209 * The first instruction uses a 16-bit address, which costs an extra,
210 * unpairable cycle. This is cheaper than doing a 32-bit access and
211 * zeroing the high word, due to the 3-cycle misalignment penalty which
212 * would occur half the time. This also turns out to be cheaper than
213 * doing two separate 8-bit accesses, as the memory is so rarely in the
214 * L1 cache.
215 *
216 * The window buffer, however, apparently spends a lot of time in the
217 * cache, and so it is faster to retrieve the word at the end of the
218 * match string with two 8-bit loads. The instructions that test the
219 * word at the beginning of the match string, however, are executed
220 * much less frequently, and there it was cheaper to use 16-bit
221 * instructions, which avoided the necessity of saving off and
222 * subsequently reloading one of the other registers.
223 */
224LookupLoop:
225							/* 1 U & V  */
226		movw	(%edi,%edx,2), %cx		/* 2 U pipe */
227		movl	wmask(%esp), %edx		/* 2 V pipe */
228		cmpl	%ebp, %ecx			/* 3 U pipe */
229		jbe	LeaveNow			/* 3 V pipe */
230		subl	$0x00010000, %ebx		/* 4 U pipe */
231		js	LeaveNow			/* 4 V pipe */
232LoopEntry:	movb	-1(%esi,%ecx), %al		/* 5 U pipe */
233		andl	%ecx, %edx			/* 5 V pipe */
234		cmpb	%bl, %al			/* 6 U pipe */
235		jnz	LookupLoop			/* 6 V pipe */
236		movb	(%esi,%ecx), %ah
237		cmpb	%bh, %ah
238		jnz	LookupLoop
239		movl	window(%esp), %eax
240		movw	(%eax,%ecx), %ax
241		cmpw	scanstart(%esp), %ax
242		jnz	LookupLoop
243
244/* Store the current value of chainlen.					*/
245
246		movl	%ebx, chainlenscanend(%esp)
247
248/* Point %edi to the string under scrutiny, and %esi to the string we	*/
249/* are hoping to match it up with. In actuality, %esi and %edi are	*/
250/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is	*/
251/* initialized to -(MAX_MATCH_8 - scanalign).				*/
252
253		movl	window(%esp), %esi
254		movl	scan(%esp), %edi
255		addl	%ecx, %esi
256		movl	scanalign(%esp), %eax
257		movl	$(-MAX_MATCH_8), %edx
258		lea	MAX_MATCH_8(%edi,%eax), %edi
259		lea	MAX_MATCH_8(%esi,%eax), %esi
260
261/* Test the strings for equality, 8 bytes at a time. At the end,
262 * adjust %edx so that it is offset to the exact byte that mismatched.
263 *
264 * We already know at this point that the first three bytes of the
265 * strings match each other, and they can be safely passed over before
266 * starting the compare loop. So what this code does is skip over 0-3
267 * bytes, as much as necessary in order to dword-align the %edi
268 * pointer. (%esi will still be misaligned three times out of four.)
269 *
270 * It should be confessed that this loop usually does not represent
271 * much of the total running time. Replacing it with a more
272 * straightforward "rep cmpsb" would not drastically degrade
273 * performance.
274 */
275LoopCmps:
276		movl	(%esi,%edx), %eax
277		movl	(%edi,%edx), %ebx
278		xorl	%ebx, %eax
279		jnz	LeaveLoopCmps
280		movl	4(%esi,%edx), %eax
281		movl	4(%edi,%edx), %ebx
282		xorl	%ebx, %eax
283		jnz	LeaveLoopCmps4
284		addl	$8, %edx
285		jnz	LoopCmps
286		jmp	LenMaximum
287LeaveLoopCmps4:	addl	$4, %edx
288LeaveLoopCmps:	testl	$0x0000FFFF, %eax
289		jnz	LenLower
290		addl	$2, %edx
291		shrl	$16, %eax
292LenLower:	subb	$1, %al
293		adcl	$0, %edx
294
295/* Calculate the length of the match. If it is longer than MAX_MATCH,	*/
296/* then automatically accept it as the best possible match and leave.	*/
297
298		lea	(%edi,%edx), %eax
299		movl	scan(%esp), %edi
300		subl	%edi, %eax
301		cmpl	$MAX_MATCH, %eax
302		jge	LenMaximum
303
304/* If the length of the match is not longer than the best match we	*/
305/* have so far, then forget it and return to the lookup loop.		*/
306
307		movl	deflatestate(%esp), %edx
308		movl	bestlen(%esp), %ebx
309		cmpl	%ebx, %eax
310		jg	LongerMatch
311		movl	chainlenscanend(%esp), %ebx
312		movl	windowbestlen(%esp), %esi
313		movl	dsPrev(%edx), %edi
314		movl	wmask(%esp), %edx
315		andl	%ecx, %edx
316		jmp	LookupLoop
317
318/*         s->match_start = cur_match;					*/
319/*         best_len = len;						*/
320/*         if (len >= nice_match) break;				*/
321/*         scan_end = *(ushf*)(scan+best_len-1);			*/
322
323LongerMatch:	movl	nicematch(%esp), %ebx
324		movl	%eax, bestlen(%esp)
325		movl	%ecx, dsMatchStart(%edx)
326		cmpl	%ebx, %eax
327		jge	LeaveNow
328		movl	window(%esp), %esi
329		addl	%eax, %esi
330		movl	%esi, windowbestlen(%esp)
331		movl	chainlenscanend(%esp), %ebx
332		movw	-1(%edi,%eax), %bx
333		movl	dsPrev(%edx), %edi
334		movl	%ebx, chainlenscanend(%esp)
335		movl	wmask(%esp), %edx
336		andl	%ecx, %edx
337		jmp	LookupLoop
338
339/* Accept the current string, with the maximum possible length.		*/
340
341LenMaximum:	movl	deflatestate(%esp), %edx
342		movl	$MAX_MATCH, bestlen(%esp)
343		movl	%ecx, dsMatchStart(%edx)
344
345/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len;		*/
346/* return s->lookahead;							*/
347
348LeaveNow:
349		movl	deflatestate(%esp), %edx
350		movl	bestlen(%esp), %ebx
351		movl	dsLookahead(%edx), %eax
352		cmpl	%eax, %ebx
353		jg	LookaheadRet
354		movl	%ebx, %eax
355LookaheadRet:
356
357/* Restore the stack and return from whence we came.			*/
358
359		addl	$LocalVarsSize, %esp
360		popl	%ebx
361		popl	%esi
362		popl	%edi
363		popl	%ebp
364match_init:	ret
365