1;uInt longest_match_x64(
2;    deflate_state *s,
3;    IPos cur_match);                             /* current match */
4
5; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86
6; Copyright (C) 1995-2005 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
7;
8; File written by Gilles Vollant, by converting to assembly the longest_match
9;  from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
10;
11;  and by taking inspiration on asm686 with masm, optimised assembly code
12;        from Brian Raiter, written 1998
13;
14;         http://www.zlib.net
15;         http://www.winimage.com/zLibDll
16;         http://www.muppetlabs.com/~breadbox/software/assembly.html
17;
18; to compile this file for infozip Zip, I use option:
19;   ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
20;
21; to compile this file for zLib, I use option:
22;   ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
23; Be carrefull to adapt zlib1222add below to your version of zLib
24;   (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
25;    value of zlib1222add later)
26;
27; This file compile with Microsoft Macro Assembler (x64) for AMD64
28;
29;   ml64.exe is given with Visual Studio 2005 and Windows 2003 server DDK
30;
31;   (you can get Windows 2003 server DDK with ml64 and cl for AMD64 from
32;      http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price)
33;
34
35
36;uInt longest_match(s, cur_match)
37;    deflate_state *s;
38;    IPos cur_match;                             /* current match */
39bits 64
40	section .text
41	
42longest_match:	; PROC
43
44
45;%define LocalVarsSize  88
46 %define LocalVarsSize  72
47
48; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
49; free register :  r14,r15
50; register can be saved : rsp
51
52 %define chainlenwmask   rsp + 8 - LocalVarsSize    ; high word: current chain len
53                                                 ; low word: s->wmask
54;%define window   rsp + xx - LocalVarsSize   ; local copy of s->window ; stored in r10
55;%define windowbestlen   rsp + xx - LocalVarsSize   ; s->window + bestlen , use r10+r11
56;%define scanstart   rsp + xx - LocalVarsSize   ; first two bytes of string ; stored in r12w
57;%define scanend   rsp + xx - LocalVarsSize   ; last two bytes of string use ebx
58;%define scanalign   rsp + xx - LocalVarsSize   ; dword-misalignment of string r13
59;%define bestlen   rsp + xx - LocalVarsSize   ; size of best match so far -> r11d
60;%define scan   rsp + xx - LocalVarsSize   ; ptr to string wanting match -> r9
61%ifdef INFOZIP
62%else
63 %define nicematch   (rsp + 16 - LocalVarsSize) ; a good enough match size
64%endif
65
66%define save_rdi   rsp + 24 - LocalVarsSize
67%define save_rsi   rsp + 32 - LocalVarsSize
68%define save_rbx   rsp + 40 - LocalVarsSize
69%define save_rbp   rsp + 48 - LocalVarsSize
70%define save_r12   rsp + 56 - LocalVarsSize
71%define save_r13   rsp + 64 - LocalVarsSize
72;%define save_r14   rsp + 72 - LocalVarsSize
73;%define save_r15   rsp + 80 - LocalVarsSize
74
75
76
77;  all the +4 offsets are due to the addition of pending_buf_size (in zlib
78;  in the deflate_state structure since the asm code was first written
79;  (if you compile with zlib 1.0.4 or older, remove the +4).
80;  Note : these value are good with a 8 bytes boundary pack structure
81
82
83    %define MAX_MATCH      258
84    %define MIN_MATCH      3
85    %define MIN_LOOKAHEAD      (MAX_MATCH+MIN_MATCH+1)
86
87
88;;; Offsets for fields in the deflate_state structure. These numbers
89;;; are calculated from the definition of deflate_state, with the
90;;; assumption that the compiler will dword-align the fields. (Thus,
91;;; changing the definition of deflate_state could easily cause this
92;;; program to crash horribly, without so much as a warning at
93;;; compile time. Sigh.)
94
95;  all the +zlib1222add offsets are due to the addition of fields
96;  in zlib in the deflate_state structure since the asm code was first written
97;  (if you compile with zlib 1.0.4 or older, use "%define zlib1222add  (-4)").
98;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "%define zlib1222add  0").
99;  if you compile with zlib 1.2.2.2 or later , use "%define zlib1222add  8").
100
101%ifdef INFOZIP
102	section .data
103
104extern    window_size:DWORD
105; WMask ; 7fff
106extern    window:BYTE:010040H
107extern    prev:WORD:08000H
108; MatchLen : unused
109; PrevMatch : unused
110extern    strstart:DWORD
111extern    match_start:DWORD
112; Lookahead : ignore
113extern    prev_length:DWORD ; PrevLen
114extern    max_chain_length:DWORD
115extern    good_match:DWORD
116extern    nice_match:DWORD
117%define prev_ad  OFFSET prev
118%define window_ad  OFFSET window
119%define nicematch  nice_match
120
121%define WMask  07fffh
122
123%else
124
125  %ifndef zlib1222add
126    %define zlib1222add  8
127  %endif
128%define dsWSize  56+zlib1222add+(zlib1222add/2)
129%define dsWMask  64+zlib1222add+(zlib1222add/2)
130%define dsWindow  72+zlib1222add
131%define dsPrev  88+zlib1222add
132%define dsMatchLen  128+zlib1222add
133%define dsPrevMatch  132+zlib1222add
134%define dsStrStart  140+zlib1222add
135%define dsMatchStart  144+zlib1222add
136%define dsLookahead  148+zlib1222add
137%define dsPrevLen  152+zlib1222add
138%define dsMaxChainLen  156+zlib1222add
139%define dsGoodMatch  172+zlib1222add
140%define dsNiceMatch  176+zlib1222add
141
142%define window_size  [ rcx + dsWSize]
143%define WMask  [ rcx + dsWMask]
144%define window_ad  [ rcx + dsWindow]
145%define prev_ad  [ rcx + dsPrev]
146%define strstart  [ rcx + dsStrStart]
147%define match_start  [ rcx + dsMatchStart]
148%define Lookahead  [ rcx + dsLookahead] ; 0ffffffffh on infozip
149%define prev_length  [ rcx + dsPrevLen]
150%define max_chain_length  [ rcx + dsMaxChainLen]
151%define good_match  [ rcx + dsGoodMatch]
152%define nice_match  [ rcx + dsNiceMatch]
153%endif
154
155; parameter 1 in r8(deflate state s), param 2 in rdx (cur match)
156
157; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
158; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
159;
160; All registers must be preserved across the call, except for
161;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
162
163	section .text
164
165;;; Save registers that the compiler may be using, and adjust esp to
166;;; make room for our stack frame.
167
168
169;;; Retrieve the function arguments. r8d will hold cur_match
170;;; throughout the entire function. edx will hold the pointer to the
171;;; deflate_state structure during the function's setup (before
172;;; entering the main loop.
173
174; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
175
176; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
177
178        mov [save_rdi],rdi
179        mov [save_rsi],rsi
180        mov [save_rbx],rbx
181        mov [save_rbp],rbp
182%ifdef INFOZIP
183        mov r8d,ecx
184%else
185        mov r8d,edx
186%endif
187        mov [save_r12],r12
188        mov [save_r13],r13
189;        mov [save_r14],r14
190;        mov [save_r15],r15
191
192
193;;; uInt wmask = s->w_mask;
194;;; unsigned chain_length = s->max_chain_length;
195;;; if (s->prev_length >= s->good_match) {
196;;;     chain_length >>= 2;
197;;; }
198
199        mov edi, prev_length
200        mov esi, good_match
201        mov eax, WMask
202        mov ebx, max_chain_length
203        cmp edi, esi
204        jl  LastMatchGood
205        shr ebx, 2
206LastMatchGood:
207
208;;; chainlen is decremented once beforehand so that the function can
209;;; use the sign flag instead of the zero flag for the exit test.
210;;; It is then shifted into the high word, to make room for the wmask
211;;; value, which it will always accompany.
212
213        dec ebx
214        shl ebx, 16
215        or  ebx, eax
216
217;;; on zlib only
218;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
219
220%ifdef INFOZIP
221        mov [chainlenwmask], ebx
222; on infozip nice_match = [nice_match]
223%else
224        mov eax, nice_match
225        mov [chainlenwmask], ebx
226        mov r10d, Lookahead
227        cmp r10d, eax
228        cmovnl r10d, eax
229        mov [nicematch],r10d
230%endif
231
232;;; register Bytef *scan = s->window + s->strstart;
233        mov r10, window_ad
234        mov ebp, strstart
235        lea r13, [r10 + rbp]
236
237;;; Determine how many bytes the scan ptr is off from being
238;;; dword-aligned.
239
240         mov r9,r13
241         neg r13
242         and r13,3
243
244;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
245;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
246%ifdef INFOZIP
247        mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1))
248%else
249        mov eax, window_size
250        sub eax, MIN_LOOKAHEAD
251%endif
252        xor edi,edi
253        sub ebp, eax
254
255        mov r11d, prev_length
256
257        cmovng ebp,edi
258
259;;; int best_len = s->prev_length;
260
261
262;;; Store the sum of s->window + best_len in esi locally, and in esi.
263
264       lea  rsi,[r10+r11]
265
266;;; register ush scan_start = *(ushf*)scan;
267;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
268;;; Posf *prev = s->prev;
269
270        movzx r12d,word [r9]
271        movzx ebx, word [r9 + r11 - 1]
272
273        mov rdi, prev_ad
274
275;;; Jump into the main loop.
276
277        mov edx, [chainlenwmask]
278
279        cmp bx,word [rsi + r8 - 1]
280        jz  LookupLoopIsZero
281
282LookupLoop1:
283        and r8d, edx
284
285        movzx   r8d, word [rdi + r8*2]
286        cmp r8d, ebp
287        jbe LeaveNow
288        sub edx, 00010000h
289        js  LeaveNow
290
291LoopEntry1:
292        cmp bx,word [rsi + r8 - 1]
293        jz  LookupLoopIsZero
294
295LookupLoop2:
296        and r8d, edx
297
298        movzx   r8d, word [rdi + r8*2]
299        cmp r8d, ebp
300        jbe LeaveNow
301        sub edx, 00010000h
302        js  LeaveNow
303
304LoopEntry2:
305        cmp bx,word [rsi + r8 - 1]
306        jz  LookupLoopIsZero
307
308LookupLoop4:
309        and r8d, edx
310
311        movzx   r8d, word [rdi + r8*2]
312        cmp r8d, ebp
313        jbe LeaveNow
314        sub edx, 00010000h
315        js  LeaveNow
316
317LoopEntry4:
318
319        cmp bx,word [rsi + r8 - 1]
320        jnz LookupLoop1
321        jmp LookupLoopIsZero
322
323
324;;; do {
325;;;     match = s->window + cur_match;
326;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
327;;;         *(ushf*)match != scan_start) continue;
328;;;     [...]
329;;; } while ((cur_match = prev[cur_match & wmask]) > limit
330;;;          && --chain_length != 0);
331;;;
332;;; Here is the inner loop of the function. The function will spend the
333;;; majority of its time in this loop, and majority of that time will
334;;; be spent in the first ten instructions.
335;;;
336;;; Within this loop:
337;;; ebx = scanend
338;;; r8d = curmatch
339;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
340;;; esi = windowbestlen - i.e., (window + bestlen)
341;;; edi = prev
342;;; ebp = limit
343
344LookupLoop:
345        and r8d, edx
346
347        movzx   r8d, word [rdi + r8*2]
348        cmp r8d, ebp
349        jbe LeaveNow
350        sub edx, 00010000h
351        js  LeaveNow
352
353LoopEntry:
354
355        cmp bx,word [rsi + r8 - 1]
356        jnz LookupLoop1
357LookupLoopIsZero:
358        cmp     r12w, word [r10 + r8]
359        jnz LookupLoop1
360
361
362;;; Store the current value of chainlen.
363        mov [chainlenwmask], edx
364
365;;; Point edi to the string under scrutiny, and esi to the string we
366;;; are hoping to match it up with. In actuality, esi and edi are
367;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
368;;; initialized to -(MAX_MATCH_8 - scanalign).
369
370        lea rsi,[r8+r10]
371        mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8)
372        lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8]
373        lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8]
374
375        prefetcht1 [rsi+rdx]
376        prefetcht1 [rdi+rdx]
377
378
379;;; Test the strings %define for ality, 8 bytes at a time. At the end,
380;;; adjust rdx so that it is offset to the exact byte that mismatched.
381;;;
382;;; We already know at this point that the first three bytes of the
383;;; strings match each other, and they can be safely passed over before
384;;; starting the compare loop. So what this code does is skip over 0-3
385;;; bytes, as much as necessary in order to dword-align the edi
386;;; pointer. (rsi will still be misaligned three times out of four.)
387;;;
388;;; It should be confessed that this loop usually does not represent
389;;; much of the total running time. Replacing it with a more
390;;; straightforward "rep cmpsb" would not drastically degrade
391;;; performance.
392
393LoopCmps:
394        mov rax, [rsi + rdx]
395        xor rax, [rdi + rdx]
396        jnz LeaveLoopCmps
397
398        mov rax, [rsi + rdx + 8]
399        xor rax, [rdi + rdx + 8]
400        jnz LeaveLoopCmps8
401
402
403        mov rax, [rsi + rdx + 8+8]
404        xor rax, [rdi + rdx + 8+8]
405        jnz LeaveLoopCmps16
406
407        add rdx,8+8+8
408
409        jmp short LoopCmps
410LeaveLoopCmps16: add rdx,8
411LeaveLoopCmps8: add rdx,8
412LeaveLoopCmps:
413
414        test    eax, 0000FFFFh
415        jnz LenLower
416
417        test eax,0ffffffffh
418
419        jnz LenLower32
420
421        add rdx,4
422        shr rax,32
423        or ax,ax
424        jnz LenLower
425
426LenLower32:
427        shr eax,16
428        add rdx,2
429LenLower:   sub al, 1
430        adc rdx, 0
431;;; Calculate the length of the match. If it is longer than MAX_MATCH,
432;;; then automatically accept it as the best possible match and leave.
433
434        lea rax, [rdi + rdx]
435        sub rax, r9
436        cmp eax, MAX_MATCH
437        jge LenMaximum
438
439;;; If the length of the match is not longer than the best match we
440;;; have so far, then forget it and return to the lookup loop.
441;///////////////////////////////////
442
443        cmp eax, r11d
444        jg  LongerMatch
445
446        lea rsi,[r10+r11]
447
448        mov rdi, prev_ad
449        mov edx, [chainlenwmask]
450        jmp LookupLoop
451
452;;;         s->match_start = cur_match;
453;;;         best_len = len;
454;;;         if (len >= nice_match) break;
455;;;         scan_end = *(ushf*)(scan+best_len-1);
456
457LongerMatch:
458        mov r11d, eax
459        mov match_start, r8d
460        cmp eax, [nicematch]
461        jge LeaveNow
462
463        lea rsi,[r10+rax]
464
465        movzx   ebx, word [r9 + rax - 1]
466        mov rdi, prev_ad
467        mov edx, [chainlenwmask]
468        jmp LookupLoop
469
470;;; Accept the current string, with the maximum possible length.
471
472LenMaximum:
473        mov r11d,MAX_MATCH
474        mov match_start, r8d
475;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
476;;; return s->lookahead;
477
478LeaveNow:
479%ifdef INFOZIP
480        mov eax,r11d
481%else
482        mov eax, Lookahead
483        cmp r11d, eax
484        cmovng eax, r11d
485%endif
486
487;;; Restore the stack and return from whence we came.
488
489        mov rsi,[save_rsi]
490        mov rdi,[save_rdi]
491        mov rbx,[save_rbx]
492        mov rbp,[save_rbp]
493        mov r12,[save_r12]
494        mov r13,[save_r13]
495;        mov r14,[save_r14]
496;        mov r15,[save_r15]
497
498
499        ret 0
500; please don't remove this string !
501; Your can freely use gvmat64 in any free or externercial app
502; but it is far better don't remove the string in the binary!
503    db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
504%if 0
505longest_match   ENDP
506%endif
507
508match_init:	; PROC
509  ret 0
510 %if 0
511match_init ENDP
512%endif
513
514END
515