1;uInt longest_match_x64(
2;    deflate_state *s,
3;    IPos cur_match);                             /* current match */
4
5; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86_64
6;  (AMD64 on Athlon 64, Opteron, Phenom
7;     and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
8; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
9;
10; File written by Gilles Vollant, by converting to assembly the longest_match
11;  from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
12;
13;  and by taking inspiration on asm686 with masm, optimised assembly code
14;        from Brian Raiter, written 1998
15;
16;  This software is provided 'as-is', without any express or implied
17;  warranty.  In no event will the authors be held liable for any damages
18;  arising from the use of this software.
19;
20;  Permission is granted to anyone to use this software for any purpose,
21;  including commercial applications, and to alter it and redistribute it
22;  freely, subject to the following restrictions:
23;
24;  1. The origin of this software must not be misrepresented; you must not
25;     claim that you wrote the original software. If you use this software
26;     in a product, an acknowledgment in the product documentation would be
27;     appreciated but is not required.
28;  2. Altered source versions must be plainly marked as such, and must not be
29;     misrepresented as being the original software
30;  3. This notice may not be removed or altered from any source distribution.
31;
32;
33;
34;         http://www.zlib.net
35;         http://www.winimage.com/zLibDll
36;         http://www.muppetlabs.com/~breadbox/software/assembly.html
37;
38; to compile this file for infozip Zip, I use option:
39;   ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
40;
41; to compile this file for zLib, I use option:
42;   ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
43; Be carrefull to adapt zlib1222add below to your version of zLib
44;   (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
45;    value of zlib1222add later)
46;
47; This file compile with Microsoft Macro Assembler (x64) for AMD64
48;
49;   ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
50;
51;   (you can get Windows WDK with ml64 for AMD64 from
52;      http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
53;
54
55
56;uInt longest_match(s, cur_match)
57;    deflate_state *s;
58;    IPos cur_match;                             /* current match */
59.code
60longest_match PROC
61
62
63;LocalVarsSize   equ 88
64 LocalVarsSize   equ 72
65
66; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
67; free register :  r14,r15
68; register can be saved : rsp
69
70 chainlenwmask   equ  rsp + 8 - LocalVarsSize    ; high word: current chain len
71                                                 ; low word: s->wmask
72;window          equ  rsp + xx - LocalVarsSize   ; local copy of s->window ; stored in r10
73;windowbestlen   equ  rsp + xx - LocalVarsSize   ; s->window + bestlen , use r10+r11
74;scanstart       equ  rsp + xx - LocalVarsSize   ; first two bytes of string ; stored in r12w
75;scanend         equ  rsp + xx - LocalVarsSize   ; last two bytes of string use ebx
76;scanalign       equ  rsp + xx - LocalVarsSize   ; dword-misalignment of string r13
77;bestlen         equ  rsp + xx - LocalVarsSize   ; size of best match so far -> r11d
78;scan            equ  rsp + xx - LocalVarsSize   ; ptr to string wanting match -> r9
79IFDEF INFOZIP
80ELSE
81 nicematch       equ  (rsp + 16 - LocalVarsSize) ; a good enough match size
82ENDIF
83
84save_rdi        equ  rsp + 24 - LocalVarsSize
85save_rsi        equ  rsp + 32 - LocalVarsSize
86save_rbx        equ  rsp + 40 - LocalVarsSize
87save_rbp        equ  rsp + 48 - LocalVarsSize
88save_r12        equ  rsp + 56 - LocalVarsSize
89save_r13        equ  rsp + 64 - LocalVarsSize
90;save_r14        equ  rsp + 72 - LocalVarsSize
91;save_r15        equ  rsp + 80 - LocalVarsSize
92
93
94; summary of register usage
95; scanend     ebx
96; scanendw    bx
97; chainlenwmask   edx
98; curmatch    rsi
99; curmatchd   esi
100; windowbestlen   r8
101; scanalign   r9
102; scanalignd  r9d
103; window      r10
104; bestlen     r11
105; bestlend    r11d
106; scanstart   r12d
107; scanstartw  r12w
108; scan        r13
109; nicematch   r14d
110; limit       r15
111; limitd      r15d
112; prev        rcx
113
114;  all the +4 offsets are due to the addition of pending_buf_size (in zlib
115;  in the deflate_state structure since the asm code was first written
116;  (if you compile with zlib 1.0.4 or older, remove the +4).
117;  Note : these value are good with a 8 bytes boundary pack structure
118
119
120    MAX_MATCH           equ     258
121    MIN_MATCH           equ     3
122    MIN_LOOKAHEAD       equ     (MAX_MATCH+MIN_MATCH+1)
123
124
125;;; Offsets for fields in the deflate_state structure. These numbers
126;;; are calculated from the definition of deflate_state, with the
127;;; assumption that the compiler will dword-align the fields. (Thus,
128;;; changing the definition of deflate_state could easily cause this
129;;; program to crash horribly, without so much as a warning at
130;;; compile time. Sigh.)
131
132;  all the +zlib1222add offsets are due to the addition of fields
133;  in zlib in the deflate_state structure since the asm code was first written
134;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
135;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
136;  if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
137
138
139IFDEF INFOZIP
140
141_DATA   SEGMENT
142COMM    window_size:DWORD
143; WMask ; 7fff
144COMM    window:BYTE:010040H
145COMM    prev:WORD:08000H
146; MatchLen : unused
147; PrevMatch : unused
148COMM    strstart:DWORD
149COMM    match_start:DWORD
150; Lookahead : ignore
151COMM    prev_length:DWORD ; PrevLen
152COMM    max_chain_length:DWORD
153COMM    good_match:DWORD
154COMM    nice_match:DWORD
155prev_ad equ OFFSET prev
156window_ad equ OFFSET window
157nicematch equ nice_match
158_DATA ENDS
159WMask equ 07fffh
160
161ELSE
162
163  IFNDEF zlib1222add
164    zlib1222add equ 8
165  ENDIF
166dsWSize         equ 56+zlib1222add+(zlib1222add/2)
167dsWMask         equ 64+zlib1222add+(zlib1222add/2)
168dsWindow        equ 72+zlib1222add
169dsPrev          equ 88+zlib1222add
170dsMatchLen      equ 128+zlib1222add
171dsPrevMatch     equ 132+zlib1222add
172dsStrStart      equ 140+zlib1222add
173dsMatchStart    equ 144+zlib1222add
174dsLookahead     equ 148+zlib1222add
175dsPrevLen       equ 152+zlib1222add
176dsMaxChainLen   equ 156+zlib1222add
177dsGoodMatch     equ 172+zlib1222add
178dsNiceMatch     equ 176+zlib1222add
179
180window_size     equ [ rcx + dsWSize]
181WMask           equ [ rcx + dsWMask]
182window_ad       equ [ rcx + dsWindow]
183prev_ad         equ [ rcx + dsPrev]
184strstart        equ [ rcx + dsStrStart]
185match_start     equ [ rcx + dsMatchStart]
186Lookahead       equ [ rcx + dsLookahead] ; 0ffffffffh on infozip
187prev_length     equ [ rcx + dsPrevLen]
188max_chain_length equ [ rcx + dsMaxChainLen]
189good_match      equ [ rcx + dsGoodMatch]
190nice_match      equ [ rcx + dsNiceMatch]
191ENDIF
192
193; parameter 1 in r8(deflate state s), param 2 in rdx (cur match)
194
195; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
196; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
197;
198; All registers must be preserved across the call, except for
199;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
200
201
202
203;;; Save registers that the compiler may be using, and adjust esp to
204;;; make room for our stack frame.
205
206
207;;; Retrieve the function arguments. r8d will hold cur_match
208;;; throughout the entire function. edx will hold the pointer to the
209;;; deflate_state structure during the function's setup (before
210;;; entering the main loop.
211
212; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
213
214; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
215
216        mov [save_rdi],rdi
217        mov [save_rsi],rsi
218        mov [save_rbx],rbx
219        mov [save_rbp],rbp
220IFDEF INFOZIP
221        mov r8d,ecx
222ELSE
223        mov r8d,edx
224ENDIF
225        mov [save_r12],r12
226        mov [save_r13],r13
227;        mov [save_r14],r14
228;        mov [save_r15],r15
229
230
231;;; uInt wmask = s->w_mask;
232;;; unsigned chain_length = s->max_chain_length;
233;;; if (s->prev_length >= s->good_match) {
234;;;     chain_length >>= 2;
235;;; }
236
237        mov edi, prev_length
238        mov esi, good_match
239        mov eax, WMask
240        mov ebx, max_chain_length
241        cmp edi, esi
242        jl  LastMatchGood
243        shr ebx, 2
244LastMatchGood:
245
246;;; chainlen is decremented once beforehand so that the function can
247;;; use the sign flag instead of the zero flag for the exit test.
248;;; It is then shifted into the high word, to make room for the wmask
249;;; value, which it will always accompany.
250
251        dec ebx
252        shl ebx, 16
253        or  ebx, eax
254
255;;; on zlib only
256;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
257
258IFDEF INFOZIP
259        mov [chainlenwmask], ebx
260; on infozip nice_match = [nice_match]
261ELSE
262        mov eax, nice_match
263        mov [chainlenwmask], ebx
264        mov r10d, Lookahead
265        cmp r10d, eax
266        cmovnl r10d, eax
267        mov [nicematch],r10d
268ENDIF
269
270;;; register Bytef *scan = s->window + s->strstart;
271        mov r10, window_ad
272        mov ebp, strstart
273        lea r13, [r10 + rbp]
274
275;;; Determine how many bytes the scan ptr is off from being
276;;; dword-aligned.
277
278         mov r9,r13
279         neg r13
280         and r13,3
281
282;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
283;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
284IFDEF INFOZIP
285        mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1))
286ELSE
287        mov eax, window_size
288        sub eax, MIN_LOOKAHEAD
289ENDIF
290        xor edi,edi
291        sub ebp, eax
292
293        mov r11d, prev_length
294
295        cmovng ebp,edi
296
297;;; int best_len = s->prev_length;
298
299
300;;; Store the sum of s->window + best_len in esi locally, and in esi.
301
302       lea  rsi,[r10+r11]
303
304;;; register ush scan_start = *(ushf*)scan;
305;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
306;;; Posf *prev = s->prev;
307
308        movzx r12d,word ptr [r9]
309        movzx ebx, word ptr [r9 + r11 - 1]
310
311        mov rdi, prev_ad
312
313;;; Jump into the main loop.
314
315        mov edx, [chainlenwmask]
316
317        cmp bx,word ptr [rsi + r8 - 1]
318        jz  LookupLoopIsZero
319
320LookupLoop1:
321        and r8d, edx
322
323        movzx   r8d, word ptr [rdi + r8*2]
324        cmp r8d, ebp
325        jbe LeaveNow
326        sub edx, 00010000h
327        js  LeaveNow
328
329LoopEntry1:
330        cmp bx,word ptr [rsi + r8 - 1]
331        jz  LookupLoopIsZero
332
333LookupLoop2:
334        and r8d, edx
335
336        movzx   r8d, word ptr [rdi + r8*2]
337        cmp r8d, ebp
338        jbe LeaveNow
339        sub edx, 00010000h
340        js  LeaveNow
341
342LoopEntry2:
343        cmp bx,word ptr [rsi + r8 - 1]
344        jz  LookupLoopIsZero
345
346LookupLoop4:
347        and r8d, edx
348
349        movzx   r8d, word ptr [rdi + r8*2]
350        cmp r8d, ebp
351        jbe LeaveNow
352        sub edx, 00010000h
353        js  LeaveNow
354
355LoopEntry4:
356
357        cmp bx,word ptr [rsi + r8 - 1]
358        jnz LookupLoop1
359        jmp LookupLoopIsZero
360
361
362;;; do {
363;;;     match = s->window + cur_match;
364;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
365;;;         *(ushf*)match != scan_start) continue;
366;;;     [...]
367;;; } while ((cur_match = prev[cur_match & wmask]) > limit
368;;;          && --chain_length != 0);
369;;;
370;;; Here is the inner loop of the function. The function will spend the
371;;; majority of its time in this loop, and majority of that time will
372;;; be spent in the first ten instructions.
373;;;
374;;; Within this loop:
375;;; ebx = scanend
376;;; r8d = curmatch
377;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
378;;; esi = windowbestlen - i.e., (window + bestlen)
379;;; edi = prev
380;;; ebp = limit
381
382LookupLoop:
383        and r8d, edx
384
385        movzx   r8d, word ptr [rdi + r8*2]
386        cmp r8d, ebp
387        jbe LeaveNow
388        sub edx, 00010000h
389        js  LeaveNow
390
391LoopEntry:
392
393        cmp bx,word ptr [rsi + r8 - 1]
394        jnz LookupLoop1
395LookupLoopIsZero:
396        cmp     r12w, word ptr [r10 + r8]
397        jnz LookupLoop1
398
399
400;;; Store the current value of chainlen.
401        mov [chainlenwmask], edx
402
403;;; Point edi to the string under scrutiny, and esi to the string we
404;;; are hoping to match it up with. In actuality, esi and edi are
405;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
406;;; initialized to -(MAX_MATCH_8 - scanalign).
407
408        lea rsi,[r8+r10]
409        mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8)
410        lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8]
411        lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8]
412
413        prefetcht1 [rsi+rdx]
414        prefetcht1 [rdi+rdx]
415
416
417;;; Test the strings for equality, 8 bytes at a time. At the end,
418;;; adjust rdx so that it is offset to the exact byte that mismatched.
419;;;
420;;; We already know at this point that the first three bytes of the
421;;; strings match each other, and they can be safely passed over before
422;;; starting the compare loop. So what this code does is skip over 0-3
423;;; bytes, as much as necessary in order to dword-align the edi
424;;; pointer. (rsi will still be misaligned three times out of four.)
425;;;
426;;; It should be confessed that this loop usually does not represent
427;;; much of the total running time. Replacing it with a more
428;;; straightforward "rep cmpsb" would not drastically degrade
429;;; performance.
430
431
432LoopCmps:
433        mov rax, [rsi + rdx]
434        xor rax, [rdi + rdx]
435        jnz LeaveLoopCmps
436
437        mov rax, [rsi + rdx + 8]
438        xor rax, [rdi + rdx + 8]
439        jnz LeaveLoopCmps8
440
441
442        mov rax, [rsi + rdx + 8+8]
443        xor rax, [rdi + rdx + 8+8]
444        jnz LeaveLoopCmps16
445
446        add rdx,8+8+8
447
448        jnz short LoopCmps
449        jmp short LenMaximum
450LeaveLoopCmps16: add rdx,8
451LeaveLoopCmps8: add rdx,8
452LeaveLoopCmps:
453
454        test    eax, 0000FFFFh
455        jnz LenLower
456
457        test eax,0ffffffffh
458
459        jnz LenLower32
460
461        add rdx,4
462        shr rax,32
463        or ax,ax
464        jnz LenLower
465
466LenLower32:
467        shr eax,16
468        add rdx,2
469LenLower:   sub al, 1
470        adc rdx, 0
471;;; Calculate the length of the match. If it is longer than MAX_MATCH,
472;;; then automatically accept it as the best possible match and leave.
473
474        lea rax, [rdi + rdx]
475        sub rax, r9
476        cmp eax, MAX_MATCH
477        jge LenMaximum
478
479;;; If the length of the match is not longer than the best match we
480;;; have so far, then forget it and return to the lookup loop.
481;///////////////////////////////////
482
483        cmp eax, r11d
484        jg  LongerMatch
485
486        lea rsi,[r10+r11]
487
488        mov rdi, prev_ad
489        mov edx, [chainlenwmask]
490        jmp LookupLoop
491
492;;;         s->match_start = cur_match;
493;;;         best_len = len;
494;;;         if (len >= nice_match) break;
495;;;         scan_end = *(ushf*)(scan+best_len-1);
496
497LongerMatch:
498        mov r11d, eax
499        mov match_start, r8d
500        cmp eax, [nicematch]
501        jge LeaveNow
502
503        lea rsi,[r10+rax]
504
505        movzx   ebx, word ptr [r9 + rax - 1]
506        mov rdi, prev_ad
507        mov edx, [chainlenwmask]
508        jmp LookupLoop
509
510;;; Accept the current string, with the maximum possible length.
511
512LenMaximum:
513        mov r11d,MAX_MATCH
514        mov match_start, r8d
515
516;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
517;;; return s->lookahead;
518
519LeaveNow:
520IFDEF INFOZIP
521        mov eax,r11d
522ELSE
523        mov eax, Lookahead
524        cmp r11d, eax
525        cmovng eax, r11d
526ENDIF
527
528;;; Restore the stack and return from whence we came.
529
530
531        mov rsi,[save_rsi]
532        mov rdi,[save_rdi]
533        mov rbx,[save_rbx]
534        mov rbp,[save_rbp]
535        mov r12,[save_r12]
536        mov r13,[save_r13]
537;        mov r14,[save_r14]
538;        mov r15,[save_r15]
539
540
541        ret 0
542; please don't remove this string !
543; Your can freely use gvmat64 in any free or commercial app
544; but it is far better don't remove the string in the binary!
545    db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
546longest_match   ENDP
547
548match_init PROC
549  ret 0
550match_init ENDP
551
552
553END
554