;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8
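;
; As a hedged, C-style sketch of what LFH_FILTER_AND_HEV_MASK computes
; below (after the vp8_filter_mask/vp8_hevmask reference logic; the names
; here are illustrative only), per pixel:
;
;   mask = (abs(p3-p2) <= limit) && (abs(p2-p1) <= limit) &&
;          (abs(p1-p0) <= limit) && (abs(q1-q0) <= limit) &&
;          (abs(q2-q1) <= limit) && (abs(q3-q2) <= limit) &&
;          (abs(p0-q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit)
;   hev  = (abs(p1-p0) > thresh) || (abs(q1-q0) > thresh)
;
; All of the "<= limit" tests are folded into one running pmaxub plus a
; single saturating subtract, so a nonzero byte survives only where some
; threshold is exceeded.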

%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2,                   [rdi+2*rax]       ; q3
        movdqa      xmm1,                   [rsi+2*rax]       ; q2
        movdqa      xmm4,                   [rsi+rax]         ; q1
        movdqa      xmm5,                   [rsi]             ; q0
        neg         rax                     ; negate pitch to deal with above border
%else
        movlps      xmm2,                   [rsi + rcx*2]     ; q3
        movlps      xmm1,                   [rsi + rcx]       ; q2
        movlps      xmm4,                   [rsi]             ; q1
        movlps      xmm5,                   [rsi + rax]       ; q0

        movhps      xmm2,                   [rdi + rcx*2]
        movhps      xmm1,                   [rdi + rcx]
        movhps      xmm4,                   [rdi]
        movhps      xmm5,                   [rdi + rax]

        lea         rsi,                    [rsi + rax*4]
        lea         rdi,                    [rdi + rax*4]

        movdqa      XMMWORD PTR [rsp],      xmm1              ; store q2
        movdqa      XMMWORD PTR [rsp + 16], xmm4              ; store q1
%endif

        movdqa      xmm6,                   xmm1              ; q2
        movdqa      xmm3,                   xmm4              ; q1

        psubusb     xmm1,                   xmm2              ; q2-=q3
        psubusb     xmm2,                   xmm6              ; q3-=q2

        psubusb     xmm4,                   xmm6              ; q1-=q2
        psubusb     xmm6,                   xmm3              ; q2-=q1

        por         xmm4,                   xmm6              ; abs(q2-q1)
        por         xmm1,                   xmm2              ; abs(q3-q2)

        movdqa      xmm0,                   xmm5              ; q0
        pmaxub      xmm1,                   xmm4

        psubusb     xmm5,                   xmm3              ; q0-=q1
        psubusb     xmm3,                   xmm0              ; q1-=q0

        por         xmm5,                   xmm3              ; abs(q0-q1)
        movdqa      t0,                     xmm5              ; save to t0

        pmaxub      xmm1,                   xmm5

%if %1
        movdqa      xmm2,                   [rsi+4*rax]       ; p3
        movdqa      xmm4,                   [rdi+4*rax]       ; p2
        movdqa      xmm6,                   [rsi+2*rax]       ; p1
%else
        movlps      xmm2,                   [rsi + rax]       ; p3
        movlps      xmm4,                   [rsi]             ; p2
        movlps      xmm6,                   [rsi + rcx]       ; p1

        movhps      xmm2,                   [rdi + rax]
        movhps      xmm4,                   [rdi]
        movhps      xmm6,                   [rdi + rcx]

        movdqa      XMMWORD PTR [rsp + 32], xmm4              ; store p2
        movdqa      XMMWORD PTR [rsp + 48], xmm6              ; store p1
%endif

        movdqa      xmm5,                   xmm4              ; p2
        movdqa      xmm3,                   xmm6              ; p1

        psubusb     xmm4,                   xmm2              ; p2-=p3
        psubusb     xmm2,                   xmm5              ; p3-=p2

        psubusb     xmm3,                   xmm5              ; p1-=p2
        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)

        psubusb     xmm5,                   xmm6              ; p2-=p1
        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)

        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
        movdqa      xmm2,                   xmm6              ; p1

        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
%if %1
        movdqa      xmm4,                   [rsi+rax]         ; p0
        movdqa      xmm3,                   [rdi]             ; q1
%else
        movlps      xmm4,                   [rsi + rcx*2]     ; p0
        movhps      xmm4,                   [rdi + rcx*2]
        movdqa      xmm3,                   q1                ; q1
%endif

        movdqa      xmm5,                   xmm4              ; p0
        psubusb     xmm4,                   xmm6              ; p0-=p1

        psubusb     xmm6,                   xmm5              ; p1-=p0

        por         xmm6,                   xmm4              ; abs(p1 - p0)
        mov         rdx,                    arg(2)            ; get flimit

        movdqa      t1,                     xmm6              ; save to t1

        movdqa      xmm4,                   xmm3              ; q1
        pmaxub      xmm1,                   xmm6

        psubusb     xmm3,                   xmm2              ; q1-=p1
        psubusb     xmm2,                   xmm4              ; p1-=q1

        psubusb     xmm1,                   xmm7              ; max abs diff minus limit (nonzero where a limit is exceeded)
        por         xmm2,                   xmm3              ; abs(p1-q1)

        movdqa      xmm4,                   XMMWORD PTR [rdx] ; flimit

        movdqa      xmm3,                   xmm0              ; q0
        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero

        mov         rdx,                    arg(4)            ; get thresh (hev)

        movdqa      xmm6,                   xmm5              ; p0
        psrlw       xmm2,                   1                 ; abs(p1-q1)/2

        psubusb     xmm5,                   xmm3              ; p0-=q0
        paddb       xmm4,                   xmm4              ; flimit*2 (less than 255)

        psubusb     xmm3,                   xmm6              ; q0-=p0
        por         xmm5,                   xmm3              ; abs(p0 - q0)

        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
        paddb       xmm7,                   xmm4              ; flimit * 2 + limit (less than 255)

        movdqa      xmm4,                   t0                ; hev get abs (q1 - q0)

        movdqa      xmm3,                   t1                ; get abs (p1 - p0)

        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2,                   XMMWORD PTR [rdx] ; hev thresh

        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        psubusb     xmm4,                   xmm2              ; abs(q1 - q0) - thresh

        psubusb     xmm3,                   xmm2              ; abs(p1 - p0) - thresh
        por         xmm1,                   xmm5              ; combined mask residue

        pxor        xmm7,                   xmm7              ; zero for compare
        paddb       xmm4,                   xmm3              ; hev residue: abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     xmm4,                   xmm5              ; hev (xmm5 is zero wherever the mask passes, so this compares with 0 there)
        pcmpeqb     xmm3,                   xmm3              ; all ones

        pcmpeqb     xmm1,                   xmm7              ; mask: 0xFF where all breakout tests pass
        pxor        xmm4,                   xmm3              ; invert to get the final hev mask
%endmacro

%macro B_FILTER 1
%if %1 == 0
        movdqa      xmm2,                   p1                ; p1
        movdqa      xmm7,                   q1                ; q1
%elif %1 == 1
        movdqa      xmm2,                   [rsi+2*rax]       ; p1
        movdqa      xmm7,                   [rdi]             ; q1
%elif %1 == 2
        lea         rdx,                    srct

        movdqa      xmm2,                   [rdx]             ; p1
        movdqa      xmm7,                   [rdx+48]          ; q1
        movdqa      xmm6,                   [rdx+16]          ; p0
        movdqa      xmm0,                   [rdx+32]          ; q0
%endif

        pxor        xmm2,                   [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7,                   [GLOBAL(t80)]     ; q1 offset to convert to signed values

        psubsb      xmm2,                   xmm7              ; p1 - q1
        pxor        xmm6,                   [GLOBAL(t80)]     ; offset to convert to signed values

        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0,                   [GLOBAL(t80)]     ; offset to convert to signed values

        movdqa      xmm3,                   xmm0              ; q0
        psubsb      xmm0,                   xmm6              ; q0 - p0

        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)

        pand        xmm1,                   xmm2              ; mask filter values we don't care about

        movdqa      xmm2,                   xmm1

        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
        punpcklbw   xmm2,                   xmm2              ; exfxgxhx

        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
        psraw       xmm5,                   11                ; sign extended shift right by 3

        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
        psraw       xmm2,                   11                ; sign extended shift right by 3

        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        psraw       xmm0,                   11                ; sign extended shift right by 3

        psraw       xmm1,                   11                ; sign extended shift right by 3
        movdqa      xmm5,                   xmm0              ; save results

        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      xmm5,                   [GLOBAL(ones)]    ; +1 (round)

        paddsw      xmm1,                   [GLOBAL(ones)]    ; +1 (round)
        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap

        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap

        paddsb      xmm6,                   xmm2              ; p0+= p0 add
        packsswb    xmm5,                   xmm1              ; (Filter1 + 1) >> 1 for the 2nd tap

%if %1 == 0
        movdqa      xmm1,                   p1                ; p1
%elif %1 == 1
        movdqa      xmm1,                   [rsi+2*rax]       ; p1
%elif %1 == 2
        movdqa      xmm1,                   [rdx]             ; p1
%endif
        pandn       xmm4,                   xmm5              ; high edge variance additive
        pxor        xmm6,                   [GLOBAL(t80)]     ; unoffset

        pxor        xmm1,                   [GLOBAL(t80)]     ; reoffset
        psubsb      xmm3,                   xmm0              ; q0-= q0 add

        paddsb      xmm1,                   xmm4              ; p1+= p1 add
        pxor        xmm3,                   [GLOBAL(t80)]     ; unoffset

        pxor        xmm1,                   [GLOBAL(t80)]     ; unoffset
        psubsb      xmm7,                   xmm4              ; q1-= q1 add

        pxor        xmm7,                   [GLOBAL(t80)]     ; unoffset
%if %1 == 0
        lea         rsi,                    [rsi + rcx*2]
        lea         rdi,                    [rdi + rcx*2]
        movq        MMWORD PTR [rsi],       xmm6              ; p0
        movhps      MMWORD PTR [rdi],       xmm6
        movq        MMWORD PTR [rsi + rax], xmm1              ; p1
        movhps      MMWORD PTR [rdi + rax], xmm1
        movq        MMWORD PTR [rsi + rcx], xmm3              ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3
        movq        MMWORD PTR [rsi + rcx*2],xmm7             ; q1
        movhps      MMWORD PTR [rdi + rcx*2],xmm7
%elif %1 == 1
        movdqa      [rsi+rax],              xmm6              ; write back
        movdqa      [rsi+2*rax],            xmm1              ; write back
        movdqa      [rsi],                  xmm3              ; write back
        movdqa      [rdi],                  xmm7              ; write back
%endif

%endmacro
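
; A hedged C-style sketch of the inner-edge filter above (after the
; vp8_filter reference; values are signed, i.e. pixel ^ 0x80, and the
; names below are illustrative only):
;
;   a       = clamp(ps1 - qs1) & hev
;   a       = clamp(a + 3 * (qs0 - ps0)) & mask
;   Filter1 = clamp(a + 4) >> 3;   qs0 -= Filter1
;   Filter2 = clamp(a + 3) >> 3;   ps0 += Filter2
;   a       = ((Filter1 + 1) >> 1) & ~hev;  qs1 -= a;  ps1 += a
;
; The psraw-by-11 trick: each clamped filter byte is placed in the high
; byte of a word (punpck*bw), so an arithmetic word shift by 11 yields the
; byte value arithmetically shifted right by 3.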


;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_horizontal_edge_sse2)
sym(vp8_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)           ;src_ptr
        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step

        mov         rdx,                    arg(3)           ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the result
        B_FILTER 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_horizontal_edge_uv_sse2)
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96       ; reserve 96 bytes
    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx,                    arg(3)             ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the result
        B_FILTER 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


%macro MB_FILTER_AND_WRITEBACK 1
%if %1 == 0
        movdqa      xmm2,                   p1              ; p1
        movdqa      xmm7,                   q1              ; q1
%elif %1 == 1
        movdqa      xmm2,                   [rsi+2*rax]     ; p1
        movdqa      xmm7,                   [rdi]           ; q1

        mov         rcx,                    rax             ; rax is -pitch after LFH_FILTER_AND_HEV_MASK
        neg         rcx                                     ; rcx = +pitch
%elif %1 == 2
        lea         rdx,                    srct

        movdqa      xmm2,                   [rdx+32]        ; p1
        movdqa      xmm7,                   [rdx+80]        ; q1
        movdqa      xmm6,                   [rdx+48]        ; p0
        movdqa      xmm0,                   [rdx+64]        ; q0
%endif

        pxor        xmm2,                   [GLOBAL(t80)]   ; p1 offset to convert to signed values
        pxor        xmm7,                   [GLOBAL(t80)]   ; q1 offset to convert to signed values
        pxor        xmm6,                   [GLOBAL(t80)]   ; offset to convert to signed values
        pxor        xmm0,                   [GLOBAL(t80)]   ; offset to convert to signed values

        psubsb      xmm2,                   xmm7            ; p1 - q1
        movdqa      xmm3,                   xmm0            ; q0

        psubsb      xmm0,                   xmm6            ; q0 - p0

        paddsb      xmm2,                   xmm0            ; 1 * (q0 - p0) + (p1 - q1)

        paddsb      xmm2,                   xmm0            ; 2 * (q0 - p0) + (p1 - q1)

        paddsb      xmm2,                   xmm0            ; 3 * (q0 - p0) + (p1 - q1)

        pand        xmm1,                   xmm2            ; mask filter values we don't care about

        movdqa      xmm2,                   xmm1            ; vp8_filter

        pand        xmm2,                   xmm4            ; Filter2 = vp8_filter & hev
        pxor        xmm0,                   xmm0            ; zero

        pandn       xmm4,                   xmm1            ; vp8_filter&=~hev
        pxor        xmm1,                   xmm1            ; zero

        punpcklbw   xmm0,                   xmm4            ; Filter 2 (lo)
        movdqa      xmm5,                   xmm2

        punpckhbw   xmm1,                   xmm4            ; Filter 2 (hi)
        paddsb      xmm5,                   [GLOBAL(t3)]    ; vp8_signed_char_clamp(Filter2 + 3)

        pmulhw      xmm1,                   [GLOBAL(s9)]    ; Filter 2 (hi) * 9

        pmulhw      xmm0,                   [GLOBAL(s9)]    ; Filter 2 (lo) * 9

        punpckhbw   xmm7,                   xmm5            ; axbxcxdx
        paddsb      xmm2,                   [GLOBAL(t4)]    ; vp8_signed_char_clamp(Filter2 + 4)

        punpcklbw   xmm5,                   xmm5            ; exfxgxhx
        psraw       xmm7,                   11              ; sign extended shift right by 3

        psraw       xmm5,                   11              ; sign extended shift right by 3
        punpckhbw   xmm4,                   xmm2            ; axbxcxdx

        punpcklbw   xmm2,                   xmm2            ; exfxgxhx
        psraw       xmm4,                   11              ; sign extended shift right by 3

        packsswb    xmm5,                   xmm7            ; Filter2 >>=3;
        psraw       xmm2,                   11              ; sign extended shift right by 3

        packsswb    xmm2,                   xmm4            ; Filter1 >>=3;
        movdqa      xmm7,                   xmm1

        paddsb      xmm6,                   xmm5            ; ps0 = ps0 + Filter2
        movdqa      xmm4,                   xmm1

        psubsb      xmm3,                   xmm2            ; qs0 = qs0 - Filter1
        movdqa      xmm5,                   xmm0

        movdqa      xmm2,                   xmm5
        paddw       xmm0,                   [GLOBAL(s63)]   ; Filter 2 (lo) * 9 + 63

        paddw       xmm1,                   [GLOBAL(s63)]   ; Filter 2 (hi) * 9 + 63
        paddw       xmm5,                   xmm5            ; Filter 2 (lo) * 18

        paddw       xmm7,                   xmm7            ; Filter 2 (hi) * 18
        paddw       xmm5,                   xmm0            ; Filter 2 (lo) * 27 + 63

        paddw       xmm7,                   xmm1            ; Filter 2 (hi) * 27 + 63
        paddw       xmm2,                   xmm0            ; Filter 2 (lo) * 18 + 63

        paddw       xmm4,                   xmm1            ; Filter 2 (hi) * 18 + 63
        psraw       xmm0,                   7               ; (Filter 2 (lo) * 9 + 63) >> 7

        psraw       xmm1,                   7               ; (Filter 2 (hi) * 9 + 63) >> 7
        psraw       xmm2,                   7               ; (Filter 2 (lo) * 18 + 63) >> 7

        packsswb    xmm0,                   xmm1            ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
        psraw       xmm4,                   7               ; (Filter 2 (hi) * 18 + 63) >> 7

        psraw       xmm5,                   7               ; (Filter 2 (lo) * 27 + 63) >> 7
        packsswb    xmm2,                   xmm4            ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

        psraw       xmm7,                   7               ; (Filter 2 (hi) * 27 + 63) >> 7

        packsswb    xmm5,                   xmm7            ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

        psubsb      xmm3,                   xmm5            ; sq = vp8_signed_char_clamp(qs0 - u3)
        paddsb      xmm6,                   xmm5            ; sp = vp8_signed_char_clamp(ps0 + u3)

%if %1 == 0
        movdqa      xmm5,                   q2              ; q2
        movdqa      xmm1,                   q1              ; q1
        movdqa      xmm4,                   p1              ; p1
        movdqa      xmm7,                   p2              ; p2

%elif %1 == 1
        movdqa      xmm5,                   XMMWORD PTR [rdi+rcx]   ; q2
        movdqa      xmm1,                   XMMWORD PTR [rdi]       ; q1
        movdqa      xmm4,                   XMMWORD PTR [rsi+rax*2] ; p1
        movdqa      xmm7,                   XMMWORD PTR [rdi+rax*4] ; p2
%elif %1 == 2
        movdqa      xmm5,                   XMMWORD PTR [rdx+96]    ; q2
        movdqa      xmm1,                   XMMWORD PTR [rdx+80]    ; q1
        movdqa      xmm4,                   XMMWORD PTR [rdx+32]    ; p1
        movdqa      xmm7,                   XMMWORD PTR [rdx+16]    ; p2
%endif

        pxor        xmm3,                   [GLOBAL(t80)]   ; *oq0 = sq^0x80
        pxor        xmm6,                   [GLOBAL(t80)]   ; *op0 = sp^0x80

        pxor        xmm1,                   [GLOBAL(t80)]
        pxor        xmm4,                   [GLOBAL(t80)]

        psubsb      xmm1,                   xmm2            ; sq = vp8_signed_char_clamp(qs1 - u2)
        paddsb      xmm4,                   xmm2            ; sp = vp8_signed_char_clamp(ps1 + u2)

        pxor        xmm1,                   [GLOBAL(t80)]   ; *oq1 = sq^0x80;
        pxor        xmm4,                   [GLOBAL(t80)]   ; *op1 = sp^0x80;

        pxor        xmm7,                   [GLOBAL(t80)]
        pxor        xmm5,                   [GLOBAL(t80)]

        paddsb      xmm7,                   xmm0            ; sp = vp8_signed_char_clamp(ps2 + u1)
        psubsb      xmm5,                   xmm0            ; sq = vp8_signed_char_clamp(qs2 - u1)

        pxor        xmm7,                   [GLOBAL(t80)]   ; *op2 = sp^0x80;
        pxor        xmm5,                   [GLOBAL(t80)]   ; *oq2 = sq^0x80;

%if %1 == 0
        lea         rsi,                    [rsi+rcx*2]
        lea         rdi,                    [rdi+rcx*2]

        movq        MMWORD PTR [rsi],       xmm6            ; p0
        movhps      MMWORD PTR [rdi],       xmm6
        movq        MMWORD PTR [rsi + rcx], xmm3            ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3

        movq        MMWORD PTR [rsi+rcx*2], xmm1            ; q1
        movhps      MMWORD PTR [rdi+rcx*2], xmm1

        movq        MMWORD PTR [rsi + rax], xmm4            ; p1
        movhps      MMWORD PTR [rdi + rax], xmm4

        movq        MMWORD PTR [rsi+rax*2], xmm7            ; p2
        movhps      MMWORD PTR [rdi+rax*2], xmm7

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]
        movq        MMWORD PTR [rsi+rcx*2], xmm5            ; q2
        movhps      MMWORD PTR [rdi+rcx*2], xmm5
%elif %1 == 1
        movdqa      XMMWORD PTR [rdi+rcx],  xmm5            ; q2
        movdqa      XMMWORD PTR [rdi],      xmm1            ; q1
        movdqa      XMMWORD PTR [rsi],      xmm3            ; q0
        movdqa      XMMWORD PTR [rsi+rax  ],xmm6            ; p0
        movdqa      XMMWORD PTR [rsi+rax*2],xmm4            ; p1
        movdqa      XMMWORD PTR [rdi+rax*4],xmm7            ; p2
%elif %1 == 2
        movdqa      XMMWORD PTR [rdx+80],   xmm1            ; q1
        movdqa      XMMWORD PTR [rdx+64],   xmm3            ; q0
        movdqa      XMMWORD PTR [rdx+48],   xmm6            ; p0
        movdqa      XMMWORD PTR [rdx+32],   xmm4            ; p1
%endif

%endmacro
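
; A hedged C-style sketch of the macroblock-edge filter above (after the
; vp8_mbfilter reference; w is the masked filter value and u1/u2/u3 match
; the comments above; names are illustrative only):
;
;   Filter2 = w & hev
;   Filter1 = clamp(Filter2 + 4) >> 3;  qs0 -= Filter1
;   Filter2 = clamp(Filter2 + 3) >> 3;  ps0 += Filter2
;   w &= ~hev                                    ; wider taps only without hev
;   u3 = clamp((27 * w + 63) >> 7);  oq0 = qs0 - u3;  op0 = ps0 + u3
;   u2 = clamp((18 * w + 63) >> 7);  oq1 = qs1 - u2;  op1 = ps1 + u2
;   u1 = clamp(( 9 * w + 63) >> 7);  oq2 = qs2 - u1;  op2 = ps2 + u1
;
; 9 * w comes from pmulhw: w sits in the high byte of each word, so
; multiplying by s9 (presumably 9 << 8 per word) and keeping the high half
; of the 32-bit product yields 9 * w exactly.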


;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_horizontal_edge_sse2)
sym(vp8_mbloop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)            ;src_ptr
        movsxd      rax,                    dword ptr arg(1)  ;src_pixel_step

        mov         rdx,                    arg(3)            ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rdi,                    [rsi+rax]         ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96       ; reserve 96 bytes
    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx,                    arg(3)             ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


%macro TRANSPOSE_16X8 2
        movq        xmm4,               QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm1,               QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
        movq        xmm0,               QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
        movq        xmm7,               QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        movq        xmm5,               QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2,               QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm1,               QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        punpcklbw   xmm0,               xmm7            ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

        movq        xmm7,               QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
        lea         rsi,                [rsi+rax*8]
%else
        mov         rsi,                arg(5)          ; v_ptr
%endif

        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60

        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
%if %1
        lea         rdi,                [rdi+rax*8]
%else
        lea         rsi,                [rsi - 4]
%endif

        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
%if %1
        lea         rdx,                srct
%else
        lea         rdi,                [rsi + rax]     ; rdi points to row +1 for indirect addressing
%endif

        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      t0,                 xmm2            ; save to free XMM2
        movq        xmm2,               QWORD PTR [rsi]       ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm6,               QWORD PTR [rdi]       ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
        movq        xmm0,               QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
        movq        xmm5,               QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        movq        xmm1,               QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        movq        xmm6,               QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm0,               xmm5                  ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm5,               QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0

        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0

        movq        xmm6,               QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

        movdqa      xmm6,               xmm1
        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        movdqa      xmm0,               xmm5
        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
%if %2
        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        movdqa      [rdx],              xmm2            ; save 2

        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+16],           xmm3            ; save 3

        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+32],           xmm4            ; save 4
        movdqa      [rdx+48],           xmm5            ; save 5
        movdqa      xmm1,               t0              ; reload cols 0-1 saved above

        movdqa      xmm2,               xmm1
        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
%else
        movdqa      [rdx+112],          xmm7            ; save 7

        movdqa      [rdx+96],           xmm6            ; save 6

        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        movdqa      [rdx+32],           xmm2            ; save 2

        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+48],           xmm3            ; save 3

        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+64],           xmm4            ; save 4
        movdqa      [rdx+80],           xmm5            ; save 5
        movdqa      xmm1,               t0              ; reload cols 0-1 saved above

        movdqa      xmm2,               xmm1
        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        movdqa      [rdx+16],           xmm1

        movdqa      [rdx],              xmm2
%endif
%endmacro
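
; The transpose above is the usual SSE2 interleave tree: punpcklbw merges
; pairs of rows at byte granularity, punpck[lh]wd merges those results at
; word granularity, then punpck[lh]dq and punpck[lh]qdq finish at dword and
; qword granularity, leaving one output column per register (a hedged
; summary; the per-step lane comments above are authoritative).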

%macro LFV_FILTER_MASK_HEV_MASK 1
        movdqa      xmm0,               xmm6            ; q2
        psubusb     xmm0,               xmm7            ; q2-q3

        psubusb     xmm7,               xmm6            ; q3-q2
        movdqa      xmm4,               xmm5            ; q1

        por         xmm7,               xmm0            ; abs (q3-q2)
        psubusb     xmm4,               xmm6            ; q1-q2

        movdqa      xmm0,               xmm1
        psubusb     xmm6,               xmm5            ; q2-q1

        por         xmm6,               xmm4            ; abs (q2-q1)
        psubusb     xmm0,               xmm2            ; p2 - p3

        psubusb     xmm2,               xmm1            ; p3 - p2
        por         xmm0,               xmm2            ; abs(p2-p3)
%if %1
        movdqa      xmm2,               [rdx]           ; p1
%else
        movdqa      xmm2,               [rdx+32]        ; p1
%endif
        movdqa      xmm5,               xmm2            ; p1
        pmaxub      xmm0,               xmm7

        psubusb     xmm5,               xmm1            ; p1-p2
        psubusb     xmm1,               xmm2            ; p2-p1

        movdqa      xmm7,               xmm3            ; p0
        psubusb     xmm7,               xmm2            ; p0-p1

        por         xmm1,               xmm5            ; abs(p2-p1)
        pmaxub      xmm0,               xmm6

        pmaxub      xmm0,               xmm1
        movdqa      xmm1,               xmm2            ; p1

        psubusb     xmm2,               xmm3            ; p1-p0
        lea         rdx,                srct

        por         xmm2,               xmm7            ; abs(p1-p0)

        movdqa      t0,                 xmm2            ; save abs(p1-p0)

        pmaxub      xmm0,               xmm2

%if %1
        movdqa      xmm5,               [rdx+32]        ; q0
        movdqa      xmm7,               [rdx+48]        ; q1
%else
        movdqa      xmm5,               [rdx+64]        ; q0
        movdqa      xmm7,               [rdx+80]        ; q1
%endif
        mov         rdx,                arg(3)          ; limit

        movdqa      xmm6,               xmm5            ; q0
        movdqa      xmm2,               xmm7            ; q1

        psubusb     xmm5,               xmm7            ; q0-q1
        psubusb     xmm7,               xmm6            ; q1-q0

        por         xmm7,               xmm5            ; abs(q1-q0)

        movdqa      t1,                 xmm7            ; save abs(q1-q0)

        movdqa      xmm4,               XMMWORD PTR [rdx] ; limit

        pmaxub      xmm0,               xmm7
        mov         rdx,                arg(2)          ; flimit

        psubusb     xmm0,               xmm4            ; max abs diff minus limit (nonzero where a limit is exceeded)
        movdqa      xmm5,               xmm2            ; q1

        psubusb     xmm5,               xmm1            ; q1-=p1
        psubusb     xmm1,               xmm2            ; p1-=q1

        por         xmm5,               xmm1            ; abs(p1-q1)
        movdqa      xmm1,               xmm3            ; p0

        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
        psubusb     xmm1,               xmm6            ; p0-q0

        psrlw       xmm5,               1               ; abs(p1-q1)/2
        psubusb     xmm6,               xmm3            ; q0-p0

        movdqa      xmm2,               XMMWORD PTR [rdx] ; flimit

        mov         rdx,                arg(4)          ; get thresh

        por         xmm1,               xmm6            ; abs(q0-p0)
        paddb       xmm2,               xmm2            ; flimit*2 (less than 255)

        movdqa      xmm6,               t0              ; get abs (p1 - p0)

        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2

        movdqa      xmm3,               t1              ; get abs (q1 - q0)

        movdqa      xmm7,               XMMWORD PTR [rdx] ; thresh

        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm6,               xmm7            ; abs(p1 - p0) - thresh

        paddb       xmm4,               xmm2            ; flimit * 2 + limit (less than 255)
        psubusb     xmm3,               xmm7            ; abs(q1 - q0) - thresh

        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        por         xmm6,               xmm3            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        por         xmm1,               xmm0            ; mask
        pcmpeqb     xmm6,               xmm0            ; hev (xmm0 is zero wherever the mask passes, so this compares with 0 there)

        pxor        xmm0,               xmm0            ; zero
        pcmpeqb     xmm4,               xmm4            ; all ones

        pcmpeqb     xmm1,               xmm0            ; mask: 0xFF where all breakout tests pass
        pxor        xmm4,               xmm6            ; invert to get the final hev mask
%endmacro

%macro BV_TRANSPOSE 0
        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro

%macro BV_WRITEBACK 2
        movd        [rsi+2],            %1
        psrldq      %1,                 4

        movd        [rdi+2],            %1
        psrldq      %1,                 4

        movd        [rsi+2*rax+2],      %1
        psrldq      %1,                 4

        movd        [rdi+2*rax+2],      %1

        movd        [rsi+4*rax+2],      %2
        psrldq      %2,                 4

        movd        [rdi+4*rax+2],      %2
        psrldq      %2,                 4

        movd        [rsi+2*rcx+2],      %2
        psrldq      %2,                 4

        movd        [rdi+2*rcx+2],      %2
%endmacro
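
; BV_WRITEBACK stores only the four filtered middle columns. Each row was
; loaded from [src - 4], so bytes 0..7 of a row are p3..q3 and the 4-byte
; store at offset +2 covers exactly p1 p0 q0 q1; p3/p2 and q2/q3 are read
; for the mask but never modified by the inner-edge filter.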


;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_vertical_edge_sse2)
sym(vp8_loop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub             rsp, 96      ; reserve 96 bytes
    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];

        mov         rsi,        arg(0)                  ; src_ptr
        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

        lea         rsi,        [rsi - 4]
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        lea         rcx,        [rax*2+rax]             ; pitch*3

        ; transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 1, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 1

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE
        ; store 16-line result

        lea         rdx,        [rax]
        neg         rdx

        BV_WRITEBACK xmm1, xmm5

        lea         rsi,        [rsi+rdx*8]
        lea         rdi,        [rdi+rdx*8]
        BV_WRITEBACK xmm2, xmm6

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_vertical_edge_uv_sse2)
sym(vp8_loop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub             rsp, 96      ; reserve 96 bytes
    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];

        mov         rsi,        arg(0)                  ; u_ptr
        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

        lea         rsi,        [rsi - 4]
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        lea         rcx,        [rax+2*rax]             ; pitch*3

        lea         rdx,        srct

        ; transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 0, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 1

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE

        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing

        ; store 16-line result
        BV_WRITEBACK xmm1, xmm5

        mov         rsi,        arg(0)                  ; u_ptr
        lea         rsi,        [rsi - 4]
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        BV_WRITEBACK xmm2, xmm6

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%macro MBV_TRANSPOSE 0
        movdqa      xmm0,               [rdx]               ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        movdqa      xmm1,               xmm0                ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        punpcklbw   xmm0,               xmm7                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1,               xmm7                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm2,               [rdx+32]            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      xmm6,               xmm2                ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpcklbw   xmm2,               [rdx+48]            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm6,               [rdx+48]            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm3,               xmm0                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpcklwd   xmm0,               xmm2                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckhwd   xmm3,               xmm2                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        movdqa      xmm4,               xmm1                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        punpcklwd   xmm1,               xmm6                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm4,               xmm6                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        movdqa      xmm2,               [rdx+64]            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpcklbw   xmm2,               [rdx+80]            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        movdqa      xmm6,               xmm5                ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
        punpcklbw   xmm6,               [rdx+112]           ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06

        movdqa      xmm7,               xmm2                ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
        punpcklwd   xmm2,               xmm6                ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04

        punpckhwd   xmm7,               xmm6                ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
        movdqa      xmm6,               xmm0                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckldq   xmm0,               xmm2                ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
        punpckhdq   xmm6,               xmm2                ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro
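
; The macroblock vertical filter rewrites whole 8-byte rows (p3..q3 live at
; [src-4]..[src+3]), so MBV_TRANSPOSE and the two writeback macros below
; rebuild and store full rows, 8 rows per pass, with the callers advancing
; rsi/rdi by 8 lines between MBV_WRITEBACK_1 and MBV_WRITEBACK_2.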

%macro MBV_WRITEBACK_1 0
        movq        QWORD  PTR [rsi],   xmm0
        movhps      MMWORD PTR [rdi],   xmm0

        movq        QWORD  PTR [rsi+2*rax], xmm6
        movhps      MMWORD PTR [rdi+2*rax], xmm6

        movdqa      xmm0,               xmm3                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        punpckldq   xmm0,               xmm7                ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40

        punpckhdq   xmm3,               xmm7                ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60

        movq        QWORD  PTR [rsi+4*rax], xmm0
        movhps      MMWORD PTR [rdi+4*rax], xmm0

        movq        QWORD  PTR [rsi+2*rcx], xmm3
        movhps      MMWORD PTR [rdi+2*rcx], xmm3

        movdqa      xmm2,               [rdx+64]            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm2,               [rdx+80]            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        punpckhbw   xmm5,               [rdx+112]           ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
        movdqa      xmm0,               xmm2

        punpcklwd   xmm0,               xmm5                ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
        punpckhwd   xmm2,               xmm5                ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4

        movdqa      xmm5,               xmm1                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckldq   xmm1,               xmm0                ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80

        punpckhdq   xmm5,               xmm0                ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
%endmacro

%macro MBV_WRITEBACK_2 0
        movq        QWORD  PTR [rsi],   xmm1
        movhps      MMWORD PTR [rdi],   xmm1

        movq        QWORD  PTR [rsi+2*rax], xmm5
        movhps      MMWORD PTR [rdi+2*rax], xmm5

        movdqa      xmm1,               xmm4                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
        punpckldq   xmm1,               xmm2                ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
        punpckhdq   xmm4,               xmm2                ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0

        movq        QWORD  PTR [rsi+4*rax], xmm1
        movhps      MMWORD PTR [rdi+4*rax], xmm1

        movq        QWORD  PTR [rsi+2*rcx], xmm4
        movhps      MMWORD PTR [rdi+2*rcx], xmm4
%endmacro


;void vp8_mbloop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_vertical_edge_sse2)
sym(vp8_mbloop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, 160     ; reserve 160 bytes
    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[128];
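
        ; Flow (for reference): TRANSPOSE_16X8 gathers the pixels on either
        ; side of the vertical edge into 16-lane registers, the shared
        ; LFV_FILTER_MASK_HEV_MASK / MB_FILTER_AND_WRITEBACK macros then run
        ; the regular filter on those lanes, and MBV_TRANSPOSE plus the
        ; writeback macros restore the vertical orientation.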

        mov         rsi,                arg(0)              ; src_ptr
        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step

        lea         rsi,                [rsi - 4]
        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
        lea         rcx,                [rax*2+rax]

        ; Transpose
        TRANSPOSE_16X8 1, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 0

        neg         rax
        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        lea         rsi,                [rsi+rax*8]
        lea         rdi,                [rdi+rax*8]

        ; transpose and write back
        MBV_TRANSPOSE

        neg         rax

        MBV_WRITEBACK_1

        lea         rsi,                [rsi+rax*8]
        lea         rdi,                [rdi+rax*8]
        MBV_WRITEBACK_2

    add rsp, 160
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, 160     ; reserve 160 bytes
    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[128];

        mov         rsi,                arg(0)              ; u_ptr
        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step

        lea         rsi,                [rsi - 4]
        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
        lea         rcx,                [rax*2+rax]

        lea         rdx,                srct

        ; Transpose
        TRANSPOSE_16X8 0, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 0

        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        ; transpose and write back
        MBV_TRANSPOSE

        mov         rsi,                arg(0)             ; u_ptr
        lea         rsi,                [rsi - 4]
        lea         rdi,                [rsi + rax]
        MBV_WRITEBACK_1
        mov         rsi,                arg(5)             ; v_ptr
        lea         rsi,                [rsi - 4]
        lea         rdi,                [rsi + rax]
        MBV_WRITEBACK_2

    add rsp, 160
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step (pitch)
        mov         rdx, arg(2)             ; flimit
        movdqa      xmm3, XMMWORD PTR [rdx]
        mov         rdx, arg(3)             ; limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        paddb       xmm3, xmm3              ; flimit*2 (less than 255)
        paddb       xmm3, xmm7              ; flimit * 2 + limit (less than 255)

        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
        add         rdi, rax
        neg         rax
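
        ; With rax negated, rsi itself addresses q0 and the p rows sit at
        ; negative offsets: [rsi+2*rax] = p1, [rsi+rax] = p0, [rsi] = q0,
        ; [rdi] = q1.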

        ; calculate mask
        movdqu      xmm1, [rsi+2*rax]       ; p1
        movdqu      xmm0, [rdi]             ; q1
        movdqa      xmm2, xmm1
        movdqa      xmm7, xmm0
        movdqa      xmm4, xmm0
        psubusb     xmm0, xmm1              ; q1-=p1
        psubusb     xmm1, xmm4              ; p1-=q1
        por         xmm1, xmm0              ; abs(p1-q1)
        pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psrlw       xmm1, 1                 ; abs(p1-q1)/2
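        ; Note: SSE2 has no per-byte shift, so the halving uses a 16-bit
        ; shift; masking with 0xfe first keeps a byte's lsb from leaking
        ; into its lower neighbour.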

        movdqu      xmm5, [rsi+rax]         ; p0
        movdqu      xmm4, [rsi]             ; q0
        movdqa      xmm0, xmm4              ; q0
        movdqa      xmm6, xmm5              ; p0
        psubusb     xmm5, xmm4              ; p0-=q0
        psubusb     xmm4, xmm6              ; q0-=p0
        por         xmm5, xmm4              ; abs(p0 - q0)
        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        pxor        xmm3, xmm3
        pcmpeqb     xmm5, xmm3
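        ; Scalar equivalent of the mask (illustrative only, not assembled):
        ;   mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit)
        ;          ? 0xff : 0x00;
        ; psubusb saturates to zero when the sum is within the limit, and
        ; pcmpeqb against zero turns that into the 0xff/0x00 byte mask.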

        ; start work on filters
        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
        psubsb      xmm2, xmm7              ; p1 - q1

        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values
        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0
        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5, xmm2              ; mask filter values we don't care about
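
        ; Scalar sketch of the simple filter applied below (illustrative only):
        ;   F   = clamp(p1 - q1 + 3 * (q0 - p0)) & mask;
        ;   q0 -= clamp(F + 4) >> 3;
        ;   p0 += clamp(F + 3) >> 3;
        ; where clamp() is signed-byte saturation (the psubsb/paddsb used here).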

        ; do + 4 side
        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4

        movdqa      xmm0, xmm5              ; get a copy of filters
        psllw       xmm0, 8                 ; shift left 8
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8
        movdqa      xmm1, xmm5              ; get a copy of filters
        psraw       xmm1, 11                ; arithmetic shift right 11
        psllw       xmm1, 8                 ; shift left 8 to put it back

        por         xmm0, xmm1              ; put the two together to get result
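        ; SSE2 lacks a per-byte arithmetic shift, so the signed >>3 is built
        ; from word shifts: the low bytes go through psllw 8 / psraw 3 /
        ; psrlw 8, the high bytes through psraw 11 / psllw 8, and por merges
        ; the two halves.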

        psubsb      xmm3, xmm0              ; q0 -= ((F + 4) >> 3)
        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
        movdqu      [rsi], xmm3             ; write back

        ; now do +3 side
        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4

        movdqa      xmm0, xmm5              ; get a copy of filters
        psllw       xmm0, 8                 ; shift left 8
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8
        psraw       xmm5, 11                ; arithmetic shift right 11
        psllw       xmm5, 8                 ; shift left 8 to put it back
        por         xmm0, xmm5              ; put the two together to get result

        paddsb      xmm6, xmm0              ; p0 += ((F + 3) >> 3)
        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
        movdqu      [rsi+rax], xmm6         ; write back
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_simple_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
global sym(vp8_loop_filter_simple_vertical_edge_sse2)
sym(vp8_loop_filter_simple_vertical_edge_sse2):
    push        rbp         ; save old base pointer value.
    mov         rbp, rsp    ; set new base pointer value.
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx         ; save callee-saved reg
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step (pitch)

        lea         rsi,        [rsi - 2]               ; start two pixels left of the edge
        lea         rdi,        [rsi + rax]             ; row 1
        lea         rdx,        [rsi + rax*4]           ; row 4
        lea         rcx,        [rdx + rax]             ; row 5
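
        ; Each row contributes 4 bytes (p1 p0 q0 q1); the loads and unpacks
        ; below transpose 16 such rows into four 16-lane registers, one per
        ; column, so the horizontal filter code can be reused unchanged.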

        movdqu      xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
        movdqu      xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
        movdqu      xmm2,       [rdi]                   ; 13 12 11 10
        movdqu      xmm3,       [rcx]                   ; 53 52 51 50
        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10

        movdqu      xmm4,       [rsi + rax*2]           ; 23 22 21 20
        movdqu      xmm5,       [rdx + rax*2]           ; 63 62 61 60
        movdqu      xmm6,       [rdi + rax*2]           ; 33 32 31 30
        movdqu      xmm7,       [rcx + rax*2]           ; 73 72 71 70
        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30

        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20

        movdqa      xmm1,       xmm0
        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        movdqa      xmm2,       xmm0
        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        movdqa      t0,         xmm0                    ; save to t0
        movdqa      t1,         xmm2                    ; save to t1

        lea         rsi,        [rsi + rax*8]
        lea         rdi,        [rsi + rax]
        lea         rdx,        [rsi + rax*4]
        lea         rcx,        [rdx + rax]

        movdqu      xmm4,       [rsi]                   ; 83 82 81 80
        movdqu      xmm1,       [rdx]                   ; c3 c2 c1 c0
        movdqu      xmm6,       [rdi]                   ; 93 92 91 90
        movdqu      xmm3,       [rcx]                   ; d3 d2 d1 d0
        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90

        movdqu      xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
        movdqu      xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
        movdqu      xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
        movdqu      xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
        punpckldq   xmm0,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
        punpckldq   xmm2,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0

        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
        punpcklbw   xmm0,       xmm2                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0

        movdqa      xmm1,       xmm4
        punpcklwd   xmm4,       xmm0                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
        punpckhwd   xmm1,       xmm0                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

        movdqa      xmm6,       xmm4
        punpckldq   xmm4,       xmm1                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
        punpckhdq   xmm6,       xmm1                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82

        movdqa      xmm0,       t0                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        movdqa      xmm2,       t1                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        movdqa      xmm1,       xmm0
        movdqa      xmm3,       xmm2

        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
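
        ; xmm0..xmm3 now hold p1, p0, q0 and q1 for all 16 rows, so the mask
        ; and filter below are identical to the horizontal-edge version.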

        ; calculate mask
        movdqa      xmm6,       xmm0                            ; p1
        movdqa      xmm7,       xmm3                            ; q1
        psubusb     xmm7,       xmm0                            ; q1-=p1
        psubusb     xmm6,       xmm3                            ; p1-=q1
        por         xmm6,       xmm7                            ; abs(p1-q1)
        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
        psrlw       xmm6,       1                               ; abs(p1-q1)/2

        movdqa      xmm5,       xmm1                            ; p0
        movdqa      xmm4,       xmm2                            ; q0
        psubusb     xmm5,       xmm2                            ; p0-=q0
        psubusb     xmm4,       xmm1                            ; q0-=p0
        por         xmm5,       xmm4                            ; abs(p0 - q0)
        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx,        arg(2)                          ; flimit
        movdqa      xmm7, XMMWORD PTR [rdx]
        mov         rdx,        arg(3)                          ; limit
        movdqa      xmm6, XMMWORD PTR [rdx]
        paddb       xmm7,        xmm7                           ; flimit*2 (less than 255)
        paddb       xmm7,        xmm6                           ; flimit * 2 + limit (less than 255)

        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        pxor        xmm7,        xmm7
        pcmpeqb     xmm5,        xmm7                           ; xmm5 = mask

        ; start work on filters
        movdqa        t0,        xmm0                           ; save unsigned p1 for the transpose back
        movdqa        t1,        xmm3                           ; save unsigned q1 for the transpose back

        pxor        xmm0,        [GLOBAL(t80)]                  ; p1 offset to convert to signed values
        pxor        xmm3,        [GLOBAL(t80)]                  ; q1 offset to convert to signed values

        psubsb      xmm0,        xmm3                           ; p1 - q1
        movdqa      xmm6,        xmm1                           ; p0

        movdqa      xmm7,        xmm2                           ; q0
        pxor        xmm6,        [GLOBAL(t80)]                  ; offset to convert to signed values

        pxor        xmm7,        [GLOBAL(t80)]                  ; offset to convert to signed values
        movdqa      xmm3,        xmm7                           ; signed q0 (copy)

        psubsb      xmm7,        xmm6                           ; q0 - p0
        paddsb      xmm0,        xmm7                           ; p1 - q1 + 1 * (q0 - p0)

        paddsb      xmm0,        xmm7                           ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm0,        xmm7                           ; p1 - q1 + 3 * (q0 - p0)

        pand        xmm5,        xmm0                           ; mask filter values we don't care about

        paddsb      xmm5,        [GLOBAL(t4)]                   ; 3* (q0 - p0) + (p1 - q1) + 4

        movdqa      xmm0,        xmm5                           ; get a copy of filters
        psllw       xmm0,        8                              ; shift left 8

        psraw       xmm0,        3                              ; arithmetic shift right 3
        psrlw       xmm0,        8

        movdqa      xmm7,        xmm5                           ; get a copy of filters
        psraw       xmm7,        11                             ; arithmetic shift right 11

        psllw       xmm7,        8                              ; shift left 8 to put it back
        por         xmm0,        xmm7                           ; put the two together to get result

        psubsb      xmm3,        xmm0                           ; q0 -= ((F + 4) >> 3)
        pxor        xmm3,        [GLOBAL(t80)]                  ; unoffset q0

        ; now do +3 side
        psubsb      xmm5,        [GLOBAL(t1s)]                  ; +3 instead of +4
        movdqa      xmm0,        xmm5                           ; get a copy of filters

        psllw       xmm0,        8                              ; shift left 8
        psraw       xmm0,        3                              ; arithmetic shift right 3

        psrlw       xmm0,        8
        psraw       xmm5,        11                             ; arithmetic shift right 11

        psllw       xmm5,        8                              ; shift left 8 to put it back
        por         xmm0,        xmm5                           ; put the two together to get result

        paddsb      xmm6,        xmm0                           ; p0 += ((F + 3) >> 3)
        pxor        xmm6,        [GLOBAL(t80)]                  ; unoffset p0

        movdqa      xmm0,        t0                             ; p1
        movdqa      xmm4,        t1                             ; q1

        ; transpose back to write out
        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        movdqa      xmm1,       xmm0
        punpcklbw   xmm0,       xmm6                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1,       xmm6                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm5,       xmm3
        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm2,       xmm0
        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40

        movdqa      xmm3,       xmm1
        punpcklwd   xmm1,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
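
        ; Each register now holds four complete 4-byte rows, written out below
        ; with movd followed by psrldq to step to the next row's bytes.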

        ; write out xmm1/xmm3 (rows 8-15) first, then xmm0/xmm2 (rows 0-7)
        lea         rdx,        [rsi + rax*4]

        movd        [rsi],      xmm1                               ; write the second 8-line result
        psrldq      xmm1,       4
        movd        [rdi],      xmm1
        psrldq      xmm1,       4
        movd        [rsi + rax*2], xmm1
        psrldq      xmm1,       4
        movd        [rdi + rax*2], xmm1

        movd        [rdx],      xmm3
        psrldq      xmm3,       4
        movd        [rcx],      xmm3
        psrldq      xmm3,       4
        movd        [rdx + rax*2], xmm3
        psrldq      xmm3,       4
        movd        [rcx + rax*2], xmm3

        neg         rax
        lea         rsi,        [rsi + rax*8]                      ; step back to the first 8 lines
        neg         rax
        lea         rdi,        [rsi + rax]
        lea         rdx,        [rsi + rax*4]
        lea         rcx,        [rdx + rax]

        movd        [rsi],      xmm0                                ; write the first 8-line result
        psrldq      xmm0,       4
        movd        [rdi],      xmm0
        psrldq      xmm0,       4
        movd        [rsi + rax*2], xmm0
        psrldq      xmm0,       4
        movd        [rdi + rax*2], xmm0

        movd        [rdx],      xmm2
        psrldq      xmm2,       4
        movd        [rcx],      xmm2
        psrldq      xmm2,       4
        movd        [rdx + rax*2], xmm2
        psrldq      xmm2,       4
        movd        [rcx + rax*2], xmm2

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
tfe:
    times 16 db 0xfe
align 16
t80:
    times 16 db 0x80
align 16
t1s:
    times 16 db 0x01
align 16
t3:
    times 16 db 0x03
align 16
t4:
    times 16 db 0x04
align 16
ones:
    times 8 dw 0x0001
align 16
s9:
    times 8 dw 0x0900
align 16
s63:
    times 8 dw 0x003f