;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


12%include "vpx_ports/x86_abi_support.asm"
13%define _t0 0
14%define _t1 _t0 + 16
15%define _p3 _t1 + 16
16%define _p2 _p3 + 16
17%define _p1 _p2 + 16
18%define _p0 _p1 + 16
19%define _q0 _p0 + 16
20%define _q1 _q0 + 16
21%define _q2 _q1 + 16
22%define _q3 _q2 + 16
23%define lf_var_size 160
24
25; Use of pmaxub instead of psubusb to compute filter mask was seen
26; in ffvp8
27
28%macro LFH_FILTER_AND_HEV_MASK 1
29%if %1
30        movdqa      xmm2,                   [rdi+2*rax]       ; q3
31        movdqa      xmm1,                   [rsi+2*rax]       ; q2
32        movdqa      xmm4,                   [rsi+rax]         ; q1
33        movdqa      xmm5,                   [rsi]             ; q0
34        neg         rax                     ; negate pitch to deal with above border
35%else
36        movlps      xmm2,                   [rsi + rcx*2]     ; q3
37        movlps      xmm1,                   [rsi + rcx]       ; q2
38        movlps      xmm4,                   [rsi]             ; q1
39        movlps      xmm5,                   [rsi + rax]       ; q0
40
41        movhps      xmm2,                   [rdi + rcx*2]
42        movhps      xmm1,                   [rdi + rcx]
43        movhps      xmm4,                   [rdi]
44        movhps      xmm5,                   [rdi + rax]
45
46        lea         rsi,                    [rsi + rax*4]
47        lea         rdi,                    [rdi + rax*4]
48
49        movdqa      [rsp+_q2],              xmm1              ; store q2
50        movdqa      [rsp+_q1],              xmm4              ; store q1
51%endif
52        movdqa      xmm7,                   [rdx]             ;limit
53
54        movdqa      xmm6,                   xmm1              ; q2
55        movdqa      xmm3,                   xmm4              ; q1
56
57        psubusb     xmm1,                   xmm2              ; q2-=q3
58        psubusb     xmm2,                   xmm6              ; q3-=q2
59
60        psubusb     xmm4,                   xmm6              ; q1-=q2
61        psubusb     xmm6,                   xmm3              ; q2-=q1
62
63        por         xmm4,                   xmm6              ; abs(q2-q1)
64        por         xmm1,                   xmm2              ; abs(q3-q2)
65
66        movdqa      xmm0,                   xmm5              ; q0
67        pmaxub      xmm1,                   xmm4
68
69        psubusb     xmm5,                   xmm3              ; q0-=q1
70        psubusb     xmm3,                   xmm0              ; q1-=q0
71
72        por         xmm5,                   xmm3              ; abs(q0-q1)
73        movdqa      [rsp+_t0],              xmm5              ; save to t0
74
75        pmaxub      xmm1,                   xmm5
76
77%if %1
78        movdqa      xmm2,                   [rsi+4*rax]       ; p3
79        movdqa      xmm4,                   [rdi+4*rax]       ; p2
80        movdqa      xmm6,                   [rsi+2*rax]       ; p1
81%else
82        movlps      xmm2,                   [rsi + rax]       ; p3
83        movlps      xmm4,                   [rsi]             ; p2
84        movlps      xmm6,                   [rsi + rcx]       ; p1
85
86        movhps      xmm2,                   [rdi + rax]
87        movhps      xmm4,                   [rdi]
88        movhps      xmm6,                   [rdi + rcx]
89
90        movdqa      [rsp+_p2],              xmm4              ; store p2
91        movdqa      [rsp+_p1],              xmm6              ; store p1
92%endif
93
94        movdqa      xmm5,                   xmm4              ; p2
95        movdqa      xmm3,                   xmm6              ; p1
96
97        psubusb     xmm4,                   xmm2              ; p2-=p3
98        psubusb     xmm2,                   xmm5              ; p3-=p2
99
100        psubusb     xmm3,                   xmm5              ; p1-=p2
101        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)
102
103        psubusb     xmm5,                   xmm6              ; p2-=p1
104        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)
105
106        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
107        movdqa      xmm2,                   xmm6              ; p1
108
109        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
110%if %1
111        movdqa      xmm4,                   [rsi+rax]         ; p0
112        movdqa      xmm3,                   [rdi]             ; q1
113%else
114        movlps      xmm4,                   [rsi + rcx*2]     ; p0
115        movhps      xmm4,                   [rdi + rcx*2]
116        movdqa      xmm3,                   [rsp+_q1]                ; q1
117%endif
118
119        movdqa      xmm5,                   xmm4              ; p0
120        psubusb     xmm4,                   xmm6              ; p0-=p1
121
122        psubusb     xmm6,                   xmm5              ; p1-=p0
123
124        por         xmm6,                   xmm4              ; abs(p1 - p0)
125        mov         rdx,                    arg(2)            ; get blimit
126
127        movdqa     [rsp+_t1],               xmm6              ; save to t1
128
129        movdqa      xmm4,                   xmm3              ; q1
130        pmaxub      xmm1,                   xmm6
131
132        psubusb     xmm3,                   xmm2              ; q1-=p1
133        psubusb     xmm2,                   xmm4              ; p1-=q1
134
135        psubusb     xmm1,                   xmm7
136        por         xmm2,                   xmm3              ; abs(p1-q1)
137
138        movdqa      xmm7,                   [rdx]             ; blimit
139        mov         rdx,                    arg(4)            ; hev get thresh
140
141        movdqa      xmm3,                   xmm0              ; q0
142        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero
143
144        movdqa      xmm6,                   xmm5              ; p0
145        psrlw       xmm2,                   1                 ; abs(p1-q1)/2
146
147        psubusb     xmm5,                   xmm3              ; p0-=q0
148        psubusb     xmm3,                   xmm6              ; q0-=p0
149        por         xmm5,                   xmm3              ; abs(p0 - q0)
150
151        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
152
153        movdqa      xmm4,                   [rsp+_t0]                ; hev get abs (q1 - q0)
154        movdqa      xmm3,                   [rsp+_t1]                ; get abs (p1 - p0)
155
156        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
157
158        movdqa      xmm2,                   [rdx]             ; hev
159
160        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
161        psubusb     xmm4,                   xmm2              ; hev
162
163        psubusb     xmm3,                   xmm2              ; hev
164        por         xmm1,                   xmm5
165
166        pxor        xmm7,                   xmm7
167        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
168
169        pcmpeqb     xmm4,                   xmm5              ; hev
170        pcmpeqb     xmm3,                   xmm3              ; hev
171
172        pcmpeqb     xmm1,                   xmm7              ; mask xmm1
173        pxor        xmm4,                   xmm3              ; hev
174%endmacro
175
176%macro B_FILTER 1
177        movdqa      xmm3,                   [GLOBAL(t80)]
178%if %1 == 0
179        movdqa      xmm2,                   [rsp+_p1]                ; p1
180        movdqa      xmm7,                   [rsp+_q1]                ; q1
181%elif %1 == 1
182        movdqa      xmm2,                   [rsi+2*rax]       ; p1
183        movdqa      xmm7,                   [rdi]             ; q1
184%elif %1 == 2
185        movdqa      xmm2,                   [rsp+_p1]         ; p1
186        movdqa      xmm6,                   [rsp+_p0]         ; p0
187        movdqa      xmm0,                   [rsp+_q0]         ; q0
188        movdqa      xmm7,                   [rsp+_q1]         ; q1
189%endif
190
191        pxor        xmm2,                   xmm3              ; p1 offset to convert to signed values
192        pxor        xmm7,                   xmm3              ; q1 offset to convert to signed values
193
194        psubsb      xmm2,                   xmm7              ; p1 - q1
195        pxor        xmm6,                   xmm3              ; offset to convert to signed values
196
197        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
198        pxor        xmm0,                   xmm3              ; offset to convert to signed values
199
200        movdqa      xmm3,                   xmm0              ; q0
201        psubsb      xmm0,                   xmm6              ; q0 - p0
202        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
203        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
204        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
205        pand        xmm1,                   xmm2              ; mask filter values we don't care about
206
207        movdqa      xmm2,                   xmm1
208        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
209        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
210
211        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
212        punpcklbw   xmm2,                   xmm2              ; exfxgxhx
213
214        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
215        psraw       xmm5,                   11                ; sign extended shift right by 3
216
217        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
218        psraw       xmm2,                   11                ; sign extended shift right by 3
219
220        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
221        psraw       xmm0,                   11                ; sign extended shift right by 3
222
223        psraw       xmm1,                   11                ; sign extended shift right by 3
224        movdqa      xmm5,                   xmm0              ; save results
225
226        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
227
228        paddsb      xmm6,                   xmm2              ; p0+= p0 add
229
230        movdqa      xmm2,                   [GLOBAL(ones)]
231        paddsw      xmm5,                   xmm2
232        paddsw      xmm1,                   xmm2
233        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap
234        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap
235        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
236        movdqa      xmm2,                   [GLOBAL(t80)]
237
238%if %1 == 0
239        movdqa      xmm1,                   [rsp+_p1]         ; p1
240        lea         rsi,                    [rsi + rcx*2]
241        lea         rdi,                    [rdi + rcx*2]
242%elif %1 == 1
243        movdqa      xmm1,                   [rsi+2*rax]       ; p1
244%elif %1 == 2
245        movdqa      xmm1,                   [rsp+_p1]         ; p1
246%endif
247
248        pandn       xmm4,                   xmm5              ; high edge variance additive
249        pxor        xmm6,                   xmm2              ; unoffset
250
251        pxor        xmm1,                   xmm2              ; reoffset
252        psubsb      xmm3,                   xmm0              ; q0-= q0 add
253
254        paddsb      xmm1,                   xmm4              ; p1+= p1 add
255        pxor        xmm3,                   xmm2              ; unoffset
256
257        pxor        xmm1,                   xmm2              ; unoffset
258        psubsb      xmm7,                   xmm4              ; q1-= q1 add
259
260        pxor        xmm7,                   xmm2              ; unoffset
261%if %1 == 0
262        movq        [rsi],                  xmm6              ; p0
263        movhps      [rdi],                  xmm6
264        movq        [rsi + rax],            xmm1              ; p1
265        movhps      [rdi + rax],            xmm1
266        movq        [rsi + rcx],            xmm3              ; q0
267        movhps      [rdi + rcx],            xmm3
268        movq        [rsi + rcx*2],          xmm7              ; q1
269        movhps      [rdi + rcx*2],          xmm7
270%elif %1 == 1
271        movdqa      [rsi+rax],              xmm6              ; write back
272        movdqa      [rsi+2*rax],            xmm1              ; write back
273        movdqa      [rsi],                  xmm3              ; write back
274        movdqa      [rdi],                  xmm7              ; write back
275%endif
276
277%endmacro
278
279%if ABI_IS_32BIT
280
281;void vp8_loop_filter_horizontal_edge_sse2
282;(
283;    unsigned char *src_ptr,
284;    int            src_pixel_step,
285;    const char    *blimit,
286;    const char    *limit,
287;    const char    *thresh,
288;)
289global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
290sym(vp8_loop_filter_horizontal_edge_sse2):
291    push        rbp
292    mov         rbp, rsp
293    SHADOW_ARGS_TO_STACK 5
294    SAVE_XMM 7
295    GET_GOT     rbx
296    push        rsi
297    push        rdi
298    ; end prolog
299
300    ALIGN_STACK 16, rax
301    sub         rsp, lf_var_size
302
303        mov         rsi,                    arg(0)           ;src_ptr
304        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step
305
306        mov         rdx,                    arg(3)           ;limit
307
308        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing
309
310        ; calculate breakout conditions and high edge variance
311        LFH_FILTER_AND_HEV_MASK 1
312        ; filter and write back the result
313        B_FILTER 1
314
315    add rsp, lf_var_size
316    pop rsp
317    ; begin epilog
318    pop rdi
319    pop rsi
320    RESTORE_GOT
321    RESTORE_XMM
322    UNSHADOW_ARGS
323    pop         rbp
324    ret
325
326%endif
327
328;void vp8_loop_filter_horizontal_edge_uv_sse2
329;(
330;    unsigned char *src_ptr,
331;    int            src_pixel_step,
332;    const char    *blimit,
333;    const char    *limit,
334;    const char    *thresh,
335;    int            count
336;)
337global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
338sym(vp8_loop_filter_horizontal_edge_uv_sse2):
339    push        rbp
340    mov         rbp, rsp
341    SHADOW_ARGS_TO_STACK 6
342    SAVE_XMM 7
343    GET_GOT     rbx
344    push        rsi
345    push        rdi
346    ; end prolog
347
348    ALIGN_STACK 16, rax
349    sub         rsp, lf_var_size
350
351        mov         rsi,                    arg(0)             ; u
352        mov         rdi,                    arg(5)             ; v
353        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
354        mov         rcx,                    rax
355        neg         rax                     ; negate pitch to deal with above border
356
357        mov         rdx,                    arg(3)             ;limit
358
359        lea         rsi,                    [rsi + rcx]
360        lea         rdi,                    [rdi + rcx]
361
362        ; calculate breakout conditions and high edge variance
363        LFH_FILTER_AND_HEV_MASK 0
364        ; filter and write back the result
365        B_FILTER 0
366
367    add rsp, lf_var_size
368    pop rsp
369    ; begin epilog
370    pop rdi
371    pop rsi
372    RESTORE_GOT
373    RESTORE_XMM
374    UNSHADOW_ARGS
375    pop         rbp
376    ret
377
378
379%macro MB_FILTER_AND_WRITEBACK 1
380        movdqa      xmm3,                   [GLOBAL(t80)]
381%if %1 == 0
382        movdqa      xmm2,                   [rsp+_p1]              ; p1
383        movdqa      xmm7,                   [rsp+_q1]              ; q1
384%elif %1 == 1
385        movdqa      xmm2,                   [rsi+2*rax]     ; p1
386        movdqa      xmm7,                   [rdi]           ; q1
387
388        mov         rcx,                    rax
389        neg         rcx
390%elif %1 == 2
391        movdqa      xmm2,                   [rsp+_p1]       ; p1
392        movdqa      xmm6,                   [rsp+_p0]       ; p0
393        movdqa      xmm0,                   [rsp+_q0]       ; q0
394        movdqa      xmm7,                   [rsp+_q1]       ; q1
395%endif
396
397        pxor        xmm2,                   xmm3            ; p1 offset to convert to signed values
398        pxor        xmm7,                   xmm3            ; q1 offset to convert to signed values
399        pxor        xmm6,                   xmm3            ; offset to convert to signed values
400        pxor        xmm0,                   xmm3            ; offset to convert to signed values
401
402        psubsb      xmm2,                   xmm7            ; p1 - q1
403
404        movdqa      xmm3,                   xmm0            ; q0
405        psubsb      xmm0,                   xmm6            ; q0 - p0
406        paddsb      xmm2,                   xmm0            ; 1 * (q0 - p0) + (p1 - q1)
407        paddsb      xmm2,                   xmm0            ; 2 * (q0 - p0)
408        paddsb      xmm2,                   xmm0            ; 3 * (q0 - p0) + (p1 - q1)
409        pand        xmm1,                   xmm2            ; mask filter values we don't care about
410
411        movdqa      xmm2,                   xmm1            ; vp8_filter
412
413        pand        xmm2,                   xmm4            ; Filter2 = vp8_filter & hev
414        pxor        xmm0,                   xmm0
415
416        pandn       xmm4,                   xmm1            ; vp8_filter&=~hev
417        pxor        xmm1,                   xmm1
418
419        punpcklbw   xmm0,                   xmm4            ; Filter 2 (hi)
420        punpckhbw   xmm1,                   xmm4            ; Filter 2 (lo)
421
422        movdqa      xmm5,                   xmm2
423
424        movdqa      xmm4,                   [GLOBAL(s9)]
425        paddsb      xmm5,                   [GLOBAL(t3)]    ; vp8_signed_char_clamp(Filter2 + 3)
426        paddsb      xmm2,                   [GLOBAL(t4)]    ; vp8_signed_char_clamp(Filter2 + 4)
427
428        pmulhw      xmm1,                   xmm4            ; Filter 2 (lo) * 9
429        pmulhw      xmm0,                   xmm4            ; Filter 2 (hi) * 9
430
431        punpckhbw   xmm7,                   xmm5            ; axbxcxdx
432        punpcklbw   xmm5,                   xmm5            ; exfxgxhx
433
434        psraw       xmm7,                   11              ; sign extended shift right by 3
435
436        psraw       xmm5,                   11              ; sign extended shift right by 3
437        punpckhbw   xmm4,                   xmm2            ; axbxcxdx
438
439        punpcklbw   xmm2,                   xmm2            ; exfxgxhx
440        psraw       xmm4,                   11              ; sign extended shift right by 3
441
442        packsswb    xmm5,                   xmm7            ; Filter2 >>=3;
443        psraw       xmm2,                   11              ; sign extended shift right by 3
444
445        packsswb    xmm2,                   xmm4            ; Filter1 >>=3;
446
447        paddsb      xmm6,                   xmm5            ; ps0 =ps0 + Fitler2
448
449        psubsb      xmm3,                   xmm2            ; qs0 =qs0 - Filter1
450        movdqa      xmm7,                   xmm1
451
452        movdqa      xmm4,                   [GLOBAL(s63)]
453        movdqa      xmm5,                   xmm0
454        movdqa      xmm2,                   xmm5
455        paddw       xmm0,                   xmm4            ; Filter 2 (hi) * 9 + 63
456        paddw       xmm1,                   xmm4            ; Filter 2 (lo) * 9 + 63
457        movdqa      xmm4,                   xmm7
458
459        paddw       xmm5,                   xmm5            ; Filter 2 (hi) * 18
460
461        paddw       xmm7,                   xmm7            ; Filter 2 (lo) * 18
462        paddw       xmm5,                   xmm0            ; Filter 2 (hi) * 27 + 63
463
464        paddw       xmm7,                   xmm1            ; Filter 2 (lo) * 27 + 63
465        paddw       xmm2,                   xmm0            ; Filter 2 (hi) * 18 + 63
466        psraw       xmm0,                   7               ; (Filter 2 (hi) * 9 + 63) >> 7
467
468        paddw       xmm4,                   xmm1            ; Filter 2 (lo) * 18 + 63
469        psraw       xmm1,                   7               ; (Filter 2 (lo) * 9 + 63) >> 7
470        psraw       xmm2,                   7               ; (Filter 2 (hi) * 18 + 63) >> 7
471
472        packsswb    xmm0,                   xmm1            ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
473
474        psraw       xmm4,                   7               ; (Filter 2 (lo) * 18 + 63) >> 7
475        psraw       xmm5,                   7               ; (Filter 2 (hi) * 27 + 63) >> 7
476        psraw       xmm7,                   7               ; (Filter 2 (lo) * 27 + 63) >> 7
477
478        packsswb    xmm5,                   xmm7            ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
479        packsswb    xmm2,                   xmm4            ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
480        movdqa      xmm7,                   [GLOBAL(t80)]
481
482%if %1 == 0
483        movdqa      xmm1,                   [rsp+_q1]       ; q1
484        movdqa      xmm4,                   [rsp+_p1]       ; p1
485        lea         rsi,                    [rsi+rcx*2]
486        lea         rdi,                    [rdi+rcx*2]
487
488%elif %1 == 1
489        movdqa      xmm1,                   [rdi]           ; q1
490        movdqa      xmm4,                   [rsi+rax*2]     ; p1
491%elif %1 == 2
492        movdqa      xmm4,                   [rsp+_p1]       ; p1
493        movdqa      xmm1,                   [rsp+_q1]       ; q1
494%endif
495
496        pxor        xmm1,                   xmm7
497        pxor        xmm4,                   xmm7
498
499        psubsb      xmm3,                   xmm5            ; sq = vp8_signed_char_clamp(qs0 - u3)
500        paddsb      xmm6,                   xmm5            ; sp = vp8_signed_char_clamp(ps0 - u3)
501        psubsb      xmm1,                   xmm2            ; sq = vp8_signed_char_clamp(qs1 - u2)
502        paddsb      xmm4,                   xmm2            ; sp = vp8_signed_char_clamp(ps1 - u2)
503
504%if %1 == 1
505        movdqa      xmm2,                   [rdi+rax*4]     ; p2
506        movdqa      xmm5,                   [rdi+rcx]       ; q2
507%else
508        movdqa      xmm2,                   [rsp+_p2]       ; p2
509        movdqa      xmm5,                   [rsp+_q2]       ; q2
510%endif
511
512        pxor        xmm1,                   xmm7            ; *oq1 = sq^0x80;
513        pxor        xmm4,                   xmm7            ; *op1 = sp^0x80;
514        pxor        xmm2,                   xmm7
515        pxor        xmm5,                   xmm7
516        paddsb      xmm2,                   xmm0            ; sp = vp8_signed_char_clamp(ps2 - u)
517        psubsb      xmm5,                   xmm0            ; sq = vp8_signed_char_clamp(qs2 - u)
518        pxor        xmm2,                   xmm7            ; *op2 = sp^0x80;
519        pxor        xmm5,                   xmm7            ; *oq2 = sq^0x80;
520        pxor        xmm3,                   xmm7            ; *oq0 = sq^0x80
521        pxor        xmm6,                   xmm7            ; *oq0 = sp^0x80
522%if %1 == 0
523        movq        [rsi],                  xmm6            ; p0
524        movhps      [rdi],                  xmm6
525        movq        [rsi + rcx],            xmm3            ; q0
526        movhps      [rdi + rcx],            xmm3
527        lea         rdx,                    [rcx + rcx*2]
528        movq        [rsi+rcx*2],            xmm1            ; q1
529        movhps      [rdi+rcx*2],            xmm1
530
531        movq        [rsi + rax],            xmm4            ; p1
532        movhps      [rdi + rax],            xmm4
533
534        movq        [rsi+rax*2],            xmm2            ; p2
535        movhps      [rdi+rax*2],            xmm2
536
537        movq        [rsi+rdx],              xmm5            ; q2
538        movhps      [rdi+rdx],              xmm5
539%elif %1 == 1
540        movdqa      [rdi+rcx],              xmm5            ; q2
541        movdqa      [rdi],                  xmm1            ; q1
542        movdqa      [rsi],                  xmm3            ; q0
543        movdqa      [rsi+rax  ],            xmm6            ; p0
544        movdqa      [rsi+rax*2],            xmm4            ; p1
545        movdqa      [rdi+rax*4],            xmm2            ; p2
546%elif %1 == 2
547        movdqa      [rsp+_p1],              xmm4            ; p1
548        movdqa      [rsp+_p0],              xmm6            ; p0
549        movdqa      [rsp+_q0],              xmm3            ; q0
550        movdqa      [rsp+_q1],              xmm1            ; q1
551%endif
552
553%endmacro
554
555
556;void vp8_mbloop_filter_horizontal_edge_sse2
557;(
558;    unsigned char *src_ptr,
559;    int            src_pixel_step,
560;    const char    *blimit,
561;    const char    *limit,
562;    const char    *thresh,
563;)
564global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
565sym(vp8_mbloop_filter_horizontal_edge_sse2):
566    push        rbp
567    mov         rbp, rsp
568    SHADOW_ARGS_TO_STACK 5
569    SAVE_XMM 7
570    GET_GOT     rbx
571    push        rsi
572    push        rdi
573    ; end prolog
574
575    ALIGN_STACK 16, rax
576    sub         rsp, lf_var_size
577
578        mov         rsi,                    arg(0)            ;src_ptr
579        movsxd      rax,                    dword ptr arg(1)  ;src_pixel_step
580        mov         rdx,                    arg(3)            ;limit
581
582        lea         rdi,                    [rsi+rax]         ; rdi points to row +1 for indirect addressing
583
584        ; calculate breakout conditions and high edge variance
585        LFH_FILTER_AND_HEV_MASK 1
586        ; filter and write back the results
587        MB_FILTER_AND_WRITEBACK 1
588
589    add rsp, lf_var_size
590    pop rsp
591    ; begin epilog
592    pop rdi
593    pop rsi
594    RESTORE_GOT
595    RESTORE_XMM
596    UNSHADOW_ARGS
597    pop         rbp
598    ret
599
600
601;void vp8_mbloop_filter_horizontal_edge_uv_sse2
602;(
603;    unsigned char *u,
604;    int            src_pixel_step,
605;    const char    *blimit,
606;    const char    *limit,
607;    const char    *thresh,
608;    unsigned char *v
609;)
610global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
611sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
612    push        rbp
613    mov         rbp, rsp
614    SHADOW_ARGS_TO_STACK 6
615    SAVE_XMM 7
616    GET_GOT     rbx
617    push        rsi
618    push        rdi
619    ; end prolog
620
621    ALIGN_STACK 16, rax
622    sub         rsp, lf_var_size
623
624        mov         rsi,                    arg(0)             ; u
625        mov         rdi,                    arg(5)             ; v
626        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
627        mov         rcx,                    rax
628        neg         rax                     ; negate pitch to deal with above border
629        mov         rdx,                    arg(3)             ;limit
630
631        lea         rsi,                    [rsi + rcx]
632        lea         rdi,                    [rdi + rcx]
633
634        ; calculate breakout conditions and high edge variance
635        LFH_FILTER_AND_HEV_MASK 0
636        ; filter and write back the results
637        MB_FILTER_AND_WRITEBACK 0
638
639    add rsp, lf_var_size
640    pop rsp
641    ; begin epilog
642    pop rdi
643    pop rsi
644    RESTORE_GOT
645    RESTORE_XMM
646    UNSHADOW_ARGS
647    pop         rbp
648    ret
649
650
651%macro TRANSPOSE_16X8 2
652        movq        xmm4,               [rsi]           ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
653        movq        xmm1,               [rdi]           ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
654        movq        xmm0,               [rsi+2*rax]     ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
655        movq        xmm7,               [rdi+2*rax]     ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
656        movq        xmm5,               [rsi+4*rax]     ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
657        movq        xmm2,               [rdi+4*rax]     ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
658
659        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
660
661        movq        xmm1,               [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
662
663        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
664        punpcklbw   xmm0,               xmm7            ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20
665
666        movq        xmm7,               [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
667
668        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
669%if %1
670        lea         rsi,                [rsi+rax*8]
671        lea         rdi,                [rdi+rax*8]
672%else
673        mov         rsi,                arg(5)          ; v_ptr
674%endif
675
676        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
677        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
678        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
679        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
680        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
681
682%if %1 == 0
683        lea         rdi,                [rsi + rax - 4] ; rdi points to row +1 for indirect addressing
684        lea         rsi,                [rsi - 4]
685%endif
686
687        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
688        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
689
690        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
691        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
692
693        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
694
695        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
696
697        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
698
699        movdqa      [rsp+_t0],          xmm2            ; save to free XMM2
700
701        movq        xmm2,               [rsi]           ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
702        movq        xmm6,               [rdi]           ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
703        movq        xmm0,               [rsi+2*rax]     ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
704        movq        xmm5,               [rdi+2*rax]     ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
705        movq        xmm1,               [rsi+4*rax]     ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
706
707        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
708
709        movq        xmm6,               [rdi+4*rax]     ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
710
711        punpcklbw   xmm0,               xmm5            ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
712
713        movq        xmm5,               [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
714
715        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0
716
717        movq        xmm6,               [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
718
719        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
720
721        movdqa      xmm6,               xmm1            ;
722        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
723
724        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
725        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
726
727        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
728
729        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
730
731        movdqa      xmm0,               xmm5
732        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
733
734        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
735        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
736
737        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
738
739        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
740        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
741
742        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
743
744        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
745
746%if %2 == 0
747        movdqa      [rsp+_q3],          xmm7            ; save 7
748        movdqa      [rsp+_q2],          xmm6            ; save 6
749%endif
750        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
751        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
752        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
753        movdqa      [rsp+_p1],          xmm2            ; save 2
754
755        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
756        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
757        movdqa      [rsp+_p0],          xmm3            ; save 3
758
759        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
760
761        movdqa      [rsp+_q0],          xmm4            ; save 4
762        movdqa      [rsp+_q1],          xmm5            ; save 5
763        movdqa      xmm1,               [rsp+_t0]
764
765        movdqa      xmm2,               xmm1            ;
766        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
767        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
768
769%if %2 == 0
770        movdqa      [rsp+_p2],          xmm1
771        movdqa      [rsp+_p3],          xmm2
772%endif
773
774%endmacro
775
;-----------------------------------------------------------------------
; LFV_FILTER_MASK_HEV_MASK
; Build the loop-filter mask and the high-edge-variance (hev) mask for
; a vertical edge from the eight transposed pixel rows left by the
; transpose step: xmm1 = p2, xmm2 = p3, xmm3 = p0, xmm5 = q1,
; xmm6 = q2, xmm7 = q3, with p1/q0/q1 spilled at [rsp+_p1], [rsp+_q0],
; [rsp+_q1].  Reads arg(2) = blimit, arg(3) = limit, arg(4) = thresh
; (each a pointer to a 16-byte vector).
; Out:  xmm1 = filter mask (0xFF where the edge should be filtered)
;       xmm4 = hev mask    (0xFF where high edge variance)
; Clobbers xmm0-xmm7 and rdx.
;-----------------------------------------------------------------------
%macro LFV_FILTER_MASK_HEV_MASK 0
        movdqa      xmm0,               xmm6            ; q2
        psubusb     xmm0,               xmm7            ; q2-q3

        psubusb     xmm7,               xmm6            ; q3-q2
        movdqa      xmm4,               xmm5            ; q1

        por         xmm7,               xmm0            ; abs (q3-q2)
        psubusb     xmm4,               xmm6            ; q1-q2

        movdqa      xmm0,               xmm1
        psubusb     xmm6,               xmm5            ; q2-q1

        por         xmm6,               xmm4            ; abs (q2-q1)
        psubusb     xmm0,               xmm2            ; p2 - p3;

        psubusb     xmm2,               xmm1            ; p3 - p2;
        por         xmm0,               xmm2            ; abs(p2-p3)

        movdqa      xmm5,               [rsp+_p1]       ; p1
        pmaxub      xmm0,               xmm7

        movdqa      xmm2,               xmm5            ; p1
        psubusb     xmm5,               xmm1            ; p1-p2
        psubusb     xmm1,               xmm2            ; p2-p1

        movdqa      xmm7,               xmm3            ; p0
        psubusb     xmm7,               xmm2            ; p0-p1

        por         xmm1,               xmm5            ; abs(p2-p1)
        pmaxub      xmm0,               xmm6

        pmaxub      xmm0,               xmm1
        movdqa      xmm1,               xmm2            ; p1

        psubusb     xmm2,               xmm3            ; p1-p0

        por         xmm2,               xmm7            ; abs(p1-p0)

        pmaxub      xmm0,               xmm2            ; running max of all inner abs diffs

        movdqa      xmm5,               [rsp+_q0]       ; q0
        movdqa      xmm7,               [rsp+_q1]       ; q1

        mov         rdx,                arg(3)          ; limit

        movdqa      xmm6,               xmm5            ; q0
        movdqa      xmm4,               xmm7            ; q1

        psubusb     xmm5,               xmm7            ; q0-q1
        psubusb     xmm7,               xmm6            ; q1-q0

        por         xmm7,               xmm5            ; abs(q1-q0)

        pmaxub      xmm0,               xmm7

        psubusb     xmm0,               [rdx]           ; nonzero where max abs diff > limit

        mov         rdx,                arg(2)          ; blimit
        movdqa      xmm5,               xmm4            ; q1

        psubusb     xmm5,               xmm1            ; q1-=p1
        psubusb     xmm1,               xmm4            ; p1-=q1

        por         xmm5,               xmm1            ; abs(p1-q1)
        movdqa      xmm1,               xmm3            ; p0

        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
        psubusb     xmm1,               xmm6            ; p0-q0

        movdqa      xmm4,               [rdx]           ; blimit
        mov         rdx,                arg(4)          ; get thresh

        psrlw       xmm5,               1               ; abs(p1-q1)/2 (word shift is safe: tfe cleared each lsb)
        psubusb     xmm6,               xmm3            ; q0-p0

        por         xmm1,               xmm6            ; abs(q0-p0)
        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2
        movdqa      xmm3,               [rdx]           ; thresh

        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm2,               xmm3            ; abs(p1 - p0) > thresh

        psubusb     xmm7,               xmm3            ; abs(q1 - q0) > thresh

        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        por         xmm2,               xmm7            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        por         xmm1,               xmm0            ; mask
        pcmpeqb     xmm2,               xmm0            ; xmm0 == 0 wherever the mask can pass, so this acts
                                                        ; as a compare with zero; elsewhere hev is don't-care

        pxor        xmm0,               xmm0
        pcmpeqb     xmm4,               xmm4            ; all-ones

        pcmpeqb     xmm1,               xmm0            ; mask: 0xFF where every check passed
        pxor        xmm4,               xmm2            ; hev mask: 0xFF where high edge variance
%endmacro
873
;-----------------------------------------------------------------------
; BV_TRANSPOSE
; Re-transpose the four filtered rows (p1, p0, q0, q1; columns 2-5 of
; the 8-wide block) back into per-pixel order: each output register
; holds four 4-byte strips (p1 p0 q0 q1) for four consecutive rows,
; ready for BV_WRITEBACK.
; In:  xmm1 = p1 row, xmm6 = p0 row, xmm3 = q0 row, xmm7 = q1 row
;      (byte n of each input belongs to screen row n).
; Out: xmm2 = rows 0-3, xmm6 = rows 4-7, xmm1 = rows 8-b, xmm5 = rows c-f.
;-----------------------------------------------------------------------
%macro BV_TRANSPOSE 0
        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro
903
;-----------------------------------------------------------------------
; BV_WRITEBACK
; Store the strips produced by BV_TRANSPOSE: %1 holds rows 0-3 and
; %2 holds rows 4-7, one dword (p1 p0 q0 q1) per row, written at
; column offset +2 of each row.  Expects rsi = row 0 pointer,
; rdi = row 1 pointer, rax = pitch, rcx = 3*pitch.
; Both %1 and %2 are destroyed (shifted down as the dwords are stored).
;-----------------------------------------------------------------------
%macro BV_WRITEBACK 2
        movd        [rsi+2],            %1              ; row 0
        movd        [rsi+4*rax+2],      %2              ; row 4
        psrldq      %1,                 4
        psrldq      %2,                 4
        movd        [rdi+2],            %1              ; row 1
        movd        [rdi+4*rax+2],      %2              ; row 5
        psrldq      %1,                 4
        psrldq      %2,                 4
        movd        [rsi+2*rax+2],      %1              ; row 2
        movd        [rsi+2*rcx+2],      %2              ; row 6 (2*rcx = 6*pitch)
        psrldq      %1,                 4
        psrldq      %2,                 4
        movd        [rdi+2*rax+2],      %1              ; row 3
        movd        [rdi+2*rcx+2],      %2              ; row 7
%endmacro
920
921%if ABI_IS_32BIT
922
923;void vp8_loop_filter_vertical_edge_sse2
924;(
925;    unsigned char *src_ptr,
926;    int            src_pixel_step,
927;    const char    *blimit,
928;    const char    *limit,
;    const char    *thresh
;)
; Filter the vertical edge between columns 3 and 4 of a 16-row strip.
; Only compiled for the 32-bit ABI (see the enclosing %if ABI_IS_32BIT).
global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub             rsp, lf_var_size                    ; scratch for the 8 transposed rows

        mov         rsi,        arg(0)                  ; src_ptr
        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

        lea         rsi,        [rsi - 4]               ; back up 4 cols so the edge sits mid-block
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        lea         rcx,        [rax*2+rax]             ; rcx = 3 * pitch

        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 1, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE
        ; store 16-line result

        lea         rdx,        [rax]                   ; rdx = pitch
        neg         rdx                                 ; rdx = -pitch, for stepping back 8 rows

        BV_WRITEBACK xmm1, xmm5                         ; rows 8-f

        lea         rsi,        [rsi+rdx*8]             ; rewind 8 rows
        lea         rdi,        [rdi+rdx*8]
        BV_WRITEBACK xmm2, xmm6                         ; rows 0-7

    add rsp, lf_var_size
    pop rsp                                             ; undo ALIGN_STACK (it pushed the old rsp)
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
984
985%endif
986
987;void vp8_loop_filter_vertical_edge_uv_sse2
988;(
989;    unsigned char *u,
990;    int            src_pixel_step,
991;    const char    *blimit,
992;    const char    *limit,
993;    const char    *thresh,
994;    unsigned char *v
995;)
; Filter the vertical edge of paired u/v chroma blocks: u rows occupy
; transposed rows 0-7 and v rows - arg(5) - rows 8-f, so one 16-wide
; filter pass handles both planes.
global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub             rsp, lf_var_size                    ; scratch for the 8 transposed rows

        mov         rsi,        arg(0)                  ; u_ptr
        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

        lea         rsi,        [rsi - 4]               ; back up 4 cols so the edge sits mid-block
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        lea         rcx,        [rax+2*rax]             ; rcx = 3 * pitch

        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 0, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE

        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing

        ; store 16-line result
        BV_WRITEBACK xmm1, xmm5                         ; rows 8-f -> v plane (rsi was left at v_ptr-4 by the transpose)

        mov         rsi,        arg(0)                  ; u_ptr
        lea         rsi,        [rsi - 4]
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        BV_WRITEBACK xmm2, xmm6                         ; rows 0-7 -> u plane

    add rsp, lf_var_size
    pop rsp                                             ; undo ALIGN_STACK (it pushed the old rsp)
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1049
;-----------------------------------------------------------------------
; MBV_TRANSPOSE
; First stage of re-transposing all 8 filtered rows (p3..q3) back into
; pixel order after the macroblock filter.  Per the byte-lane comments:
; xmm2 = p2 row, xmm5 = q2 row arrive in registers (left by
; MB_FILTER_AND_WRITEBACK - confirm against that macro); the remaining
; rows are read from their stack slots.
; Out: xmm0 = rows 0-1, xmm6 = rows 2-3 (ready to store);
;      xmm3/xmm2 hold cols 0-3 / 4-7 of rows 4-7 and xmm1/xmm4 the
;      staged data for rows 8-f, consumed by MBV_WRITEBACK_1/_2.
;-----------------------------------------------------------------------
%macro MBV_TRANSPOSE 0
        movdqa      xmm0,               [rsp+_p3]           ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        movdqa      xmm1,               xmm0                ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        punpcklbw   xmm0,               xmm2                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1,               xmm2                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm7,               [rsp+_p1]           ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      xmm6,               xmm7                ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpcklbw   xmm7,               [rsp+_p0]           ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm6,               [rsp+_p0]           ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm3,               xmm0                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpcklwd   xmm0,               xmm7                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckhwd   xmm3,               xmm7                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        movdqa      xmm4,               xmm1                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        punpcklwd   xmm1,               xmm6                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm4,               xmm6                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        movdqa      xmm7,               [rsp+_q0]           ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpcklbw   xmm7,               [rsp+_q1]           ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        movdqa      xmm6,               xmm5                ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
        punpcklbw   xmm6,               [rsp+_q3]           ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06

        movdqa      xmm2,               xmm7                ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
        punpcklwd   xmm7,               xmm6                ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04

        punpckhwd   xmm2,               xmm6                ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
        movdqa      xmm6,               xmm0                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckldq   xmm0,               xmm7                ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
        punpckhdq   xmm6,               xmm7                ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro
1087
;-----------------------------------------------------------------------
; MBV_WRITEBACK_1
; Store rows 0-7 (8 filtered bytes per row) and finish transposing
; rows 8-b / c-f into xmm1 / xmm5 for MBV_WRITEBACK_2.  Expects
; rsi = row 0, rdi = row 1, rax = pitch, rcx = 3*pitch, plus the
; registers staged by MBV_TRANSPOSE.
;-----------------------------------------------------------------------
%macro MBV_WRITEBACK_1 0
        movq        [rsi],              xmm0                ; row 0
        movhps      [rdi],              xmm0                ; row 1

        movq        [rsi+2*rax],        xmm6                ; row 2
        movhps      [rdi+2*rax],        xmm6                ; row 3

        movdqa      xmm0,               xmm3                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        punpckldq   xmm0,               xmm2                ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
        punpckhdq   xmm3,               xmm2                ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60

        movq        [rsi+4*rax],        xmm0                ; row 4
        movhps      [rdi+4*rax],        xmm0                ; row 5

        movq        [rsi+2*rcx],        xmm3                ; row 6
        movhps      [rdi+2*rcx],        xmm3                ; row 7

        movdqa      xmm7,               [rsp+_q0]           ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm7,               [rsp+_q1]           ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
        punpckhbw   xmm5,               [rsp+_q3]           ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86

        movdqa      xmm0,               xmm7
        punpcklwd   xmm0,               xmm5                ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
        punpckhwd   xmm7,               xmm5                ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4

        movdqa      xmm5,               xmm1                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckldq   xmm1,               xmm0                ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
        punpckhdq   xmm5,               xmm0                ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
%endmacro
1117
;-----------------------------------------------------------------------
; MBV_WRITEBACK_2
; Store rows 8-f (8 filtered bytes per row).  xmm1/xmm5 come from
; MBV_WRITEBACK_1, xmm4/xmm7 from MBV_TRANSPOSE/MBV_WRITEBACK_1.
; Expects rsi = row 8, rdi = row 9, rax = pitch, rcx = 3*pitch.
;-----------------------------------------------------------------------
%macro MBV_WRITEBACK_2 0
        movq        [rsi],              xmm1                ; row 8
        movhps      [rdi],              xmm1                ; row 9

        movq        [rsi+2*rax],        xmm5                ; row a
        movhps      [rdi+2*rax],        xmm5                ; row b

        movdqa      xmm1,               xmm4                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
        punpckldq   xmm1,               xmm7                ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
        punpckhdq   xmm4,               xmm7                ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0

        movq        [rsi+4*rax],        xmm1                ; row c
        movhps      [rdi+4*rax],        xmm1                ; row d

        movq        [rsi+2*rcx],        xmm4                ; row e
        movhps      [rdi+2*rcx],        xmm4                ; row f
%endmacro
1135
1136
1137;void vp8_mbloop_filter_vertical_edge_sse2
1138;(
1139;    unsigned char *src_ptr,
1140;    int            src_pixel_step,
1141;    const char    *blimit,
1142;    const char    *limit,
1143;    const char    *thresh,
1144;)
; Macroblock (strong) filter for the vertical edge between columns
; 3 and 4 of a 16-row strip.
global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, lf_var_size                       ; scratch for the 8 transposed rows

        mov         rsi,                arg(0)              ; src_ptr
        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step

        lea         rsi,                [rsi - 4]           ; back up 4 cols so the edge sits mid-block
        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
        lea         rcx,                [rax*2+rax]         ; rcx = 3 * pitch

        ; Transpose
        TRANSPOSE_16X8 1, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        neg         rax                                     ; negate pitch (restored below)
        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        lea         rsi,                [rsi+rax*8]         ; rax < 0 here: step back 8 rows
        lea         rdi,                [rdi+rax*8]

        ; transpose and write back
        MBV_TRANSPOSE

        neg         rax                                     ; restore positive pitch

        MBV_WRITEBACK_1                                     ; rows 0-7


        lea         rsi,                [rsi+rax*8]         ; advance 8 rows
        lea         rdi,                [rdi+rax*8]
        MBV_WRITEBACK_2                                     ; rows 8-f

    add rsp, lf_var_size
    pop rsp                                                 ; undo ALIGN_STACK (it pushed the old rsp)
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1201
1202
1203;void vp8_mbloop_filter_vertical_edge_uv_sse2
1204;(
1205;    unsigned char *u,
1206;    int            src_pixel_step,
1207;    const char    *blimit,
1208;    const char    *limit,
1209;    const char    *thresh,
1210;    unsigned char *v
1211;)
; Macroblock (strong) filter for the vertical edge of paired u/v
; chroma blocks: u rows occupy transposed rows 0-7, v rows - arg(5) -
; rows 8-f, so one 16-wide filter pass handles both planes.
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, lf_var_size                       ; scratch for the 8 transposed rows

        mov         rsi,                arg(0)              ; u_ptr
        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step

        lea         rsi,                [rsi - 4]           ; back up 4 cols so the edge sits mid-block
        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
        lea         rcx,                [rax+2*rax]         ; rcx = 3 * pitch

        ; Transpose
        TRANSPOSE_16X8 0, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        ; transpose and write back
        MBV_TRANSPOSE

        mov         rsi,                arg(0)             ;u_ptr
        lea         rsi,                [rsi - 4]
        lea         rdi,                [rsi + rax]
        MBV_WRITEBACK_1                                    ; rows 0-7 -> u plane
        mov         rsi,                arg(5)             ;v_ptr
        lea         rsi,                [rsi - 4]
        lea         rdi,                [rsi + rax]
        MBV_WRITEBACK_2                                    ; rows 8-f -> v plane

    add rsp, lf_var_size
    pop rsp                                                ; undo ALIGN_STACK (it pushed the old rsp)
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1264
1265
1266;void vp8_loop_filter_simple_horizontal_edge_sse2
1267;(
1268;    unsigned char *src_ptr,
1269;    int  src_pixel_step,
;    const char *blimit
;)
; Simple (2-tap) filter across a horizontal edge: adjusts only p0/q0
; using the blimit threshold; no limit/thresh/hev logic.
global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    ; end prolog

        mov         rcx, arg(0)             ;src_ptr
        movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?
        movdqa      xmm6, [GLOBAL(tfe)]     ; 0xfe per byte: clears lsbs before the /2 word shift
        lea         rdx, [rcx + rax]        ; row below the edge (q1)
        neg         rax                     ; negative pitch: p-rows are at rcx + k*rax

        ; calculate mask
        movdqa      xmm0, [rdx]             ; q1
        mov         rdx, arg(2)             ;blimit
        movdqa      xmm1, [rcx+2*rax]       ; p1

        movdqa      xmm2, xmm1              ; keep p1 for the filter step
        movdqa      xmm3, xmm0              ; keep q1 for the filter step

        psubusb     xmm0, xmm1              ; q1-=p1
        psubusb     xmm1, xmm3              ; p1-=q1
        por         xmm1, xmm0              ; abs(p1-q1)
        pand        xmm1, xmm6              ; set lsb of each byte to zero
        psrlw       xmm1, 1                 ; abs(p1-q1)/2 (word shift safe: lsbs cleared)

        movdqa      xmm7, XMMWORD PTR [rdx] ; blimit

        movdqa      xmm5, [rcx+rax]         ; p0
        movdqa      xmm4, [rcx]             ; q0
        movdqa      xmm0, xmm4              ; q0 (kept for the filter step)
        movdqa      xmm6, xmm5              ; p0 (kept for the filter step)
        psubusb     xmm5, xmm4              ; p0-=q0
        psubusb     xmm4, xmm6              ; q0-=p0
        por         xmm5, xmm4              ; abs(p0 - q0)

        movdqa      xmm4, [GLOBAL(t80)]     ; 0x80 per byte: bias for signed-byte arithmetic

        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm5, xmm7              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
        pxor        xmm7, xmm7
        pcmpeqb     xmm5, xmm7              ; mask: 0xFF where within blimit (filter applies)


        ; start work on filters
        pxor        xmm2, xmm4     ; p1 offset to convert to signed values
        pxor        xmm3, xmm4     ; q1 offset to convert to signed values
        psubsb      xmm2, xmm3              ; p1 - q1

        pxor        xmm6, xmm4     ; offset to convert to signed values
        pxor        xmm0, xmm4     ; offset to convert to signed values
        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0
        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5, xmm2              ; mask filter values we don't care about

        movdqa      xmm0, xmm5
        paddsb      xmm5,        [GLOBAL(t3)]                  ; Filter2: 3 * (q0 - p0) + (p1 - q1) + 3, for p0
        paddsb      xmm0,        [GLOBAL(t4)]                  ; Filter1: 3 * (q0 - p0) + (p1 - q1) + 4, for q0

        movdqa      xmm1, [GLOBAL(te0)]     ; 0xe0 per byte: sign-fill bits for the emulated byte shift
        movdqa      xmm2, [GLOBAL(t1f)]     ; 0x1f per byte: valid bits of a byte after >> 3

        ; SSE2 has no per-byte arithmetic shift; emulate (byte >> 3)
        ; with a word shift, a mask, and an explicit sign restore.
;        pxor        xmm7, xmm7             ; not needed: xmm7 is still zero from the mask compare above
        pcmpgtb     xmm7, xmm0              ;save sign
        pand        xmm7, xmm1              ;preserve the upper 3 bits
        psrlw       xmm0, 3
        pand        xmm0, xmm2              ;clear out upper 3 bits
        por         xmm0, xmm7              ;add sign
        psubsb      xmm3, xmm0              ; q0-= q0sz add

        pxor        xmm7, xmm7
        pcmpgtb     xmm7, xmm5              ;save sign
        pand        xmm7, xmm1              ;preserve the upper 3 bits
        psrlw       xmm5, 3
        pand        xmm5, xmm2              ;clear out upper 3 bits
        por         xmm5, xmm7              ;add sign
        paddsb      xmm6, xmm5              ; p0+= p0 add

        pxor        xmm3, xmm4     ; unoffset
        movdqa      [rcx], xmm3             ; write back q0

        pxor        xmm6, xmm4     ; unoffset
        movdqa      [rcx+rax], xmm6         ; write back p0

    ; begin epilog
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1369
1370
1371;void vp8_loop_filter_simple_vertical_edge_sse2
1372;(
1373;    unsigned char *src_ptr,
1374;    int  src_pixel_step,
1375;    const char *blimit,
1376;)
1377global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
1378sym(vp8_loop_filter_simple_vertical_edge_sse2):
1379    push        rbp         ; save old base pointer value.
1380    mov         rbp, rsp    ; set new base pointer value.
1381    SHADOW_ARGS_TO_STACK 3
1382    SAVE_XMM 7
1383    GET_GOT     rbx         ; save callee-saved reg
1384    push        rsi
1385    push        rdi
1386    ; end prolog
1387
1388    ALIGN_STACK 16, rax
1389    sub         rsp, 32                         ; reserve 32 bytes
1390    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
1391    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];
1392
1393        mov         rsi, arg(0) ;src_ptr
        ;-------------------------------------------------------------------
        ; Tail of the SSE2 "simple" vertical-edge loop filter.
        ; Reads a 16-row x 4-column strip straddling the vertical edge
        ; (columns p1 p0 q0 q1), transposes it into four xmm registers,
        ; applies the simple filter gated by blimit (arg(2)), transposes
        ; back and rewrites the four columns.
        ; NOTE(review): the function label/prologue and the t0/t1 stack-slot
        ; %defines are above this chunk — t0/t1 are 16-byte spill slots.
        ;-------------------------------------------------------------------
        movsxd      rax, dword ptr arg(1) ;src_pixel_step (row pitch in bytes)

        lea         rsi,        [rsi - 2 ]              ; back up 2 columns so each 4-byte read spans p1 p0 q0 q1
        lea         rdi,        [rsi + rax]             ; rdi -> row 1
        lea         rdx,        [rsi + rax*4]           ; rdx -> row 4
        lea         rcx,        [rdx + rax]             ; rcx -> row 5

        ; gather rows 0..7, 4 bytes each, and transpose with punpck stages
        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
        movd        xmm2,       [rdi]                   ; 13 12 11 10
        movd        xmm3,       [rcx]                   ; 53 52 51 50
        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10

        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30

        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20

        movdqa      xmm1,       xmm0
        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        movdqa      xmm2,       xmm0
        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        ; same gather/transpose for rows 8..15
        lea         rsi,        [rsi + rax*8]           ; advance to row 8
        lea         rdi,        [rsi + rax]             ; rdi -> row 9
        lea         rdx,        [rsi + rax*4]           ; rdx -> row 12
        lea         rcx,        [rdx + rax]             ; rcx -> row 13

        movd        xmm4,       [rsi]                   ; 83 82 81 80
        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
        movd        xmm6,       [rdi]                   ; 93 92 91 90
        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90

        movd        xmm1,       [rsi + rax*2]           ; a3 a2 a1 a0
        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
        movd        xmm3,       [rdi + rax*2]           ; b3 b2 b1 b0
        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
        punpckldq   xmm1,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
        punpckldq   xmm3,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0

        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
        punpcklbw   xmm1,       xmm3                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0

        movdqa      xmm7,       xmm4
        punpcklwd   xmm4,       xmm1                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
        punpckhwd   xmm7,       xmm1                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

        movdqa      xmm6,       xmm4
        punpckldq   xmm4,       xmm7                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
        punpckhdq   xmm6,       xmm7                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82

        movdqa      xmm1,       xmm0
        movdqa      xmm3,       xmm2

        ; merge both halves: one register per pixel column across all 16 rows
        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        mov         rdx,        arg(2)                          ;blimit

        ; calculate mask: filter where abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit
        movdqa      xmm6,       xmm0                            ; p1
        movdqa      xmm7,       xmm3                            ; q1
        psubusb     xmm7,       xmm0                            ; q1-=p1
        psubusb     xmm6,       xmm3                            ; p1-=q1
        por         xmm6,       xmm7                            ; abs(p1-q1) via saturating-subtract pair
        pand        xmm6,       [GLOBAL(tfe)]                   ; clear lsb of each byte so the word shift below cannot leak across byte lanes
        psrlw       xmm6,       1                               ; abs(p1-q1)/2 (word shift acts per byte thanks to the tfe mask)

        movdqa      xmm7, [rdx]                                 ; blimit; NOTE(review): assumes caller supplies 16 replicated bytes — confirm

        movdqa      xmm5,       xmm1                            ; p0
        movdqa      xmm4,       xmm2                            ; q0
        psubusb     xmm5,       xmm2                            ; p0-=q0
        psubusb     xmm4,       xmm1                            ; q0-=p0
        por         xmm5,       xmm4                            ; abs(p0 - q0)
        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm4, [GLOBAL(t80)]                         ; 0x80 bias for signed<->unsigned conversion

        psubusb     xmm5,        xmm7                           ; saturates to 0 unless measure > blimit
        pxor        xmm7,        xmm7                           ; xmm7 = 0 (also relied on at the first save-sign below)
        pcmpeqb     xmm5,        xmm7                           ; xmm5 = mask: 0xff where filtering applies

        ; start work on filters
        movdqa        t0,        xmm0                           ; spill unfiltered p1 (simple filter leaves p1/q1 unchanged)
        movdqa        t1,        xmm3                           ; spill unfiltered q1

        pxor        xmm0,        xmm4                  ; p1 offset to convert to signed values
        pxor        xmm3,        xmm4                  ; q1 offset to convert to signed values
        psubsb      xmm0,        xmm3                           ; p1 - q1

        pxor        xmm1,        xmm4                  ; p0 offset to convert to signed values
        pxor        xmm2,        xmm4                  ; q0 offset to convert to signed values

        movdqa      xmm3,        xmm2                           ; keep biased q0 for the update below
        psubsb      xmm2,        xmm1                           ; q0 - p0
        paddsb      xmm0,        xmm2                           ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm0,        xmm2                           ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm0,        xmm2                           ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5,        xmm0                           ; mask filter values we don't care about

        movdqa      xmm0, xmm5
        paddsb      xmm5,        [GLOBAL(t3)]                  ; Filter2 = filter + 3 (for the p0 update)
        paddsb      xmm0,        [GLOBAL(t4)]                  ; Filter1 = filter + 4 (for the q0 update)

        movdqa  xmm6, [GLOBAL(te0)]                             ; 0xe0: keeps the sign-extended top 3 bits
        movdqa  xmm2, [GLOBAL(t1f)]                             ; 0x1f: clears bits leaked in by the word shift

        ; emulate per-byte arithmetic >>3 (no such SSE2 insn): word shift,
        ; then mask off leaked bits and OR the sign bits back in
;        pxor        xmm7, xmm7                                 ; elided: xmm7 is still zero from the mask compare above
        pcmpgtb     xmm7, xmm0              ;save sign (0xff where Filter1 < 0)
        pand        xmm7, xmm6              ;preserve the upper 3 bits
        psrlw       xmm0, 3
        pand        xmm0, xmm2              ;clear out upper 3 bits
        por         xmm0, xmm7              ;add sign
        psubsb      xmm3, xmm0              ; q0 -= Filter1>>3

        pxor        xmm7, xmm7
        pcmpgtb     xmm7, xmm5              ;save sign (0xff where Filter2 < 0)
        pand        xmm7, xmm6              ;preserve the upper 3 bits
        psrlw       xmm5, 3
        pand        xmm5, xmm2              ;clear out upper 3 bits
        por         xmm5, xmm7              ;add sign
        paddsb      xmm1, xmm5              ; p0 += Filter2>>3

        pxor        xmm3,        xmm4                  ; unoffset   q0 (back to unsigned)
        pxor        xmm1,        xmm4                  ; unoffset   p0 (back to unsigned)

        movdqa      xmm0,        t0                             ; reload unfiltered p1
        movdqa      xmm4,        t1                             ; reload unfiltered q1

        ; write out order: xmm0 xmm2 xmm1 xmm3
        lea         rdx,        [rsi + rax*4]

        ; transpose back to write out
        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        movdqa      xmm6,       xmm0
        punpcklbw   xmm0,       xmm1                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm6,       xmm1                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm5,       xmm3
        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm2,       xmm0
        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40

        movdqa      xmm3,       xmm6
        punpcklwd   xmm6,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        ; rsi still points at row 8, so store rows 8..15 first,
        ; shifting the next 4-byte row into lane 0 after each store
        movd        [rsi],      xmm6                               ; write the second 8-line result
        movd        [rdx],      xmm3
        psrldq      xmm6,       4
        psrldq      xmm3,       4
        movd        [rdi],      xmm6
        movd        [rcx],      xmm3
        psrldq      xmm6,       4
        psrldq      xmm3,       4
        movd        [rsi + rax*2], xmm6
        movd        [rdx + rax*2], xmm3
        psrldq      xmm6,       4
        psrldq      xmm3,       4
        movd        [rdi + rax*2], xmm6
        movd        [rcx + rax*2], xmm3

        neg         rax                                 ; rsi -= 8*pitch: rewind to row 0
        lea         rsi,        [rsi + rax*8]
        neg         rax                                 ; restore positive pitch
        lea         rdi,        [rsi + rax]             ; rdi -> row 1
        lea         rdx,        [rsi + rax*4]           ; rdx -> row 4
        lea         rcx,        [rdx + rax]             ; rcx -> row 5

        movd        [rsi],      xmm0                                ; write the first 8-line result
        movd        [rdx],      xmm2
        psrldq      xmm0,       4
        psrldq      xmm2,       4
        movd        [rdi],      xmm0
        movd        [rcx],      xmm2
        psrldq      xmm0,       4
        psrldq      xmm2,       4
        movd        [rsi + rax*2], xmm0
        movd        [rdx + rax*2], xmm2
        psrldq      xmm0,       4
        psrldq      xmm2,       4
        movd        [rdi + rax*2], xmm0
        movd        [rcx + rax*2], xmm2

    add rsp, 32                             ; free the two 16-byte t0/t1 spill slots
    pop rsp                                 ; restore caller rsp saved in the prologue (above this chunk)
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1609
SECTION_RODATA
; 16-byte-aligned constant vectors for the SSE2 loop filters.
align 16
tfe:                        ; 0xfe per byte: clears each byte's lsb so psrlw-by-1 behaves as a per-byte /2
    times 16 db 0xfe
align 16
t80:                        ; 0x80 per byte: bias toggling bytes between unsigned and signed domains
    times 16 db 0x80
align 16
t1s:                        ; 0x01 per byte; not referenced in this chunk — presumably used by other filter variants in this file (TODO confirm)
    times 16 db 0x01
align 16
t3:                         ; +3 rounding constant for the Filter2 (p0-update) path
    times 16 db 0x03
align 16
t4:                         ; +4 rounding constant for the Filter1 (q0-update) path
    times 16 db 0x04
align 16
ones:                       ; word 0x0001; not referenced in this chunk — presumably used elsewhere in this file (TODO confirm)
    times 8 dw 0x0001
align 16
s9:                         ; word 0x0900; not referenced in this chunk — presumably used elsewhere in this file (TODO confirm)
    times 8 dw 0x0900
align 16
s63:                        ; word 0x003f; not referenced in this chunk — presumably used elsewhere in this file (TODO confirm)
    times 8 dw 0x003f
align 16
te0:                        ; 0xe0 per byte: keeps the top-3 sign bits when emulating per-byte arithmetic >>3
    times 16 db 0xe0
align 16
t1f:                        ; 0x1f per byte: clears bits leaked across byte lanes by the word-wise psrlw-by-3
    times 16 db 0x1f
1641