1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
%define xmm_filter_shift            7   ; each bilinear tap pair sums to 128 = 1 << 7


;-----------------------------------------------------------------------
;void vp8_filter_block2d_bil_var_ssse3
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int  xoffset,
;    int  yoffset,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Bilinearly filters a 16-pixel-wide block at ref_ptr (horizontal taps chosen
; by xoffset, vertical taps by yoffset), subtracts the 16 pixels at src_ptr on
; each of Height rows, and stores the sum of the differences to *sum and the
; sum of the squared differences to *sumsquared.
;
; xoffset / yoffset index the 16-byte entries of vp8_bilinear_filters_ssse3,
; so they are expected to be in the range of that table.
;
; Note: The filter coefficient at offset=0 is 128. Since the second register
; for Pmaddubsw is signed bytes, we must calculate zero offset separately.
; That is why there are four code paths:
;   xoffset != 0, yoffset != 0 : horizontal + vertical filter
;   xoffset == 0, yoffset != 0 : vertical filter only   (sp_only)
;   xoffset != 0, yoffset == 0 : horizontal filter only (fp_only)
;   xoffset == 0, yoffset == 0 : no filter              (full_pixel)
;
; Memory read: a horizontal pass loads 16 bytes from [ref+1], i.e. 17 bytes
; per reference row; a vertical pass reads Height+1 reference rows.
;
; Register roles inside the loops:
;   rsi = ref row pointer, rdi = src row pointer, rcx = rows remaining
;   rax = horizontal filter taps (or ref stride when no horizontal pass)
;   rdx = vertical filter taps   (or a stride when no vertical pass)
;   xmm6 = running sum of differences, eight signed 16-bit lanes
;   xmm7 = running sum of squared differences, four 32-bit lanes
; Each 16-bit lane of xmm6 receives two differences per row, so Height must
; be small enough that those lanes cannot overflow.
;
; Height == 0 stores 0 to both outputs without touching ref_ptr / src_ptr.
;-----------------------------------------------------------------------
global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
sym(vp8_filter_block2d_bil_var_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                 ; sum  = 0
        pxor            xmm7,           xmm7                 ; sse  = 0

        ; The row loops below are "do ... while (--rows)"; a zero row count
        ; would wrap around, so bail out early and report zero sums.
        movsxd          rax,            dword ptr arg(4)     ; Height
        test            rax,            rax
        jz              .filter_block2d_bil_variance

        lea             rcx,            [GLOBAL(vp8_bilinear_filters_ssse3)]
        movsxd          rax,            dword ptr arg(5)     ; xoffset

        test            rax,            rax                  ; skip first_pass filter if xoffset=0
        je              .filter_block2d_bil_var_ssse3_sp_only

        shl             rax,            4                    ; point to filter coeff with xoffset
        lea             rax,            [rax + rcx]          ; HFilter

        movsxd          rdx,            dword ptr arg(6)     ; yoffset

        test            rdx,            rdx                  ; skip second_pass filter if yoffset=0
        je              .filter_block2d_bil_var_ssse3_fp_only

        shl             rdx,            4
        lea             rdx,            [rdx + rcx]          ; VFilter

        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height

        ; Horizontally filter row 0; the result (16 packed bytes) stays in
        ; xmm0 as the "previous row" for the vertical pass.
        movdqu          xmm0,           XMMWORD PTR [rsi]
        movdqu          xmm1,           XMMWORD PTR [rsi+1]
        movdqa          xmm2,           xmm0

        punpcklbw       xmm0,           xmm1                 ; interleave p[i], p[i+1]
        punpckhbw       xmm2,           xmm1
        pmaddubsw       xmm0,           [rax]
        pmaddubsw       xmm2,           [rax]

        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm0,           xmm_filter_shift
        psraw           xmm2,           xmm_filter_shift

        packuswb        xmm0,           xmm2

        pxor            xmm4,           xmm4                 ; zero, for byte->word unpack of src

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
        lea             rsi,            [rsi + r8]
%endif

.filter_block2d_bil_var_ssse3_loop:
        ; Horizontal pass on the next reference row -> xmm1 (packed bytes).
        movdqu          xmm1,           XMMWORD PTR [rsi]
        movdqu          xmm2,           XMMWORD PTR [rsi+1]
        movdqa          xmm3,           xmm1

        punpcklbw       xmm1,           xmm2
        punpckhbw       xmm3,           xmm2
        pmaddubsw       xmm1,           [rax]
        pmaddubsw       xmm3,           [rax]

        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm1,           xmm_filter_shift
        psraw           xmm3,           xmm_filter_shift
        packuswb        xmm1,           xmm3

        ; Vertical pass between the previous row (xmm0) and this row (xmm1);
        ; this row then becomes the previous row for the next iteration.
        movdqa          xmm2,           xmm0
        movdqa          xmm0,           xmm1
        movdqa          xmm3,           xmm2

        punpcklbw       xmm2,           xmm1
        punpckhbw       xmm3,           xmm1
        pmaddubsw       xmm2,           [rdx]
        pmaddubsw       xmm3,           [rdx]

        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm2,           xmm_filter_shift
        psraw           xmm3,           xmm_filter_shift

        ; Widen 16 src pixels to words and accumulate diff / diff^2.
        movq            xmm1,           QWORD PTR [rdi]
        punpcklbw       xmm1,           xmm4
        movq            xmm5,           QWORD PTR [rdi+8]
        punpcklbw       xmm5,           xmm4

        psubw           xmm2,           xmm1
        psubw           xmm3,           xmm5
        paddw           xmm6,           xmm2
        paddw           xmm6,           xmm3
        pmaddwd         xmm2,           xmm2
        pmaddwd         xmm3,           xmm3
        paddd           xmm7,           xmm2
        paddd           xmm7,           xmm3

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rsi,            [rsi + r8]
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .filter_block2d_bil_var_ssse3_loop

        jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_sp_only:
        movsxd          rdx,            dword ptr arg(6)     ; yoffset

        test            rdx,            rdx                  ; Both xoffset =0 and yoffset=0
        je              .filter_block2d_bil_var_ssse3_full_pixel

        shl             rdx,            4
        lea             rdx,            [rdx + rcx]          ; VFilter

        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line

        movdqu          xmm1,           XMMWORD PTR [rsi]    ; xmm1 = previous (first) row
        pxor            xmm4,           xmm4                 ; zero, for byte->word unpack of src

%if ABI_IS_32BIT=0
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
%endif

        lea             rsi,            [rsi + rax]

.filter_block2d_bil_sp_only_loop:
        movdqu          xmm3,           XMMWORD PTR [rsi]
        movdqa          xmm2,           xmm1
        movdqa          xmm0,           xmm3                 ; keep this row; xmm3 is reused below

        punpcklbw       xmm1,           xmm3                 ; interleave prev row, this row
        punpckhbw       xmm2,           xmm3
        pmaddubsw       xmm1,           [rdx]
        pmaddubsw       xmm2,           [rdx]

        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm1,           xmm_filter_shift
        psraw           xmm2,           xmm_filter_shift

        movq            xmm3,           QWORD PTR [rdi]
        punpcklbw       xmm3,           xmm4
        movq            xmm5,           QWORD PTR [rdi+8]
        punpcklbw       xmm5,           xmm4

        psubw           xmm1,           xmm3
        psubw           xmm2,           xmm5
        paddw           xmm6,           xmm1
        paddw           xmm6,           xmm2
        pmaddwd         xmm1,           xmm1
        pmaddwd         xmm2,           xmm2
        paddd           xmm7,           xmm1
        paddd           xmm7,           xmm2

        movdqa          xmm1,           xmm0                 ; this row becomes the previous row
        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line

%if ABI_IS_32BIT
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .filter_block2d_bil_sp_only_loop

        jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_full_pixel:
        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
        pxor            xmm0,           xmm0                 ; zero, for byte->word unpack

.filter_block2d_bil_full_pixel_loop:
        movq            xmm1,           QWORD PTR [rsi]
        punpcklbw       xmm1,           xmm0
        movq            xmm2,           QWORD PTR [rsi+8]
        punpcklbw       xmm2,           xmm0

        movq            xmm3,           QWORD PTR [rdi]
        punpcklbw       xmm3,           xmm0
        movq            xmm4,           QWORD PTR [rdi+8]
        punpcklbw       xmm4,           xmm0

        psubw           xmm1,           xmm3
        psubw           xmm2,           xmm4
        paddw           xmm6,           xmm1
        paddw           xmm6,           xmm2
        pmaddwd         xmm1,           xmm1
        pmaddwd         xmm2,           xmm2
        paddd           xmm7,           xmm1
        paddd           xmm7,           xmm2

        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
        lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
        sub             rcx,            1
        jnz             .filter_block2d_bil_full_pixel_loop

        jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_fp_only:
        ; rax still holds the HFilter pointer computed above.
        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line

        pxor            xmm4,           xmm4                 ; zero, for byte->word unpack of src

%if ABI_IS_32BIT=0
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
%endif

.filter_block2d_bil_fp_only_loop:
        movdqu          xmm1,           XMMWORD PTR [rsi]
        movdqu          xmm2,           XMMWORD PTR [rsi+1]
        movdqa          xmm3,           xmm1

        punpcklbw       xmm1,           xmm2
        punpckhbw       xmm3,           xmm2
        pmaddubsw       xmm1,           [rax]
        pmaddubsw       xmm3,           [rax]

        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm1,           xmm_filter_shift
        psraw           xmm3,           xmm_filter_shift

        movq            xmm2,           QWORD PTR [rdi]
        punpcklbw       xmm2,           xmm4
        movq            xmm5,           QWORD PTR [rdi+8]
        punpcklbw       xmm5,           xmm4

        psubw           xmm1,           xmm2
        psubw           xmm3,           xmm5
        paddw           xmm6,           xmm1
        paddw           xmm6,           xmm3
        pmaddwd         xmm1,           xmm1
        pmaddwd         xmm3,           xmm3
        paddd           xmm7,           xmm1
        paddd           xmm7,           xmm3

        lea             rsi,            [rsi + rdx]
%if ABI_IS_32BIT
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .filter_block2d_bil_fp_only_loop

        ; fall through to the reduction below

.filter_block2d_bil_variance:
        ; Horizontal reduction: xmm6 (8 signed words) -> one dword in xmm0,
        ; xmm7 (4 dwords) -> one dword in xmm6.
        pxor        xmm0,           xmm0
        pxor        xmm1,           xmm1
        pxor        xmm5,           xmm5

        punpcklwd   xmm0,           xmm6                 ; word -> high half of dword
        punpckhwd   xmm1,           xmm6
        psrad       xmm0,           16                   ; arithmetic shift = sign-extend
        psrad       xmm1,           16
        paddd       xmm0,           xmm1
        movdqa      xmm1,           xmm0

        movdqa      xmm6,           xmm7
        punpckldq   xmm6,           xmm5
        punpckhdq   xmm7,           xmm5
        paddd       xmm6,           xmm7

        punpckldq   xmm0,           xmm5
        punpckhdq   xmm1,           xmm5
        paddd       xmm0,           xmm1

        movdqa      xmm7,           xmm6
        movdqa      xmm1,           xmm0

        psrldq      xmm7,           8
        psrldq      xmm1,           8

        paddd       xmm6,           xmm7
        paddd       xmm0,           xmm1

        mov         rsi,            arg(7) ;[Sum]
        mov         rdi,            arg(8) ;[SSE]

        movd        [rsi],       xmm0
        movd        [rdi],       xmm6

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
349
350
SECTION_RODATA

; Rounding term added to each filtered word before the arithmetic shift by
; xmm_filter_shift: half of (1 << xmm_filter_shift), i.e. 64.
align 16
xmm_bi_rd:
    times 8 dw (1 << (xmm_filter_shift - 1))

; Bilinear tap table, one 16-byte entry per sub-pixel offset 0..7.
; Entry i is the byte pair (128 - 16*i, 16*i) repeated 8 times:
;   (128,0) (112,16) (96,32) (80,48) (64,64) (48,80) (32,96) (16,112)
; The pair layout matches the byte-interleaved pixels fed to pmaddubsw.
; Entry 0 is never used as a pmaddubsw operand (128 does not fit a signed
; byte); the zero-offset cases are handled by dedicated code paths.
align 16
vp8_bilinear_filters_ssse3:
%assign bil_tap1 0
%rep 8
    times 8 db (128 - bil_tap1), bil_tap1
%assign bil_tap1 bil_tap1 + 16
%endrep
%undef bil_tap1
365