;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Right shift applied after each filter tap sum (the two taps of every
; filter entry below add up to 128 = 1 << 7).
%define xmm_filter_shift            7


;-----------------------------------------------------------------------------
;void vp8_filter_block2d_bil_var_ssse3
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int  xoffset,
;    int  yoffset,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For each of Height rows, 16 pixels of the reference block are bilinearly
; filtered (horizontally by xoffset, vertically by yoffset), the 16 source
; pixels of the same row are subtracted, and the differences are accumulated:
;     *sum        = sum of all differences
;     *sumsquared = sum of all squared differences
;
; Note: The filter coefficient at offset=0 is 128. Since the second operand
; of pmaddubsw is interpreted as signed bytes, 128 cannot be used as a
; coefficient, so a zero offset is handled by separate code paths:
;     xoffset != 0, yoffset != 0 : horizontal + vertical pass (main loop)
;     xoffset == 0, yoffset != 0 : vertical pass only         (sp_only)
;     xoffset != 0, yoffset == 0 : horizontal pass only       (fp_only)
;     xoffset == 0, yoffset == 0 : plain difference           (full_pixel)
;
; Preconditions visible from the code:
;   - Height must be >= 1: every loop runs its body before decrementing and
;     testing the row counter.
;   - xoffset / yoffset are scaled by 16 and used as byte offsets into
;     vp8_bilinear_filters_ssse3 without any range check; the table has 8
;     entries, so presumably callers pass 0..7 (verify against callers).
;   - Paths with a horizontal pass read 17 bytes per reference row
;     ([rsi] and [rsi+1], 16 bytes each); paths with a vertical pass read
;     Height+1 reference rows.
;   - The sum is accumulated in 16-bit lanes (paddw into xmm6) and only
;     widened at the end. NOTE(review): this is exact only while each lane's
;     running total fits in a signed word - confirm for the Heights used.
;
; Register roles (all paths):
;   rsi  = current reference row        rdi  = current source row
;   rcx  = rows remaining               xmm6 = word-lane sum accumulator
;   xmm7 = dword-lane SSE accumulator
; SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, arg(), GLOBAL() and sym() are
; ABI helper macros expected from the included x86_abi_support.asm.
;-----------------------------------------------------------------------------
global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
sym(vp8_filter_block2d_bil_var_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        pxor            xmm6,           xmm6                 ; sum  = 0
        pxor            xmm7,           xmm7                 ; sse  = 0

        lea             rcx,            [GLOBAL(vp8_bilinear_filters_ssse3)]
        movsxd          rax,            dword ptr arg(5)     ; xoffset

        test            rax,            rax                  ; skip first_pass filter if xoffset=0
        jz              .filter_block2d_bil_var_ssse3_sp_only

        shl             rax,            4                    ; 16 bytes per filter entry
        lea             rax,            [rax + rcx]          ; HFilter

        movsxd          rdx,            dword ptr arg(6)     ; yoffset

        test            rdx,            rdx                  ; skip second_pass filter if yoffset=0
        jz              .filter_block2d_bil_var_ssse3_fp_only

        shl             rdx,            4
        lea             rdx,            [rdx + rcx]          ; VFilter

        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height

        pxor            xmm4,           xmm4                 ; zero, used to widen src bytes to words

        ; Horizontal pass on the first reference row; the result is kept as
        ; 16 packed bytes in xmm0 ("previous row" for the vertical pass).
        movdqu          xmm0,           XMMWORD PTR [rsi]
        movdqu          xmm1,           XMMWORD PTR [rsi+1]
        movdqa          xmm2,           xmm0

        punpcklbw       xmm0,           xmm1                 ; interleave pixel i with pixel i+1
        punpckhbw       xmm2,           xmm1
        pmaddubsw       xmm0,           [rax]                ; p[i]*c0 + p[i+1]*c1
        pmaddubsw       xmm2,           [rax]

        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]  ; round
        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm0,           xmm_filter_shift
        psraw           xmm2,           xmm_filter_shift

        packuswb        xmm0,           xmm2

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1)     ;ref_pixels_per_line
        movsxd          r9,             dword ptr arg(3)     ;src_pixels_per_line
        lea             rsi,            [rsi + r8]
%endif

.filter_block2d_bil_var_ssse3_loop:
        ; Horizontal pass on the next reference row -> xmm1 (16 bytes).
        movdqu          xmm1,           XMMWORD PTR [rsi]
        movdqu          xmm2,           XMMWORD PTR [rsi+1]
        movdqa          xmm3,           xmm1

        punpcklbw       xmm1,           xmm2
        punpckhbw       xmm3,           xmm2
        pmaddubsw       xmm1,           [rax]
        pmaddubsw       xmm3,           [rax]

        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm1,           xmm_filter_shift
        psraw           xmm3,           xmm_filter_shift
        packuswb        xmm1,           xmm3

        ; Vertical pass between the previous (xmm0) and current (xmm1)
        ; horizontally filtered rows; the current row becomes the previous.
        movdqa          xmm2,           xmm0
        movdqa          xmm0,           xmm1
        movdqa          xmm3,           xmm2

        punpcklbw       xmm2,           xmm1
        punpckhbw       xmm3,           xmm1
        pmaddubsw       xmm2,           [rdx]
        pmaddubsw       xmm3,           [rdx]

        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm2,           xmm_filter_shift
        psraw           xmm3,           xmm_filter_shift

        ; Widen the 16 source pixels to words.
        movq            xmm1,           QWORD PTR [rdi]
        punpcklbw       xmm1,           xmm4
        movq            xmm5,           QWORD PTR [rdi+8]
        punpcklbw       xmm5,           xmm4

        ; diff = filtered - src; accumulate diff and diff^2.
        psubw           xmm2,           xmm1
        psubw           xmm3,           xmm5
        paddw           xmm6,           xmm2
        paddw           xmm6,           xmm3
        pmaddwd         xmm2,           xmm2
        pmaddwd         xmm3,           xmm3
        paddd           xmm7,           xmm2
        paddd           xmm7,           xmm3

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rsi,            [rsi + r8]
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .filter_block2d_bil_var_ssse3_loop

        jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_sp_only:
        ; xoffset == 0: vertical (second) pass only. rcx still holds the
        ; filter table base here.
        movsxd          rdx,            dword ptr arg(6)     ; yoffset

        test            rdx,            rdx                  ; Both xoffset =0 and yoffset=0
        jz              .filter_block2d_bil_var_ssse3_full_pixel

        shl             rdx,            4
        lea             rdx,            [rdx + rcx]          ; VFilter

        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line

        pxor            xmm4,           xmm4                 ; zero, used to widen src bytes to words

        movdqu          xmm1,           XMMWORD PTR [rsi]    ; previous row = first reference row

%if ABI_IS_32BIT=0
        movsxd          r9,             dword ptr arg(3)     ;src_pixels_per_line
%endif

        lea             rsi,            [rsi + rax]

.filter_block2d_bil_sp_only_loop:
        movdqu          xmm3,           XMMWORD PTR [rsi]    ; current row
        movdqa          xmm2,           xmm1
        movdqa          xmm0,           xmm3                 ; keep current row for next iteration

        punpcklbw       xmm1,           xmm3                 ; interleave previous/current rows
        punpckhbw       xmm2,           xmm3
        pmaddubsw       xmm1,           [rdx]
        pmaddubsw       xmm2,           [rdx]

        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm1,           xmm_filter_shift
        psraw           xmm2,           xmm_filter_shift

        movq            xmm3,           QWORD PTR [rdi]
        punpcklbw       xmm3,           xmm4
        movq            xmm5,           QWORD PTR [rdi+8]
        punpcklbw       xmm5,           xmm4

        psubw           xmm1,           xmm3
        psubw           xmm2,           xmm5
        paddw           xmm6,           xmm1
        paddw           xmm6,           xmm2
        pmaddwd         xmm1,           xmm1
        pmaddwd         xmm2,           xmm2
        paddd           xmm7,           xmm1
        paddd           xmm7,           xmm2

        movdqa          xmm1,           xmm0                 ; previous row = current row
        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line

%if ABI_IS_32BIT
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .filter_block2d_bil_sp_only_loop

        jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_full_pixel:
        ; xoffset == 0 and yoffset == 0: no filtering, plain ref - src.
        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
        pxor            xmm0,           xmm0                 ; zero, used to widen bytes to words

.filter_block2d_bil_full_pixel_loop:
        movq            xmm1,           QWORD PTR [rsi]
        punpcklbw       xmm1,           xmm0
        movq            xmm2,           QWORD PTR [rsi+8]
        punpcklbw       xmm2,           xmm0

        movq            xmm3,           QWORD PTR [rdi]
        punpcklbw       xmm3,           xmm0
        movq            xmm4,           QWORD PTR [rdi+8]
        punpcklbw       xmm4,           xmm0

        psubw           xmm1,           xmm3
        psubw           xmm2,           xmm4
        paddw           xmm6,           xmm1
        paddw           xmm6,           xmm2
        pmaddwd         xmm1,           xmm1
        pmaddwd         xmm2,           xmm2
        paddd           xmm7,           xmm1
        paddd           xmm7,           xmm2

        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
        lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
        sub             rcx,            1
        jnz             .filter_block2d_bil_full_pixel_loop

        jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_fp_only:
        ; yoffset == 0: horizontal (first) pass only. rax = HFilter.
        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line

        pxor            xmm4,           xmm4                 ; zero, used to widen src bytes to words

%if ABI_IS_32BIT=0
        movsxd          r9,             dword ptr arg(3)     ;src_pixels_per_line
%endif

.filter_block2d_bil_fp_only_loop:
        movdqu          xmm1,           XMMWORD PTR [rsi]
        movdqu          xmm2,           XMMWORD PTR [rsi+1]
        movdqa          xmm3,           xmm1

        punpcklbw       xmm1,           xmm2
        punpckhbw       xmm3,           xmm2
        pmaddubsw       xmm1,           [rax]
        pmaddubsw       xmm3,           [rax]

        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm1,           xmm_filter_shift
        psraw           xmm3,           xmm_filter_shift

        movq            xmm2,           QWORD PTR [rdi]      ; 8-byte load, same as the other loops
        punpcklbw       xmm2,           xmm4
        movq            xmm5,           QWORD PTR [rdi+8]
        punpcklbw       xmm5,           xmm4

        psubw           xmm1,           xmm2
        psubw           xmm3,           xmm5
        paddw           xmm6,           xmm1
        paddw           xmm6,           xmm3
        pmaddwd         xmm1,           xmm1
        pmaddwd         xmm3,           xmm3
        paddd           xmm7,           xmm1
        paddd           xmm7,           xmm3

        lea             rsi,            [rsi + rdx]
%if ABI_IS_32BIT
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .filter_block2d_bil_fp_only_loop

        ; fall through to the final reduction

.filter_block2d_bil_variance:
        ; Horizontal reduction of the accumulators:
        ;   xmm6 (8 signed words)  -> one dword sum in xmm0
        ;   xmm7 (4 dwords)        -> one dword SSE in xmm6
        pxor            xmm0,           xmm0
        pxor            xmm1,           xmm1
        pxor            xmm5,           xmm5

        ; Place each word in the upper half of a dword, then arithmetic
        ; shift right by 16 to sign-extend it.
        punpcklwd       xmm0,           xmm6
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16
        psrad           xmm1,           16
        paddd           xmm0,           xmm1
        movdqa          xmm1,           xmm0

        movdqa          xmm6,           xmm7
        punpckldq       xmm6,           xmm5
        punpckhdq       xmm7,           xmm5
        paddd           xmm6,           xmm7

        punpckldq       xmm0,           xmm5
        punpckhdq       xmm1,           xmm5
        paddd           xmm0,           xmm1

        movdqa          xmm7,           xmm6
        movdqa          xmm1,           xmm0

        psrldq          xmm7,           8
        psrldq          xmm1,           8

        paddd           xmm6,           xmm7
        paddd           xmm0,           xmm1

        mov             rsi,            arg(7) ;[Sum]
        mov             rdi,            arg(8) ;[SSE]

        movd            [rsi],          xmm0
        movd            [rdi],          xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
; Rounding term added before the shift by xmm_filter_shift.
align 16
xmm_bi_rd:
    times 8 dw 64
; Bilinear filter taps, one 16-byte entry per offset: the byte pair
; (128 - 16*n, 16*n) repeated 8 times so it lines up with interleaved pixel
; pairs for pmaddubsw. Entry 0 (128, 0) is never loaded by the code above,
; which branches to the unfiltered paths whenever an offset is 0.
align 16
vp8_bilinear_filters_ssse3:
    times 8 db 128, 0
    times 8 db 112, 16
    times 8 db 96,  32
    times 8 db 80,  48
    times 8 db 64,  64
    times 8 db 48,  80
    times 8 db 32,  96
    times 8 db 16,  112