;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; Two-tap filters for 16-bit pixels, SSE2.
;
; Every entry point below reads the same seven arguments:
;   arg(0)  src_ptr          source pixels (16-bit words)
;   arg(1)  pixels_per_line  source stride, counted in 16-bit units
;   arg(2)  output_ptr       destination pixels (16-bit words)
;   arg(3)  out_pitch        destination stride, counted in 16-bit units
;   arg(4)  output_height    number of rows to produce
;   arg(5)  filter ptr       16-bit taps; only words 3 (k3) and 4 (k4)
;                            are used. Loaded with movdqa, so the
;                            pointer must be 16-byte aligned.
;   arg(6)  bps              bit depth used to build the clamp maximum
;
; For a pixel pair (a, b) each output word is
;   clamp((a*k3 + b*k4 + HIGHBD_FILTER_ROUND) >> HIGHBD_FILTER_SHIFT,
;         0, (1 << bps) - 1)
; The "_v2" entry points pair a pixel with the one a full source stride
; below it; the "_h2" entry points pair it with its right-hand
; neighbour. The "_avg" entry points additionally pavgw the result with
; the words already present at the destination.
;
; The row loops decrement the counter after the body has run, so
; output_height is expected to be non-zero.
;-----------------------------------------------------------------------

%define HIGHBD_FILTER_SHIFT 7
%define HIGHBD_FILTER_ROUND 0x40            ; 1 << (HIGHBD_FILTER_SHIFT - 1)
%define HIGHBD_WORD_ONES    0x00010001      ; a dword holding two words of 1

;-----------------------------------------------------------------------
; HIGH_GET_PARAM_4
; Loads the arguments for the 4-pixel-wide kernels.
; On exit:
;   rsi  = src_ptr            rdi  = output_ptr
;   rax  = pixels_per_line    rdx  = out_pitch
;   rcx  = output_height
;   xmm4 = k3,k4 word pairs   xmm3 = rounding constant in every dword
;   xmm5 = (1 << bps) - 1 in every word
;   xmm2 = zero
; xmm1 is used as scratch.
;-----------------------------------------------------------------------
%macro HIGH_GET_PARAM_4 0
    mov       rsi, arg(0)                   ; src_ptr
    mov       rdi, arg(2)                   ; output_ptr
    mov       rdx, arg(5)                   ; filter ptr

    ; Interleave taps 3 and 4 so pmaddwd can weight an (a, b) pair.
    movdqa    xmm3, [rdx]
    pshuflw   xmm4, xmm3, 0xff              ; low words = k3
    psrldq    xmm3, 8                       ; bring word 4 down to word 0
    pshuflw   xmm3, xmm3, 0                 ; low words = k4
    punpcklwd xmm4, xmm3                    ; k3 k4 k3 k4 ...

    ; Rounding term, one copy per dword lane.
    mov       rcx, HIGHBD_FILTER_ROUND
    movq      xmm3, rcx
    pshufd    xmm3, xmm3, 0

    ; Clamp bounds: max = (1 << bps) - 1 per word, min = 0.
    movsxd    rcx, DWORD PTR arg(6)         ; bps
    movq      xmm2, rcx                     ; shift count for psllw
    mov       rdx, HIGHBD_WORD_ONES
    movq      xmm5, rdx
    pshufd    xmm5, xmm5, 0                 ; every word = 1
    movdqa    xmm1, xmm5
    psllw     xmm5, xmm2                    ; every word = 1 << bps
    psubw     xmm5, xmm1                    ; upper clamp bound
    pxor      xmm2, xmm2                    ; lower clamp bound

    movsxd    rax, DWORD PTR arg(1)         ; pixels_per_line
    movsxd    rdx, DWORD PTR arg(3)         ; out_pitch
    movsxd    rcx, DWORD PTR arg(4)         ; output_height
%endm

;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_4 avg
; Filters four pixels. Expects the register state left by
; HIGH_GET_PARAM_4, with the first pixels of each pair in the low four
; words of xmm0 and the second pixels in the low four words of xmm1.
; avg: non-zero to average with the words already stored at [rdi].
; Writes 8 bytes to [rdi], steps rsi/rdi by one stride and decrements
; rcx last, so the caller can branch on ZF straight after the macro.
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1

    punpcklwd xmm0, xmm1                    ; a0 b0 a1 b1 a2 b2 a3 b3
    pmaddwd   xmm0, xmm4                    ; a*k3 + b*k4 per dword

    paddd     xmm0, xmm3                    ; + rounding
    psrad     xmm0, HIGHBD_FILTER_SHIFT
    packssdw  xmm0, xmm0                    ; dwords -> saturated words

    ; Limit to [0, (1 << bps) - 1].
    pminsw    xmm0, xmm5
    pmaxsw    xmm0, xmm2

%if %1
    movq      xmm1, [rdi]                   ; current destination words
    pavgw     xmm0, xmm1
%endif

    movq      [rdi], xmm0
    lea       rdi, [rdi + 2*rdx]            ; next destination row
    lea       rsi, [rsi + 2*rax]            ; next source row
    dec       rcx                           ; rows left; sets ZF for jnz
%endm

; The wider kernels use xmm8/xmm9, which exist only in 64-bit mode.
%if ARCH_X86_64
;-----------------------------------------------------------------------
; HIGH_GET_PARAM
; Loads the arguments for the 8- and 16-pixel-wide kernels.
; On exit:
;   rsi  = src_ptr            rdi  = output_ptr
;   rax  = pixels_per_line    rdx  = out_pitch
;   rcx  = output_height
;   xmm7 = k3,k4 word pairs   xmm4 = rounding constant in every dword
;   xmm8 = (1 << bps) - 1 in every word
;   xmm5 = zero
; xmm1 and xmm6 are used as scratch.
;-----------------------------------------------------------------------
%macro HIGH_GET_PARAM 0
    mov       rsi, arg(0)                   ; src_ptr
    mov       rdi, arg(2)                   ; output_ptr
    mov       rdx, arg(5)                   ; filter ptr

    ; Interleave taps 3 and 4 across the full register.
    movdqa    xmm6, [rdx]
    pshuflw   xmm7, xmm6, 0xff              ; low words = k3
    pshufhw   xmm6, xmm6, 0                 ; high words = k4
    psrldq    xmm6, 8                       ; move the k4 copies to the low half
    punpcklwd xmm7, xmm6                    ; k3 k4 repeated four times

    ; Rounding term, one copy per dword lane.
    mov       rcx, HIGHBD_FILTER_ROUND
    movq      xmm4, rcx
    pshufd    xmm4, xmm4, 0

    ; Clamp bounds: max = (1 << bps) - 1 per word, min = 0.
    movsxd    rcx, DWORD PTR arg(6)         ; bps
    movq      xmm5, rcx                     ; shift count for psllw
    mov       rdx, HIGHBD_WORD_ONES
    movq      xmm8, rdx
    pshufd    xmm8, xmm8, 0                 ; every word = 1
    movdqa    xmm1, xmm8
    psllw     xmm8, xmm5                    ; every word = 1 << bps
    psubw     xmm8, xmm1                    ; upper clamp bound
    pxor      xmm5, xmm5                    ; lower clamp bound

    movsxd    rax, DWORD PTR arg(1)         ; pixels_per_line
    movsxd    rdx, DWORD PTR arg(3)         ; out_pitch
    movsxd    rcx, DWORD PTR arg(4)         ; output_height
%endm

;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_8 avg
; Filters eight pixels. Expects the register state left by
; HIGH_GET_PARAM, with the first pixels of each pair in xmm0 and the
; second pixels in xmm1. xmm6 is used as scratch.
; avg: non-zero to average with the words already stored at [rdi].
; Writes 16 bytes to [rdi], steps rsi/rdi by one stride and decrements
; rcx last, so the caller can branch on ZF straight after the macro.
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 1
    movdqa    xmm6, xmm0                    ; keep a copy for the upper four pairs
    punpcklwd xmm0, xmm1                    ; pairs 0..3
    punpckhwd xmm6, xmm1                    ; pairs 4..7
    pmaddwd   xmm0, xmm7
    pmaddwd   xmm6, xmm7

    paddd     xmm0, xmm4                    ; + rounding
    paddd     xmm6, xmm4
    psrad     xmm0, HIGHBD_FILTER_SHIFT
    psrad     xmm6, HIGHBD_FILTER_SHIFT
    packssdw  xmm0, xmm6                    ; eight saturated words

    ; Limit to [0, (1 << bps) - 1].
    pminsw    xmm0, xmm8
    pmaxsw    xmm0, xmm5

%if %1
    movdqu    xmm1, [rdi]                   ; current destination words
    pavgw     xmm0, xmm1
%endif
    movdqu    [rdi], xmm0

    lea       rdi, [rdi + 2*rdx]            ; next destination row
    lea       rsi, [rsi + 2*rax]            ; next source row
    dec       rcx                           ; rows left; sets ZF for jnz
%endm

;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_16 avg
; Filters sixteen pixels. Expects the register state left by
; HIGH_GET_PARAM, with the first pixels of each pair in xmm0 (0..7) and
; xmm2 (8..15), and the second pixels in xmm1 (0..7) and xmm3 (8..15).
; xmm6 and xmm9 are used as scratch.
; avg: non-zero to average with the words already stored at [rdi].
; Writes 32 bytes to [rdi], steps rsi/rdi by one stride and decrements
; rcx last, so the caller can branch on ZF straight after the macro.
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_16 1
    movdqa    xmm9, xmm0                    ; copies for the upper halves
    movdqa    xmm6, xmm2
    punpcklwd xmm0, xmm1                    ; pairs 0..3
    punpckhwd xmm9, xmm1                    ; pairs 4..7
    punpcklwd xmm2, xmm3                    ; pairs 8..11
    punpckhwd xmm6, xmm3                    ; pairs 12..15

    pmaddwd   xmm0, xmm7
    pmaddwd   xmm9, xmm7
    pmaddwd   xmm2, xmm7
    pmaddwd   xmm6, xmm7

    paddd     xmm0, xmm4                    ; + rounding
    paddd     xmm9, xmm4
    paddd     xmm2, xmm4
    paddd     xmm6, xmm4

    psrad     xmm0, HIGHBD_FILTER_SHIFT
    psrad     xmm9, HIGHBD_FILTER_SHIFT
    psrad     xmm2, HIGHBD_FILTER_SHIFT
    psrad     xmm6, HIGHBD_FILTER_SHIFT

    packssdw  xmm0, xmm9                    ; words 0..7
    packssdw  xmm2, xmm6                    ; words 8..15

    ; Limit to [0, (1 << bps) - 1].
    pminsw    xmm0, xmm8
    pmaxsw    xmm0, xmm5
    pminsw    xmm2, xmm8
    pmaxsw    xmm2, xmm5

%if %1
    movdqu    xmm1, [rdi]                   ; current destination words
    movdqu    xmm3, [rdi + 16]
    pavgw     xmm0, xmm1
    pavgw     xmm2, xmm3
%endif
    movdqu    [rdi], xmm0
    movdqu    [rdi + 16], xmm2

    lea       rdi, [rdi + 2*rdx]            ; next destination row
    lea       rsi, [rsi + 2*rax]            ; next source row
    dec       rcx                           ; rows left; sets ZF for jnz
%endm
%endif

;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d4_v2_sse2
; 4 pixels per row; each pixel is paired with the one a source stride
; below it. The result overwrites the destination.
;-----------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_v2_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push      rsi
    push      rdi
    ; prolog done

    HIGH_GET_PARAM_4
.next_row:
    movq      xmm0, [rsi]                   ; current row
    movq      xmm1, [rsi + 2*rax]           ; row below

    HIGH_APPLY_FILTER_4 0
    jnz       .next_row

    ; epilog
    pop       rdi
    pop       rsi
    UNSHADOW_ARGS
    pop       rbp
    ret

%if ARCH_X86_64
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d8_v2_sse2
; 8 pixels per row; each pixel is paired with the one a source stride
; below it. The result overwrites the destination.
;-----------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d8_v2_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_v2_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push      rsi
    push      rdi
    ; prolog done

    HIGH_GET_PARAM
.next_row:
    movdqu    xmm0, [rsi]                   ; current row
    movdqu    xmm1, [rsi + 2*rax]           ; row below

    HIGH_APPLY_FILTER_8 0
    jnz       .next_row

    ; epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret

;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d16_v2_sse2
; 16 pixels per row; each pixel is paired with the one a source stride
; below it. The result overwrites the destination.
;-----------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d16_v2_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_v2_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push      rsi
    push      rdi
    ; prolog done

    HIGH_GET_PARAM
.next_row:
    movdqu    xmm0, [rsi]                   ; current row, pixels 0..7
    movdqu    xmm1, [rsi + 2*rax]           ; row below,   pixels 0..7
    movdqu    xmm2, [rsi + 16]              ; current row, pixels 8..15
    movdqu    xmm3, [rsi + 2*rax + 16]      ; row below,   pixels 8..15

    HIGH_APPLY_FILTER_16 0
    jnz       .next_row

    ; epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret
%endif

;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d4_v2_avg_sse2
; 4 pixels per row; each pixel is paired with the one a source stride
; below it. The result is averaged with the existing destination.
;-----------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_v2_avg_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push      rsi
    push      rdi
    ; prolog done

    HIGH_GET_PARAM_4
.next_row:
    movq      xmm0, [rsi]                   ; current row
    movq      xmm1, [rsi + 2*rax]           ; row below

    HIGH_APPLY_FILTER_4 1
    jnz       .next_row

    ; epilog
    pop       rdi
    pop       rsi
    UNSHADOW_ARGS
    pop       rbp
    ret

%if ARCH_X86_64
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d8_v2_avg_sse2
; 8 pixels per row; each pixel is paired with the one a source stride
; below it. The result is averaged with the existing destination.
;-----------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_v2_avg_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push      rsi
    push      rdi
    ; prolog done

    HIGH_GET_PARAM
.next_row:
    movdqu    xmm0, [rsi]                   ; current row
    movdqu    xmm1, [rsi + 2*rax]           ; row below

    HIGH_APPLY_FILTER_8 1
    jnz       .next_row

    ; epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret

;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d16_v2_avg_sse2
; 16 pixels per row; each pixel is paired with the one a source stride
; below it. The result is averaged with the existing destination.
;-----------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_v2_avg_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push      rsi
    push      rdi
    ; prolog done

    HIGH_GET_PARAM
.next_row:
    movdqu    xmm0, [rsi]                   ; current row, pixels 0..7
    movdqu    xmm2, [rsi + 16]              ; current row, pixels 8..15
    movdqu    xmm1, [rsi + 2*rax]           ; row below,   pixels 0..7
    movdqu    xmm3, [rsi + 2*rax + 16]      ; row below,   pixels 8..15

    HIGH_APPLY_FILTER_16 1
    jnz       .next_row

    ; epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret
%endif

;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d4_h2_sse2
; 4 pixels per row; each pixel is paired with its right-hand neighbour.
; The result overwrites the destination. Note that the row is fetched
; with a full 16-byte load.
;-----------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d4_h2_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_h2_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push      rsi
    push      rdi
    ; prolog done

    HIGH_GET_PARAM_4
.next_row:
    movdqu    xmm0, [rsi]                   ; pixels x .. x+7
    movdqa    xmm1, xmm0
    psrldq    xmm1, 2                       ; shifted by one pixel: x+1 ..

    HIGH_APPLY_FILTER_4 0
    jnz       .next_row

    ; epilog
    pop       rdi
    pop       rsi
    UNSHADOW_ARGS
    pop       rbp
    ret

%if ARCH_X86_64
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d8_h2_sse2
; 8 pixels per row; each pixel is paired with its right-hand neighbour.
; The result overwrites the destination.
;-----------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d8_h2_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_h2_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push      rsi
    push      rdi
    ; prolog done

    HIGH_GET_PARAM
.next_row:
    movdqu    xmm0, [rsi]                   ; pixels x   .. x+7
    movdqu    xmm1, [rsi + 2]               ; pixels x+1 .. x+8

    HIGH_APPLY_FILTER_8 0
    jnz       .next_row

    ; epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret

;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d16_h2_sse2
; 16 pixels per row; each pixel is paired with its right-hand
; neighbour. The result overwrites the destination.
;-----------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d16_h2_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_h2_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push      rsi
    push      rdi
    ; prolog done

    HIGH_GET_PARAM
.next_row:
    movdqu    xmm0, [rsi]                   ; pixels x   .. x+7
    movdqu    xmm2, [rsi + 16]              ; pixels x+8 .. x+15
    movdqu    xmm1, [rsi + 2]               ; pixels x+1 .. x+8
    movdqu    xmm3, [rsi + 18]              ; pixels x+9 .. x+16

    HIGH_APPLY_FILTER_16 0
    jnz       .next_row

    ; epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret
%endif

;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d4_h2_avg_sse2
; 4 pixels per row; each pixel is paired with its right-hand neighbour.
; The result is averaged with the existing destination. Note that the
; row is fetched with a full 16-byte load.
;-----------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_h2_avg_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push      rsi
    push      rdi
    ; prolog done

    HIGH_GET_PARAM_4
.next_row:
    movdqu    xmm0, [rsi]                   ; pixels x .. x+7
    movdqa    xmm1, xmm0
    psrldq    xmm1, 2                       ; shifted by one pixel: x+1 ..

    HIGH_APPLY_FILTER_4 1
    jnz       .next_row

    ; epilog
    pop       rdi
    pop       rsi
    UNSHADOW_ARGS
    pop       rbp
    ret

%if ARCH_X86_64
;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d8_h2_avg_sse2
; 8 pixels per row; each pixel is paired with its right-hand neighbour.
; The result is averaged with the existing destination.
;-----------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_h2_avg_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push      rsi
    push      rdi
    ; prolog done

    HIGH_GET_PARAM
.next_row:
    movdqu    xmm0, [rsi]                   ; pixels x   .. x+7
    movdqu    xmm1, [rsi + 2]               ; pixels x+1 .. x+8

    HIGH_APPLY_FILTER_8 1
    jnz       .next_row

    ; epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret

;-----------------------------------------------------------------------
; vpx_highbd_filter_block1d16_h2_avg_sse2
; 16 pixels per row; each pixel is paired with its right-hand
; neighbour. The result is averaged with the existing destination.
;-----------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_h2_avg_sse2):
    push      rbp
    mov       rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push      rsi
    push      rdi
    ; prolog done

    HIGH_GET_PARAM
.next_row:
    movdqu    xmm0, [rsi]                   ; pixels x   .. x+7
    movdqu    xmm2, [rsi + 16]              ; pixels x+8 .. x+15
    movdqu    xmm1, [rsi + 2]               ; pixels x+1 .. x+8
    movdqu    xmm3, [rsi + 18]              ; pixels x+9 .. x+16

    HIGH_APPLY_FILTER_16 1
    jnz       .next_row

    ; epilog
    pop       rdi
    pop       rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop       rbp
    ret
%endif