;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;/************************************************************************************
; Notes: 8-tap sub-pixel filters (SSSE3) for 4, 8 and 16 pixel wide blocks.
;
; Every exported function has the same C prototype:
;
;   void fn(unsigned char *src_ptr, unsigned int src_pitch,
;           unsigned char *output_ptr, unsigned int out_pitch,
;           unsigned int output_height, short *filter);
;
; * "filter" points at eight 16-bit taps and is loaded with movdqa, so it must
;   be 16-byte aligned. The taps are narrowed to signed bytes (packsswb) and
;   consumed two at a time by pmaddubsw.
; * Each output pixel is (sum of 8 products + 64) >> 7, saturated to 8 bits.
; * The *_v8_* functions filter vertically: output row r is computed from
;   source rows r .. r+7, starting at src_ptr.
; * The *_h8_* functions filter horizontally: output pixel x is computed from
;   source bytes x-3 .. x+4 of the same row.
; * The *_avg_* variants average the filtered pixels with the bytes already
;   stored at output_ptr (pavgb) before writing them back.
; * The vertical functions and the 16-wide horizontal function use a
;   dec/jnz loop whose body always runs at least once, so output_height must
;   be non-zero for them.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/

; Two 16-bit words of 64: the rounding term added before the ">> 7".
%define ROUNDING_PAIR       0x00400040

; Stack locals shared by every routine that keeps its coefficients in memory.
; They are addressable once ALLOC_FILTER_LOCALS has run (rsp is then 16-byte
; aligned, as required by the movdqa/pmaddubsw memory operands).
%define FILTER_LOCALS_SIZE  (16*5)
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd  [rsp + 16*4]

;------------------------------------------------------------------------------
; ALLOC_FILTER_LOCALS / FREE_FILTER_LOCALS
;   Reserve / release the aligned k0k1..krd area. ALIGN_STACK saves the
;   unaligned rsp on the stack, which "pop rsp" restores. Clobbers rax.
;------------------------------------------------------------------------------
%macro ALLOC_FILTER_LOCALS 0
    ALIGN_STACK 16, rax
    sub         rsp, FILTER_LOCALS_SIZE
%endm

%macro FREE_FILTER_LOCALS 0
    add         rsp, FILTER_LOCALS_SIZE
    pop         rsp
%endm

;------------------------------------------------------------------------------
; LOAD_FILTER_LOCALS
;   Common setup for all routines that use the stack locals.
;   Out:   rsi = src_ptr, rdi = output_ptr
;          k0k1..k6k7 = the byte pair (k[2n], k[2n+1]) repeated 8 times
;          krd        = 64 in every 16-bit lane
;   Clobb: rcx, rdx, xmm0-xmm5
;------------------------------------------------------------------------------
%macro LOAD_FILTER_LOCALS 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, ROUNDING_PAIR

    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx                   ;only the low dword is used below
    packsswb    xmm4, xmm4
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5
%endm

;------------------------------------------------------------------------------
; VERT_SETUP
;   Register setup shared by VERTx4 / VERTx8 / VERTx16.
;   Out:   rsi = source row A          rax = source row B (rsi + pitch)
;          rdx = src pitch             rbx = src pitch * 6
;          rcx = output_height         rdi = output_ptr
;          r8  = out_pitch (64-bit builds only; 32-bit re-reads arg(3))
;------------------------------------------------------------------------------
%macro VERT_SETUP 0
    LOAD_FILTER_LOCALS

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6
%endm

;------------------------------------------------------------------------------
; VERT_TAPS load_insn, byte_offset
;   Filters one output row segment vertically.
;   %1 = movd (4 pixels) or movq (8 pixels), %2 = byte offset inside the row.
;   Reads rows A..H at rsi/rax + n*pitch, leaves the packed 8-bit result in
;   the low bytes of xmm0. Clobbers xmm0-xmm7.
;   The paddsw order (GH, CD, EF, rounding) is significant because the adds
;   saturate.
;------------------------------------------------------------------------------
%macro VERT_TAPS 2
    %1          xmm0, [rsi + %2]                ;A
    %1          xmm1, [rsi + rdx + %2]          ;B
    %1          xmm2, [rsi + rdx * 2 + %2]      ;C
    %1          xmm3, [rax + rdx * 2 + %2]      ;D
    %1          xmm4, [rsi + rdx * 4 + %2]      ;E
    %1          xmm5, [rax + rdx * 4 + %2]      ;F

    punpcklbw   xmm0, xmm1                      ;A B
    punpcklbw   xmm2, xmm3                      ;C D
    punpcklbw   xmm4, xmm5                      ;E F

    %1          xmm6, [rsi + rbx + %2]          ;G
    %1          xmm7, [rax + rbx + %2]          ;H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                      ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm4
    paddsw      xmm0, krd

    psraw       xmm0, 7
    packuswb    xmm0, xmm0
%endm

;------------------------------------------------------------------------------
; VERT_NEXT_DST_ROW
;   rdi += out_pitch.
;------------------------------------------------------------------------------
%macro VERT_NEXT_DST_ROW 0
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
%endm

;------------------------------------------------------------------------------
; VERTx4 avg
;   Vertical 8-tap filter, 4 pixels per row, one row per iteration.
;   %1 = 1 to average the result with the existing destination bytes.
;------------------------------------------------------------------------------
%macro VERTx4 1
    VERT_SETUP

.loop:
    VERT_TAPS   movd, 0

    add         rsi,  rdx
    add         rax,  rdx
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0

    VERT_NEXT_DST_ROW
    dec         rcx
    jnz         .loop
%endm

;------------------------------------------------------------------------------
; VERTx8 avg
;   Vertical 8-tap filter, 8 pixels per row, one row per iteration.
;   %1 = 1 to average the result with the existing destination bytes.
;------------------------------------------------------------------------------
%macro VERTx8 1
    VERT_SETUP

.loop:
    VERT_TAPS   movq, 0

    add         rsi,  rdx
    add         rax,  rdx
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

    VERT_NEXT_DST_ROW
    dec         rcx
    jnz         .loop
%endm

;------------------------------------------------------------------------------
; VERTx16 avg
;   Vertical 8-tap filter, 16 pixels per row, processed as two 8-pixel halves.
;   The left half is stored before the right half is loaded.
;   %1 = 1 to average the result with the existing destination bytes.
;------------------------------------------------------------------------------
%macro VERTx16 1
    VERT_SETUP

.loop:
    VERT_TAPS   movq, 0                     ;left 8 pixels
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

    VERT_TAPS   movq, 8                     ;right 8 pixels

    add         rsi,  rdx
    add         rax,  rdx
%if %1
    movq        xmm1, [rdi+8]
    pavgb       xmm0, xmm1
%endif

    movq        [rdi+8], xmm0

    VERT_NEXT_DST_ROW
    dec         rcx
    jnz         .loop
%endm

;------------------------------------------------------------------------------
; VERT_PROLOG / VERT_EPILOG
;   Frame setup and teardown of the vertical functions. rbx is saved because
;   VERT_SETUP uses it for pitch * 6. The caller of these macros emits "ret".
;------------------------------------------------------------------------------
%macro VERT_PROLOG 0
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALLOC_FILTER_LOCALS
%endm

%macro VERT_EPILOG 0
    FREE_FILTER_LOCALS
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
%endm

;void vp9_filter_block1d4_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_ssse3):
    VERT_PROLOG
    VERTx4 0
    VERT_EPILOG
    ret

;void vp9_filter_block1d8_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_ssse3):
    VERT_PROLOG
    VERTx8 0
    VERT_EPILOG
    ret

;void vp9_filter_block1d16_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_ssse3):
    VERT_PROLOG
    VERTx16 0
    VERT_EPILOG
    ret

;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; Averaging variants of the vertical filters (same prototype as above).

global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_avg_ssse3):
    VERT_PROLOG
    VERTx4 1
    VERT_EPILOG
    ret

global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_avg_ssse3):
    VERT_PROLOG
    VERTx8 1
    VERT_EPILOG
    ret

global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_avg_ssse3):
    VERT_PROLOG
    VERTx16 1
    VERT_EPILOG
    ret

;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

;------------------------------------------------------------------------------
; HORIZx4_ROW src, tmp
;   Filters 4 pixels of one row horizontally.
;   In:    %1 = 16 source bytes starting 3 bytes left of the first pixel
;          xmm6 = k0k1 (low qword) / k4k5 (high qword)
;          xmm7 = k2k3 (low qword) / k6k7 (high qword)
;          xmm5 = rounding words
;   Out:   low 4 bytes of %1 = filtered pixels.  Clobbers %2.
;------------------------------------------------------------------------------
%macro HORIZx4_ROW 2
    movdqa      %2,   %1
    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   %1,   xmm6
    pmaddubsw   %2,   xmm7

    paddsw      %1,   %2
    movdqa      %2,   %1
    psrldq      %2,   8                     ;fold the k4..k7 half onto k0..k3
    paddsw      %1,   %2
    paddsw      %1,   xmm5
    psraw       %1,   7
    packuswb    %1,   %1
%endm

;------------------------------------------------------------------------------
; HORIZx4 avg
;   Horizontal 8-tap filter, 4 pixels per row, two rows per iteration plus a
;   single-row tail when output_height is odd.
;   %1 = 1 to average the result with the existing destination bytes.
;   Coefficients stay in xmm6/xmm7, so no stack locals are needed.
;------------------------------------------------------------------------------
%macro HORIZx4 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, ROUNDING_PAIR

    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx                   ;only the low dword is used below
    packsswb    xmm4, xmm4
    pshuflw     xmm6, xmm4, 0b              ;k0_k1
    pshufhw     xmm6, xmm6, 10101010b       ;k0_k1_k4_k5
    pshuflw     xmm7, xmm4, 01010101b       ;k2_k3
    pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
    pshufd      xmm5, xmm5, 0               ;rounding

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;number of row pairs
    jz          .last_row                   ;height < 2: the dec/jnz loop
                                            ;would wrap if entered with 0
.loop:
    ;Do two rows once
    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm1, [rsi + 5]
    movq        xmm2, [rsi + rax - 3]
    movq        xmm3, [rsi + rax + 5]
    punpcklqdq  xmm0, xmm1
    punpcklqdq  xmm2, xmm3

    HORIZx4_ROW xmm0, xmm1
    HORIZx4_ROW xmm2, xmm3
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
    movd        xmm3, [rdi + rdx]
    pavgb       xmm2, xmm3
%endif
    movd        [rdi], xmm0
    movd        [rdi + rdx], xmm2

    lea         rsi, [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]

    dec         rcx
    jnz         .loop

.last_row:
    ; Do last row if output_height is odd
    movsxd      rcx, dword ptr arg(4)       ;output_height
    and         rcx, 1
    je          .done

    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm1, [rsi + 5]
    punpcklqdq  xmm0, xmm1

    HORIZx4_ROW xmm0, xmm1
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
.done:
%endm

;------------------------------------------------------------------------------
; HORIZx8_ROW src, tmp1, tmp2, tmp3
;   Filters 8 pixels of one row horizontally using the k0k1..krd stack locals.
;   In:    %1 = 16 source bytes starting 3 bytes left of the first pixel
;   Out:   low 8 bytes of %1 = filtered pixels.  Clobbers %2, %3, %4.
;   The paddsw order is significant because the adds saturate.
;------------------------------------------------------------------------------
%macro HORIZx8_ROW 4
    movdqa      %2,   %1
    movdqa      %3,   %1
    movdqa      %4,   %1

    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pshufb      %3,   [GLOBAL(shuf_t4t5)]
    pshufb      %4,   [GLOBAL(shuf_t6t7)]

    pmaddubsw   %1,   k0k1
    pmaddubsw   %2,   k2k3
    pmaddubsw   %3,   k4k5
    pmaddubsw   %4,   k6k7

    paddsw      %1,   %2
    paddsw      %1,   %4
    paddsw      %1,   %3
    paddsw      %1,   krd
    psraw       %1,   7
    packuswb    %1,   %1
%endm

;------------------------------------------------------------------------------
; HORIZx8 avg
;   Horizontal 8-tap filter, 8 pixels per row, two rows per iteration plus a
;   single-row tail when output_height is odd.
;   %1 = 1 to average the result with the existing destination bytes.
;------------------------------------------------------------------------------
%macro HORIZx8 1
    LOAD_FILTER_LOCALS

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;number of row pairs
    jz          .last_row                   ;height < 2: the dec/jnz loop
                                            ;would wrap if entered with 0
.loop:
    movq        xmm0,   [rsi - 3]           ;load src
    movq        xmm3,   [rsi + 5]
    movq        xmm4,   [rsi + rax - 3]
    movq        xmm7,   [rsi + rax + 5]
    punpcklqdq  xmm0,   xmm3
    punpcklqdq  xmm4,   xmm7

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
    HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
%if %1
    movq        xmm1,   [rdi]
    movq        xmm2,   [rdi + rdx]
    pavgb       xmm0,   xmm1
    pavgb       xmm4,   xmm2
%endif
    movq        [rdi],  xmm0
    movq        [rdi + rdx],  xmm4

    lea         rsi,    [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi,    [rsi + rax]
    lea         rdi,    [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]
    dec         rcx
    jnz         .loop

.last_row:
    ;Do last row if output_height is odd
    movsxd      rcx,    dword ptr arg(4)    ;output_height
    and         rcx,    1
    je          .done

    movq        xmm0,   [rsi - 3]
    movq        xmm3,   [rsi + 5]
    punpcklqdq  xmm0,   xmm3

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
%if %1
    movq        xmm1,   [rdi]
    pavgb       xmm0,   xmm1
%endif
    movq        [rdi],  xmm0
.done:
%endm

;------------------------------------------------------------------------------
; HORIZx16 avg
;   Horizontal 8-tap filter, 16 pixels per row, one row per iteration.
;   %1 = 1 to average the result with the existing destination bytes.
;   The destination is accessed with movdqa, so every output row must be
;   16-byte aligned.
;------------------------------------------------------------------------------
%macro HORIZx16 1
    LOAD_FILTER_LOCALS

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height

.loop:
    prefetcht0  [rsi + 2 * rax -3]

    movq        xmm0,   [rsi - 3]           ;load src data
    movq        xmm4,   [rsi + 5]
    movq        xmm7,   [rsi + 13]
    punpcklqdq  xmm0,   xmm4                ;bytes -3..12: left 8 pixels
    punpcklqdq  xmm4,   xmm7                ;bytes  5..20: right 8 pixels

    movdqa      xmm1,   xmm0
    movdqa      xmm2,   xmm0
    movdqa      xmm3,   xmm0
    movdqa      xmm5,   xmm4
    movdqa      xmm6,   xmm4
    movdqa      xmm7,   xmm4

    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
    pshufb      xmm3,   [GLOBAL(shuf_t6t7)]
    pshufb      xmm4,   [GLOBAL(shuf_t0t1)]
    pshufb      xmm5,   [GLOBAL(shuf_t2t3)]
    pshufb      xmm6,   [GLOBAL(shuf_t4t5)]
    pshufb      xmm7,   [GLOBAL(shuf_t6t7)]

    pmaddubsw   xmm0,   k0k1
    pmaddubsw   xmm1,   k2k3
    pmaddubsw   xmm2,   k4k5
    pmaddubsw   xmm3,   k6k7
    pmaddubsw   xmm4,   k0k1
    pmaddubsw   xmm5,   k2k3
    pmaddubsw   xmm6,   k4k5
    pmaddubsw   xmm7,   k6k7

    paddsw      xmm0,   xmm1
    paddsw      xmm0,   xmm3
    paddsw      xmm0,   xmm2
    paddsw      xmm4,   xmm5
    paddsw      xmm4,   xmm7
    paddsw      xmm4,   xmm6

    paddsw      xmm0,   krd
    paddsw      xmm4,   krd
    psraw       xmm0,   7
    psraw       xmm4,   7
    packuswb    xmm0,   xmm0
    packuswb    xmm4,   xmm4
    punpcklqdq  xmm0,   xmm4                ;left | right -> 16 pixels
%if %1
    movdqa      xmm1,   [rdi]
    pavgb       xmm0,   xmm1
%endif

    lea         rsi,    [rsi + rax]
    movdqa      [rdi],  xmm0

    lea         rdi,    [rdi + rdx]
    dec         rcx
    jnz         .loop
%endm

;------------------------------------------------------------------------------
; HORIZ_PROLOG / HORIZ_EPILOG
;   Frame setup and teardown of the horizontal functions. GET_GOT rbx makes
;   the GLOBAL() shuffle-table references usable. The caller of these macros
;   emits "ret".
;------------------------------------------------------------------------------
%macro HORIZ_PROLOG 0
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
%endm

%macro HORIZ_EPILOG 0
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
%endm

;void vp9_filter_block1d4_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_ssse3):
    HORIZ_PROLOG

    HORIZx4 0

    HORIZ_EPILOG
    ret

;void vp9_filter_block1d8_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_ssse3):
    HORIZ_PROLOG
    ALLOC_FILTER_LOCALS

    HORIZx8 0

    FREE_FILTER_LOCALS
    HORIZ_EPILOG
    ret

;void vp9_filter_block1d16_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_ssse3):
    HORIZ_PROLOG
    ALLOC_FILTER_LOCALS

    HORIZx16 0

    FREE_FILTER_LOCALS
    HORIZ_EPILOG
    ret

; Averaging variants of the horizontal filters (same prototype as above).

global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_avg_ssse3):
    HORIZ_PROLOG

    HORIZx4 1

    HORIZ_EPILOG
    ret

global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_avg_ssse3):
    HORIZ_PROLOG
    ALLOC_FILTER_LOCALS

    HORIZx8 1

    FREE_FILTER_LOCALS
    HORIZ_EPILOG
    ret

global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_avg_ssse3):
    HORIZ_PROLOG
    ALLOC_FILTER_LOCALS

    HORIZx16 1

    FREE_FILTER_LOCALS
    HORIZ_EPILOG
    ret

; pshufb masks: shuf_tNtM pairs source byte (x + N) with byte (x + M) for
; x = 0..7, matching the byte-pair layout of the kNkM coefficients.
SECTION_RODATA
align 16
shuf_t0t1:
    db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
align 16
shuf_t2t3:
    db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
align 16
shuf_t4t5:
    db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
align 16
shuf_t6t7:
    db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14