;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT  7


;/************************************************************************************
; Notes: the filter_block1d*_h6 routines apply a 6-tap filter horizontally to the
; input pixels and the filter_block1d*_v6 routines apply it vertically. Every loop
; below produces one output row per iteration, for output_height rows. (An earlier
; version of this note said output_height is assumed to be even; nothing in the
; loops below depends on that.)
;
; Filter selection: arg(5) is a table index; it is scaled by 16 and added to the
; address of k0_k5. The k1_k3 and k2_k4 tables follow k0_k5 at +128 and +256 bytes,
; so one base register addresses all three tap pairs. When the first dword of the
; selected k0_k5 entry is zero, the outer taps are skipped and a 4-tap path is used.
;
; Register preservation: routines that write xmm6/xmm7 bracket their body with
; SAVE_XMM / RESTORE_XMM (macros from x86_abi_support.asm), the same way the
; bilinear predictors in this file do. xmm6-xmm15 are callee-saved under the
; Microsoft x64 calling convention.
; NOTE(review): the macro bodies live in the include file - confirm that SAVE_XMM
; takes no argument in this tree and that RESTORE_XMM undoes its stack adjustment.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/

;-----------------------------------------------------------------------------
;void vp8_filter_block1d8_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
; Horizontal filter, 8 output pixels per row, one row per loop iteration.
; Loop registers: rsi = source row, rdi = destination row, rax = source stride,
; rdx = destination stride, rcx = rows remaining.
; xmm4/xmm5/xmm6 = k0_k5 / k2_k4 / k1_k3 taps, xmm7 = rounding constant (rd).
; Each row reads the bytes [rsi-2 .. rsi+10].
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d8_h6_ssse3)
sym(vp8_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM                            ;xmm6/xmm7 are written below
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0 for the 4-tap test below
    shl         rdx, 4                  ;16 bytes per table entry

    movdqa      xmm7, [GLOBAL(rd)]

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx
    mov         rdi, arg(2)             ;output_ptr

    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> 4-tap path
    je          vp8_filter_block1d8_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx                ;loop advances rdi before it stores
;xmm3 free
filter_block1d8_h6_rowloop_ssse3:
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1,   xmm0
    pmaddubsw   xmm0,   xmm4

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]

    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
    pmaddubsw   xmm1,   xmm5

    lea         rdi,    [rdi + rdx]
    pmaddubsw   xmm2,   xmm6

    lea         rsi,    [rsi + rax]
    dec         rcx                             ;flags consumed by jnz below; nothing
                                                ;in between writes the flags

    paddsw      xmm0,   xmm1
    paddsw      xmm2,   xmm7

    paddsw      xmm0,   xmm2

    psraw       xmm0,   VP8_FILTER_SHIFT

    packuswb    xmm0,   xmm0

    movq        MMWORD Ptr [rdi], xmm0
    jnz         filter_block1d8_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap tail of vp8_filter_block1d8_h6_ssse3.
; On entry: rax = selected k0_k5 entry, rdi = output_ptr, xmm7 = rd.
; The shuffle masks are hoisted into xmm3/xmm4.
vp8_filter_block1d8_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

    mov         rsi, arg(0)             ;src_ptr

    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx                ;loop advances rdi before it stores

filter_block1d8_h4_rowloop_ssse3:
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm2,   xmm0
    pshufb      xmm0,   xmm3

    pshufb      xmm2,   xmm4
    pmaddubsw   xmm0,   xmm5

    lea         rdi,    [rdi + rdx]
    pmaddubsw   xmm2,   xmm6

    lea         rsi,    [rsi + rax]
    dec         rcx                             ;flags consumed by jnz below

    paddsw      xmm0,   xmm7

    paddsw      xmm0,   xmm2

    psraw       xmm0,   VP8_FILTER_SHIFT

    packuswb    xmm0,   xmm0

    movq        MMWORD Ptr [rdi], xmm0

    jnz         filter_block1d8_h4_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_filter_block1d16_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
; Horizontal filter, 16 output pixels per row, computed as two 8-pixel halves
; that are merged and written with one aligned 16-byte store (movdqa), so the
; destination rows must be 16-byte aligned.
; xmm7 is used as scratch in the loop, so the rounding constant is read from
; memory each time. The 4-tap dispatch is commented out; the 6-tap loop is
; always taken.
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d16_h6_ssse3)
sym(vp8_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;only needed by the disabled compare below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    mov         rdi, arg(2)             ;output_ptr

;;
;;    cmp         esi, DWORD PTR [rax]
;;    je          vp8_filter_block1d16_h4_ssse3

    mov         rsi, arg(0)             ;src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch

filter_block1d16_h6_rowloop_ssse3:
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1,   xmm0
    pmaddubsw   xmm0,   xmm4

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]

    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
    movq        xmm3,   MMWORD PTR [rsi +  6]   ;second half: same pattern, 8 pixels on

    pmaddubsw   xmm1,   xmm5
    movq        xmm7,   MMWORD PTR [rsi + 11]

    pmaddubsw   xmm2,   xmm6
    punpcklbw   xmm3,   xmm7

    paddsw      xmm0,   xmm1
    movdqa      xmm1,   xmm3

    pmaddubsw   xmm3,   xmm4
    paddsw      xmm0,   xmm2

    movdqa      xmm2,   xmm1
    paddsw      xmm0,   [GLOBAL(rd)]

    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]

    psraw       xmm0,   VP8_FILTER_SHIFT
    pmaddubsw   xmm1,   xmm5

    pmaddubsw   xmm2,   xmm6
    packuswb    xmm0,   xmm0

    lea         rsi,    [rsi + rax]
    paddsw      xmm3,   xmm1

    paddsw      xmm3,   xmm2

    paddsw      xmm3,   [GLOBAL(rd)]

    psraw       xmm3,   VP8_FILTER_SHIFT

    packuswb    xmm3,   xmm3

    punpcklqdq  xmm0,   xmm3                    ;low 8 pixels | high 8 pixels

    movdqa      XMMWORD Ptr [rdi], xmm0

    lea         rdi,    [rdi + rdx]
    dec         rcx
    jnz         filter_block1d16_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM                         ;balances SAVE_XMM in the prolog
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap tail for the 16-wide horizontal filter. Nothing in this file branches
; here while the dispatch above stays commented out. If it is re-enabled, this
; code expects rax = selected k0_k5 entry and rdi = output_ptr on entry.
vp8_filter_block1d16_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch

filter_block1d16_h4_rowloop_ssse3:
    movdqu      xmm1,   XMMWORD PTR [rsi - 2]

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf2b)]
    pshufb      xmm2,   [GLOBAL(shuf3b)]
    pmaddubsw   xmm1,   xmm5

    movdqu      xmm3,   XMMWORD PTR [rsi + 6]

    pmaddubsw   xmm2,   xmm6
    movdqa      xmm0,   xmm3
    pshufb      xmm3,   [GLOBAL(shuf3b)]
    pshufb      xmm0,   [GLOBAL(shuf2b)]

    paddsw      xmm1,   [GLOBAL(rd)]
    paddsw      xmm1,   xmm2

    pmaddubsw   xmm0,   xmm5
    pmaddubsw   xmm3,   xmm6

    psraw       xmm1,   VP8_FILTER_SHIFT
    packuswb    xmm1,   xmm1
    lea         rsi,    [rsi + rax]
    paddsw      xmm3,   xmm0
    paddsw      xmm3,   [GLOBAL(rd)]
    psraw       xmm3,   VP8_FILTER_SHIFT
    packuswb    xmm3,   xmm3

    punpcklqdq  xmm1,   xmm3

    movdqa      XMMWORD Ptr [rdi], xmm1

    add         rdi,    rdx
    dec         rcx
    jnz         filter_block1d16_h4_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_filter_block1d4_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
; Horizontal filter, 4 output pixels per row (movd store). Each row is fetched
; with one unaligned 16-byte load starting at [rsi-2]; the shuf1b/shuf2b/shuf3b
; masks pair up the bytes for the three pmaddubsw tap pairs.
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d4_h6_ssse3)
sym(vp8_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM                            ;xmm6/xmm7 are written below
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0 for the 4-tap test below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx
    movdqa      xmm7, [GLOBAL(rd)]

    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> 4-tap path
    je          vp8_filter_block1d4_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

;xmm3 free
filter_block1d4_h6_rowloop_ssse3:
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]

    movdqa      xmm1,   xmm0
    pshufb      xmm0,   [GLOBAL(shuf1b)]

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf2b)]
    pmaddubsw   xmm0,   xmm4
    pshufb      xmm2,   [GLOBAL(shuf3b)]
    pmaddubsw   xmm1,   xmm5

;--
    pmaddubsw   xmm2,   xmm6

    lea         rsi,    [rsi + rax]
;--
    paddsw      xmm0,   xmm1
    paddsw      xmm0,   xmm7
    paddsw      xmm0,   xmm2
    psraw       xmm0,   VP8_FILTER_SHIFT
    packuswb    xmm0,   xmm0

    movd        DWORD PTR [rdi], xmm0

    add         rdi,    rdx
    dec         rcx
    jnz         filter_block1d4_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap tail of vp8_filter_block1d4_h6_ssse3.
; On entry: rax = selected k0_k5 entry, xmm7 = rd.
; The shuffle masks are hoisted into xmm0/xmm3.
vp8_filter_block1d4_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

filter_block1d4_h4_rowloop_ssse3:
    movdqu      xmm1,   XMMWORD PTR [rsi - 2]

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   xmm0 ;;[GLOBAL(shuf2b)]
    pshufb      xmm2,   xmm3 ;;[GLOBAL(shuf3b)]
    pmaddubsw   xmm1,   xmm5

;--
    pmaddubsw   xmm2,   xmm6

    lea         rsi,    [rsi + rax]
;--
    paddsw      xmm1,   xmm7
    paddsw      xmm1,   xmm2
    psraw       xmm1,   VP8_FILTER_SHIFT
    packuswb    xmm1,   xmm1

    movd        DWORD PTR [rdi], xmm1

    add         rdi,    rdx
    dec         rcx
    jnz         filter_block1d4_h4_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret



;-----------------------------------------------------------------------------
;void vp8_filter_block1d16_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
; Vertical filter, 16 output pixels per row, done as two 8-pixel halves.
; Rows A..F are read at rsi + {0,1,2,3,4,5} * src_pitch; rax = rsi + src_pitch
; so the odd multiples are reachable with the *2 / *4 scaled addressing modes.
; xmm5/xmm6/xmm7 = k0_k5 / k2_k4 / k1_k3 taps.
; In 64-bit builds out_pitch is kept in r8; in 32-bit builds it is re-read
; from the stack every row.
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d16_v6_ssse3)
sym(vp8_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM                            ;xmm6/xmm7 are written below
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0 for the 4-tap test below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> 4-tap path
    je          vp8_filter_block1d16_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx                ;rax = src_ptr + 1 row


vp8_filter_block1d16_v6_ssse3_loop:
    movq        xmm1, MMWORD PTR [rsi]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F

    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2          ;store the results

    movq        xmm1, MMWORD PTR [rsi + 8]              ;A
    movq        xmm2, MMWORD PTR [rsi + rdx + 8]        ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]    ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]    ;F
    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi+8], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;out_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d16_v6_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap tail of vp8_filter_block1d16_v6_ssse3 (rows B..E only).
; On entry: rax = selected k0_k5 entry.
; Both halves are merged and written with one aligned 16-byte store.
vp8_filter_block1d16_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx                ;rax = src_ptr + 1 row

vp8_filter_block1d16_v4_ssse3_loop:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    movq        xmm5, MMWORD PTR [rsi + rdx + 8]        ;B
    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]    ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ;E

    paddsw      xmm2, [GLOBAL(rd)]
    paddsw      xmm2, xmm3
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    punpcklbw   xmm5, xmm4                  ;B D
    punpcklbw   xmm1, xmm0                  ;C E

    pmaddubsw   xmm1, xmm6
    pmaddubsw   xmm5, xmm7

    movdqa      xmm4, [GLOBAL(rd)]
    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      xmm5, xmm1
    paddsw      xmm5, xmm4
    psraw       xmm5, VP8_FILTER_SHIFT
    packuswb    xmm5, xmm5

    punpcklqdq  xmm2, xmm5

    movdqa      XMMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;out_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d16_v4_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_filter_block1d8_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
; Vertical filter, 8 output pixels per row. Same row addressing as the 16-wide
; version (rax = rsi + src_pitch). rdx, rdi, r8 and rcx are loaded before the
; 4-tap dispatch and are shared by both paths.
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d8_v6_ssse3)
sym(vp8_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM                            ;xmm6/xmm7 are written below
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0 for the 4-tap test below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]

    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> 4-tap path
    je          vp8_filter_block1d8_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx                ;rax = src_ptr + 1 row

vp8_filter_block1d8_v6_ssse3_loop:
    movq        xmm1, MMWORD PTR [rsi]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
    movdqa      xmm4, [GLOBAL(rd)]

    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5
    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d8_v6_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap tail of vp8_filter_block1d8_v6_ssse3.
; On entry: rax = selected k0_k5 entry; rdx, rdi, rcx (and r8 in 64-bit
; builds) already hold src pitch, output_ptr, output_height and out_pitch.
vp8_filter_block1d8_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm5, [GLOBAL(rd)]

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx                ;rax = src_ptr + 1 row

vp8_filter_block1d8_v4_ssse3_loop:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm5
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d8_v4_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_filter_block1d4_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
; Vertical filter, 4 output pixels per row, using the 64-bit MMX registers
; (mm0-mm7) with the SSSE3 pmaddubsw instruction. No xmm register is touched,
; so no SAVE_XMM is needed here.
; NOTE(review): this routine never executes emms; presumably the callers clear
; the MMX/x87 state afterwards - confirm against the call sites.
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d4_v6_ssse3)
sym(vp8_filter_block1d4_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0 for the 4-tap test below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]

    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> 4-tap path
    je          vp8_filter_block1d4_v4_ssse3

    movq        mm5, MMWORD PTR [rax]         ;k0_k5
    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx                ;rax = src_ptr + 1 row

vp8_filter_block1d4_v6_ssse3_loop:
    movd        mm1, DWORD PTR [rsi]                  ;A
    movd        mm2, DWORD PTR [rsi + rdx]            ;B
    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   mm2, mm4                  ;B D
    punpcklbw   mm3, mm0                  ;C E

    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F

    movq        mm4, [GLOBAL(rd)]

    pmaddubsw   mm3, mm6
    punpcklbw   mm1, mm0                  ;A F
    pmaddubsw   mm2, mm7
    pmaddubsw   mm1, mm5
    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      mm2, mm3
    paddsw      mm2, mm1
    paddsw      mm2, mm4
    psraw       mm2, VP8_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d4_v6_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap tail of vp8_filter_block1d4_v6_ssse3.
; On entry: rax = selected k0_k5 entry; rdx, rdi, rcx (and r8 in 64-bit
; builds) are already loaded.
vp8_filter_block1d4_v4_ssse3:
    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
    movq        mm5, MMWORD PTR [GLOBAL(rd)]

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx                ;rax = src_ptr + 1 row

vp8_filter_block1d4_v4_ssse3_loop:
    movd        mm2, DWORD PTR [rsi + rdx]            ;B
    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   mm2, mm4                  ;B D
    punpcklbw   mm3, mm0                  ;C E

    pmaddubsw   mm3, mm6
    pmaddubsw   mm2, mm7
    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      mm2, mm3
    paddsw      mm2, mm5
    psraw       mm2, VP8_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d4_v4_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_bilinear_predict16x16_ssse3
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
; 16x16 bilinear prediction. xoffset / yoffset index vp8_bilinear_filters_ssse3
; (16 bytes per entry). Three paths:
;   both offsets non-zero : horizontal pass then vertical pass, one row per iteration
;   xoffset == 0          : b16x16_sp_only, vertical pass only, two rows per iteration
;   yoffset == 0          : b16x16_fp_only, horizontal pass only, two rows per iteration
; Every path stops when rdi reaches rcx = dst_ptr + 16 * dst_pitch.
; Destination rows are written with movdqa and must be 16-byte aligned.
;-----------------------------------------------------------------------------
global sym(vp8_bilinear_predict16x16_ssse3)
sym(vp8_bilinear_predict16x16_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
    movsxd      rax,        dword ptr arg(2)    ; xoffset

    test        rax,        rax                 ; skip first_pass filter if xoffset=0
    je          b16x16_sp_only

    shl         rax,        4
    lea         rax,        [rax + rcx]         ; HFilter

    mov         rdi,        arg(4)              ; dst_ptr
    mov         rsi,        arg(0)              ; src_ptr
    movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

    movdqa      xmm1,       [rax]

    movsxd      rax,        dword ptr arg(3)    ; yoffset

    test        rax,        rax                 ; skip second_pass filter if yoffset=0
    je          b16x16_fp_only

    shl         rax,        4
    lea         rax,        [rax + rcx]         ; VFilter

    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16 * dst_pitch
    movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line

    movdqa      xmm2,       [rax]

%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(5)    ; dst_pitch
%endif
    ; horizontally filter source row 0; the result is kept in xmm7
    movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
    movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15

    movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16

    lea         rsi,        [rsi + rdx]         ; next line

    pmaddubsw   xmm3,       xmm1                ; filtered pixels 00..07 (words)

    punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
    pmaddubsw   xmm4,       xmm1                ; filtered pixels 08..15 (words)

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
    psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128

    movdqa      xmm7,       xmm3
    packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

.next_row:
    ; horizontally filter the next source row into xmm6
    movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
    movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm6,       xmm5
    movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15

    movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
    lea         rsi,        [rsi + rdx]         ; next line

    pmaddubsw   xmm6,       xmm1

    punpcklbw   xmm4,       xmm5
    pmaddubsw   xmm4,       xmm1

    paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
    psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128

    paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
    psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128

    packuswb    xmm6,       xmm4
    movdqa      xmm5,       xmm7

    ; vertical pass between the previous row (xmm7) and this row (xmm6)
    punpcklbw   xmm5,       xmm6
    pmaddubsw   xmm5,       xmm2

    punpckhbw   xmm7,       xmm6
    pmaddubsw   xmm7,       xmm2

    paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
    psraw       xmm5,       VP8_FILTER_SHIFT    ; xmm5 /= 128

    paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
    psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128

    packuswb    xmm5,       xmm7
    movdqa      xmm7,       xmm6                ; this row is the next "previous" row

    movdqa      [rdi],      xmm5                ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(5)    ; dst_pitch
%else
    add         rdi,        r8
%endif

    cmp         rdi,        rcx
    jne         .next_row

    jmp         done

; xoffset == 0: vertical pass only. On entry rcx = filter table base.
b16x16_sp_only:
    movsxd      rax,        dword ptr arg(3)    ; yoffset
    shl         rax,        4
    lea         rax,        [rax + rcx]         ; VFilter

    mov         rdi,        arg(4)              ; dst_ptr
    mov         rsi,        arg(0)              ; src_ptr
    movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

    movdqa      xmm1,       [rax]               ; VFilter

    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16 * dst_pitch
    movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line

    ; get the first horizontal line done
    movq        xmm4,       [rsi]               ; load row 0
    movq        xmm2,       [rsi + 8]           ; load row 0

    lea         rsi,        [rsi + rax]         ; next line
.next_row:
    movq        xmm3,       [rsi]               ; load row + 1
    movq        xmm5,       [rsi + 8]           ; load row + 1

    punpcklbw   xmm4,       xmm3
    punpcklbw   xmm2,       xmm5

    pmaddubsw   xmm4,       xmm1
    movq        xmm7,       [rsi + rax]         ; load row + 2

    pmaddubsw   xmm2,       xmm1
    movq        xmm6,       [rsi + rax + 8]     ; load row + 2

    punpcklbw   xmm3,       xmm7
    punpcklbw   xmm5,       xmm6

    pmaddubsw   xmm3,       xmm1
    paddw       xmm4,       [GLOBAL(rd)]

    pmaddubsw   xmm5,       xmm1
    paddw       xmm2,       [GLOBAL(rd)]

    psraw       xmm4,       VP8_FILTER_SHIFT
    psraw       xmm2,       VP8_FILTER_SHIFT

    packuswb    xmm4,       xmm2
    paddw       xmm3,       [GLOBAL(rd)]

    movdqa      [rdi],      xmm4                ; store row 0
    paddw       xmm5,       [GLOBAL(rd)]

    psraw       xmm3,       VP8_FILTER_SHIFT
    psraw       xmm5,       VP8_FILTER_SHIFT

    packuswb    xmm3,       xmm5
    movdqa      xmm4,       xmm7                ; row + 2 becomes row 0 of the next pair

    movdqa      [rdi + rdx],xmm3                ; store row 1
    lea         rsi,        [rsi + 2*rax]

    movdqa      xmm2,       xmm6
    lea         rdi,        [rdi + 2*rdx]

    cmp         rdi,        rcx
    jne         .next_row

    jmp         done

; yoffset == 0: horizontal pass only.
; On entry: xmm1 = HFilter, rsi = src_ptr, rdi = dst_ptr, rdx = dst_pitch.
b16x16_fp_only:
    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16 * dst_pitch
    movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line

.next_row:
    movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
    movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm2,       xmm4
    movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15

    pmaddubsw   xmm2,       xmm1
    movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16

    lea         rsi,        [rsi + rax]         ; next line
    punpcklbw   xmm3,       xmm4

    pmaddubsw   xmm3,       xmm1
    movq        xmm5,       [rsi]

    paddw       xmm2,       [GLOBAL(rd)]
    movq        xmm7,       [rsi+1]

    movq        xmm6,       [rsi+8]
    psraw       xmm2,       VP8_FILTER_SHIFT

    punpcklbw   xmm5,       xmm7
    movq        xmm7,       [rsi+9]

    paddw       xmm3,       [GLOBAL(rd)]
    pmaddubsw   xmm5,       xmm1

    psraw       xmm3,       VP8_FILTER_SHIFT
    punpcklbw   xmm6,       xmm7

    packuswb    xmm2,       xmm3
    pmaddubsw   xmm6,       xmm1

    movdqa      [rdi],      xmm2                ; store the results in the destination
    paddw       xmm5,       [GLOBAL(rd)]

    lea         rdi,        [rdi + rdx]         ; dst_pitch
    psraw       xmm5,       VP8_FILTER_SHIFT

    paddw       xmm6,       [GLOBAL(rd)]
    psraw       xmm6,       VP8_FILTER_SHIFT

    packuswb    xmm5,       xmm6
    lea         rsi,        [rsi + rax]         ; next line

    movdqa      [rdi],      xmm5                ; store the results in the destination
    lea         rdi,        [rdi + rdx]         ; dst_pitch

    cmp         rdi,        rcx

    jne         .next_row

done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_bilinear_predict8x8_ssse3
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
; 8x8 bilinear prediction. Nine source rows are first copied (16 bytes each,
; unaligned loads) into a 144-byte, 16-byte-aligned scratch area on the stack.
; The filter passes then read that area through rsp itself, advancing rsp as
; rows are consumed; every path has moved rsp by exactly 144 bytes when it
; reaches done8x8, where "pop rsp" restores the value saved by ALIGN_STACK.
; Paths: both offsets non-zero (two passes), xoffset == 0 (b8x8_sp_only),
; yoffset == 0 (b8x8_fp_only).
;-----------------------------------------------------------------------------
global sym(vp8_bilinear_predict8x8_ssse3)
sym(vp8_bilinear_predict8x8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                         ; reserve 144 bytes (9 rows * 16)

    lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]

    mov         rsi,        arg(0)              ;src_ptr
    movsxd      rdx,        dword ptr arg(1)    ;src_pixels_per_line

    ;Read 9-line unaligned data in and put them on stack. This gives a big
    ;performance boost.
    movdqu      xmm0,       [rsi]
    lea         rax,        [rdx + rdx*2]       ; rax = 3 * src_pixels_per_line
    movdqu      xmm1,       [rsi+rdx]
    movdqu      xmm2,       [rsi+rdx*2]
    add         rsi,        rax
    movdqu      xmm3,       [rsi]
    movdqu      xmm4,       [rsi+rdx]
    movdqu      xmm5,       [rsi+rdx*2]
    add         rsi,        rax
    movdqu      xmm6,       [rsi]
    movdqu      xmm7,       [rsi+rdx]

    movdqa      XMMWORD PTR [rsp],            xmm0

    movdqu      xmm0,       [rsi+rdx*2]

    movdqa      XMMWORD PTR [rsp+16],         xmm1
    movdqa      XMMWORD PTR [rsp+32],         xmm2
    movdqa      XMMWORD PTR [rsp+48],         xmm3
    movdqa      XMMWORD PTR [rsp+64],         xmm4
    movdqa      XMMWORD PTR [rsp+80],         xmm5
    movdqa      XMMWORD PTR [rsp+96],         xmm6
    movdqa      XMMWORD PTR [rsp+112],        xmm7
    movdqa      XMMWORD PTR [rsp+128],        xmm0

    movsxd      rax,        dword ptr arg(2)    ; xoffset
    test        rax,        rax                 ; skip first_pass filter if xoffset=0
    je          b8x8_sp_only

    shl         rax,        4
    add         rax,        rcx                 ; HFilter

    mov         rdi,        arg(4)              ; dst_ptr
    movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

    movdqa      xmm0,       [rax]

    movsxd      rax,        dword ptr arg(3)    ; yoffset
    test        rax,        rax                 ; skip second_pass filter if yoffset=0
    je          b8x8_fp_only

    shl         rax,        4
    lea         rax,        [rax + rcx]         ; VFilter

    lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8 * dst_pitch

    movdqa      xmm1,       [rax]

    ; get the first horizontal line done
    movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    movdqa      xmm5,       xmm3

    psrldq      xmm5,       1                   ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
    lea         rsp,        [rsp + 16]          ; next line

    punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    pmaddubsw   xmm3,       xmm0                ; filtered pixels 00..07 (words)

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    movdqa      xmm7,       xmm3
    packuswb    xmm7,       xmm7                ; 00 01 02 03 04 05 06 07 (repeated in both halves)

.next_row:
    movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    lea         rsp,        [rsp + 16]          ; next line

    movdqa      xmm5,       xmm6

    psrldq      xmm5,       1

    punpcklbw   xmm6,       xmm5
    pmaddubsw   xmm6,       xmm0

    paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
    psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128

    packuswb    xmm6,       xmm6

    ; vertical pass between the previous row (xmm7) and this row (xmm6)
    punpcklbw   xmm7,       xmm6
    pmaddubsw   xmm7,       xmm1

    paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
    psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128

    packuswb    xmm7,       xmm7

    movq        [rdi],      xmm7                ; store the results in the destination
    lea         rdi,        [rdi + rdx]

    movdqa      xmm7,       xmm6                ; this row is the next "previous" row

    cmp         rdi,        rcx
    jne         .next_row

    jmp         done8x8

; xoffset == 0: vertical pass only, fully unrolled over the 9 stacked rows.
; On entry rcx = filter table base.
b8x8_sp_only:
    movsxd      rax,        dword ptr arg(3)    ; yoffset
    shl         rax,        4
    lea         rax,        [rax + rcx]         ; VFilter

    mov         rdi,        arg(4)              ;dst_ptr
    movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

    movdqa      xmm0,       [rax]               ; VFilter

    movq        xmm1,       XMMWORD PTR [rsp]
    movq        xmm2,       XMMWORD PTR [rsp+16]

    movq        xmm3,       XMMWORD PTR [rsp+32]
    punpcklbw   xmm1,       xmm2

    movq        xmm4,       XMMWORD PTR [rsp+48]
    punpcklbw   xmm2,       xmm3

    movq        xmm5,       XMMWORD PTR [rsp+64]
    punpcklbw   xmm3,       xmm4

    movq        xmm6,       XMMWORD PTR [rsp+80]
    punpcklbw   xmm4,       xmm5

    movq        xmm7,       XMMWORD PTR [rsp+96]
    punpcklbw   xmm5,       xmm6

    pmaddubsw   xmm1,       xmm0
    pmaddubsw   xmm2,       xmm0

    pmaddubsw   xmm3,       xmm0
    pmaddubsw   xmm4,       xmm0

    pmaddubsw   xmm5,       xmm0
    punpcklbw   xmm6,       xmm7

    pmaddubsw   xmm6,       xmm0
    paddw       xmm1,       [GLOBAL(rd)]

    paddw       xmm2,       [GLOBAL(rd)]
    psraw       xmm1,       VP8_FILTER_SHIFT

    paddw       xmm3,       [GLOBAL(rd)]
    psraw       xmm2,       VP8_FILTER_SHIFT

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm3,       VP8_FILTER_SHIFT

    paddw       xmm5,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT

    paddw       xmm6,       [GLOBAL(rd)]
    psraw       xmm5,       VP8_FILTER_SHIFT

    psraw       xmm6,       VP8_FILTER_SHIFT
    packuswb    xmm1,       xmm1

    packuswb    xmm2,       xmm2
    movq        [rdi],      xmm1

    packuswb    xmm3,       xmm3
    movq        [rdi+rdx],  xmm2

    packuswb    xmm4,       xmm4
    movq        xmm1,       XMMWORD PTR [rsp+112]

    lea         rdi,        [rdi + 2*rdx]
    movq        xmm2,       XMMWORD PTR [rsp+128]

    packuswb    xmm5,       xmm5
    movq        [rdi],      xmm3

    packuswb    xmm6,       xmm6
    movq        [rdi+rdx],  xmm4

    lea         rdi,        [rdi + 2*rdx]
    punpcklbw   xmm7,       xmm1

    movq        [rdi],      xmm5
    pmaddubsw   xmm7,       xmm0

    movq        [rdi+rdx],  xmm6
    punpcklbw   xmm1,       xmm2

    pmaddubsw   xmm1,       xmm0
    paddw       xmm7,       [GLOBAL(rd)]

    psraw       xmm7,       VP8_FILTER_SHIFT
    paddw       xmm1,       [GLOBAL(rd)]

    psraw       xmm1,       VP8_FILTER_SHIFT
    packuswb    xmm7,       xmm7

    packuswb    xmm1,       xmm1
    lea         rdi,        [rdi + 2*rdx]

    movq        [rdi],      xmm7

    movq        [rdi+rdx],  xmm1
    lea         rsp,        [rsp + 144]         ; release the scratch area

    jmp         done8x8

; yoffset == 0: horizontal pass only, four rows per iteration (two iterations).
; On entry: xmm0 = HFilter, rdi = dst_ptr, rdx = dst_pitch.
b8x8_fp_only:
    lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8 * dst_pitch

.next_row:
    movdqa      xmm1,       XMMWORD PTR [rsp]
    movdqa      xmm3,       XMMWORD PTR [rsp+16]

    movdqa      xmm2,       xmm1
    movdqa      xmm5,       XMMWORD PTR [rsp+32]

    psrldq      xmm2,       1
    movdqa      xmm7,       XMMWORD PTR [rsp+48]

    movdqa      xmm4,       xmm3
    psrldq      xmm4,       1

    movdqa      xmm6,       xmm5
    psrldq      xmm6,       1

    punpcklbw   xmm1,       xmm2
    pmaddubsw   xmm1,       xmm0

    punpcklbw   xmm3,       xmm4
    pmaddubsw   xmm3,       xmm0

    punpcklbw   xmm5,       xmm6
    pmaddubsw   xmm5,       xmm0

    movdqa      xmm2,       xmm7
    psrldq      xmm2,       1

    punpcklbw   xmm7,       xmm2
    pmaddubsw   xmm7,       xmm0

    paddw       xmm1,       [GLOBAL(rd)]
    psraw       xmm1,       VP8_FILTER_SHIFT

    paddw       xmm3,       [GLOBAL(rd)]
    psraw       xmm3,       VP8_FILTER_SHIFT

    paddw       xmm5,       [GLOBAL(rd)]
    psraw       xmm5,       VP8_FILTER_SHIFT

    paddw       xmm7,       [GLOBAL(rd)]
    psraw       xmm7,       VP8_FILTER_SHIFT

    packuswb    xmm1,       xmm1
    packuswb    xmm3,       xmm3

    packuswb    xmm5,       xmm5
    movq        [rdi],      xmm1

    packuswb    xmm7,       xmm7
    movq        [rdi+rdx],  xmm3

    lea         rdi,        [rdi + 2*rdx]
    movq        [rdi],      xmm5

    lea         rsp,        [rsp + 4*16]        ; four rows consumed
    movq        [rdi+rdx],  xmm7

    lea         rdi,        [rdi + 2*rdx]
    cmp         rdi,        rcx

    jne         .next_row

    lea         rsp,        [rsp + 16]          ; skip the unused ninth row (128 + 16 = 144)

done8x8:
    ;add rsp, 144
    pop         rsp                             ; restore rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; pshufb masks for rows loaded with a single 16-byte read starting at pixel -2
; (byte n of the register holds pixel n-2).
align 16
shuf1b:
    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
shuf2b:
    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
shuf3b:
    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10

; pshufb masks applied to the byte-interleaved register produced by
; punpcklbw of the [rsi-2] and [rsi+3] loads.
align 16
shuf2bfrom1:
    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
align 16
shuf3bfrom1:
    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11

; rounding constant: 0x40 in each of the 8 words, added before the
; arithmetic right shift by VP8_FILTER_SHIFT
align 16
rd:
    times 8 dw 0x40

; Six-tap coefficient tables: 8 entries of 16 bytes each, so k1_k3 sits at
; k0_k5 + 128 and k2_k4 at k0_k5 + 256. The code relies on these offsets;
; keep the three tables contiguous and in this order.
align 16
k0_k5:
    times 8 db 0, 0             ;placeholder
    times 8 db 0, 0
    times 8 db 2, 1
    times 8 db 0, 0
    times 8 db 3, 3
    times 8 db 0, 0
    times 8 db 1, 2
    times 8 db 0, 0
k1_k3:
    times 8 db  0,    0         ;placeholder
    times 8 db  -6,  12
    times 8 db -11,  36
    times 8 db  -9,  50
    times 8 db -16,  77
    times 8 db  -6,  93
    times 8 db  -8, 108
    times 8 db  -1, 123
k2_k4:
    times 8 db 128,    0        ;placeholder
    times 8 db 123,   -1
    times 8 db 108,   -8
    times 8 db  93,   -6
    times 8 db  77,  -16
    times 8 db  50,   -9
    times 8 db  36,  -11
    times 8 db  12,   -6

; Bilinear tap pairs, indexed by xoffset / yoffset (16 bytes per entry).
align 16
vp8_bilinear_filters_ssse3:
    times 8 db 128, 0
    times 8 db 112, 16
    times 8 db 96,  32
    times 8 db 80,  48
    times 8 db 64,  64
    times 8 db 48,  80
    times 8 db 32,  96
    times 8 db 16,  112
