;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Right shift applied after each bilinear filter pass (the taps in
; vp8_bilinear_filters_sse2 sum to 128 = 1 << 7).
%define xmm_filter_shift            7

;unsigned int vp8_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Returns the sum of the squares of the 256 signed 16-bit values at src_ptr
; (8 iterations x 64 bytes).  src_ptr is read with movdqa and must therefore
; be 16-byte aligned.
global sym(vp8_get_mb_ss_sse2)
sym(vp8_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog


    mov         rax, arg(0)             ;[src_ptr]
    mov         rcx, 8                  ; 8 iterations x 32 values
    pxor        xmm4, xmm4              ; four 32-bit partial sums

NEXTROW:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]
    pmaddwd     xmm0, xmm0              ; square, add adjacent pairs
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40
    dec         rcx
    ; dec does not write CF, so branch on ZF only (a "ja" here would also
    ; depend on the carry left behind by the pointer add above).
    jnz         NEXTROW

    ; fold the four partial sums into the low dword
    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3
    movq        rax, xmm4               ; result is the low 32 bits


    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Over 16 rows of 16 bytes, stores sum(src - ref) to *Sum and
; sum((src - ref)^2) to *SSE.
;   xmm0 = zero (byte -> word unpack), xmm7 = eight 16-bit difference sums,
;   xmm6 = four 32-bit squared-difference sums.
; NOTE(review): rax holds the Sum pointer on return, not a computed value,
; despite the "unsigned int" in the prototype above.
global sym(vp8_get16x16var_sse2)
sym(vp8_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM                            ; xmm6/xmm7 are clobbered below
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    ; Prefetch the first 8 rows of the source ...
    lea         rcx, [rax+rax*2]        ; 3 * source_stride
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx, [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    ; ... and of the reference
    lea         rcx, [rdx+rdx*2]        ; 3 * recon_stride
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx, [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs

    pxor        xmm6, xmm6              ; clear xmm6 for accumulating sse
    mov         rcx, 16                 ; row counter

var16loop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]

    prefetcht0  [rsi+rax*8]             ; stay 8 rows ahead
    prefetcht0  [rdi+rdx*8]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2

    punpcklbw   xmm1, xmm0              ; src bytes 0..7  -> words
    punpckhbw   xmm3, xmm0              ; src bytes 8..15 -> words

    punpcklbw   xmm2, xmm0              ; ref bytes 0..7  -> words
    punpckhbw   xmm4, xmm0              ; ref bytes 8..15 -> words

    psubw       xmm1, xmm2
    psubw       xmm3, xmm4

    paddw       xmm7, xmm1
    pmaddwd     xmm1, xmm1

    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         var16loop


    movdqa      xmm1, xmm6              ; xmm1 = sse partials
    pxor        xmm6, xmm6

    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7              ; diff words -> upper half of dwords

    punpckhwd   xmm5, xmm7
    psrad       xmm5, 16                ; arithmetic shift sign-extends

    psrad       xmm6, 16
    paddd       xmm6, xmm5              ; four 32-bit diff partials

    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddd       xmm7, xmm6              ; low dword = Sum
    paddd       xmm1, xmm2              ; low dword = SSE

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movd        DWORD PTR [rax], xmm7
    movd        DWORD PTR [rdi], xmm1


    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_get16x16pred_error_sse2
;(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride
;)
;
; Over 16 rows of 16 bytes, returns SSE - ((Sum * Sum) >> 8) where
; Sum = sum(src - ref) and SSE = sum((src - ref)^2).
;   xmm0 = zero, xmm7 = 16-bit difference sums, xmm6 = 32-bit sse sums.
; Sum and SSE are passed through the 16 bytes of stack reserved in the prolog.
global sym(vp8_get16x16pred_error_sse2)
sym(vp8_get16x16pred_error_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM                            ; xmm6/xmm7 are clobbered below
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[src_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[ref_stride]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs

    pxor        xmm6, xmm6              ; clear xmm6 for accumulating sse
    mov         rcx, 16                 ; row counter

var16peloop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2

    punpcklbw   xmm1, xmm0
    punpckhbw   xmm3, xmm0

    punpcklbw   xmm2, xmm0
    punpckhbw   xmm4, xmm0

    psubw       xmm1, xmm2
    psubw       xmm3, xmm4

    paddw       xmm7, xmm1
    pmaddwd     xmm1, xmm1

    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         var16peloop


    movdqa      xmm1, xmm6              ; xmm1 = sse partials
    pxor        xmm6, xmm6

    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7              ; diff words -> upper half of dwords

    punpckhwd   xmm5, xmm7
    psrad       xmm5, 16                ; arithmetic shift sign-extends

    psrad       xmm6, 16
    paddd       xmm6, xmm5

    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddd       xmm7, xmm6
    paddd       xmm1, xmm2

    movd        DWORD PTR [rsp], xmm7   ;Sum
    movd        DWORD PTR [rsp+4], xmm1 ;SSE

    ; return (SSE-((Sum*Sum)>>8));
    movsxd      rdx, dword ptr [rsp]
    imul        rdx, rdx
    sar         rdx, 8
    movsxd      rax, dword ptr [rsp + 4]
    sub         rax, rdx

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret



;unsigned int vp8_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Over 8 rows of 8 bytes (fully unrolled), stores sum(src - ref) to *Sum and
; sum((src - ref)^2) to *SSE.
;   xmm0 = zero, xmm7 = eight 16-bit difference sums,
;   xmm1 = four 32-bit squared-difference sums.
; Rows are addressed as base+stride / base+2*stride, then both bases advance
; by two rows.
global sym(vp8_get8x8var_sse2)
sym(vp8_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM                            ; xmm6/xmm7 are clobbered below
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs

    ; row 0
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rdi]

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    psubsw      xmm1, xmm2
    paddw       xmm7, xmm1

    pmaddwd     xmm1, xmm1

    ; row 1
    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 2
    movq        xmm2, QWORD PTR [rsi + rax * 2]
    movq        xmm3, QWORD PTR [rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 3
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]
    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 4
    movq        xmm2, QWORD PTR [rsi + rax * 2]
    movq        xmm3, QWORD PTR [rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 5
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 6
    movq        xmm2, QWORD PTR [rsi + rax * 2]
    movq        xmm3, QWORD PTR [rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 7
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; Horizontal reduction.  The difference sum is kept in 16-bit arithmetic
    ; throughout (paddw) and sign-extended once at the very end.
    movdqa      xmm6, xmm7
    punpcklwd   xmm6, xmm0

    punpckhwd   xmm7, xmm0
    movdqa      xmm2, xmm1

    paddw       xmm6, xmm7
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddw       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddw       xmm7, xmm6              ; low word = Sum
    paddd       xmm1, xmm2              ; low dword = SSE

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movq        rdx, xmm7
    movsx       rcx, dx                 ; sign-extend the 16-bit Sum

    mov         dword ptr [rax], ecx
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block2d_bil_var_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int  xoffset,
;    int  yoffset,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Bilinearly filters an 8-pixel-wide, Height-row block of ref_ptr and
; accumulates the difference against src_ptr:
;   *sum        = sum(filtered_ref - src)
;   *sumsquared = sum((filtered_ref - src)^2)
; xoffset / yoffset index 32-byte rows of vp8_bilinear_filters_sse2; a zero
; offset skips that filter pass, giving four code paths (both passes,
; vertical only, horizontal only, full pixel).
;   xmm0 = zero, xmm4 = rounding constant, xmm6 = eight 16-bit difference
;   sums, xmm7 = four 32-bit squared-difference sums.
; The horizontal passes read 9 bytes per row ([rsi] and [rsi+1]); the
; vertical passes read Height+1 rows of ref_ptr.
global sym(vp8_filter_block2d_bil_var_sse2)
sym(vp8_filter_block2d_bil_var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    pxor        xmm6, xmm6              ; difference accumulator
    pxor        xmm7, xmm7              ; sse accumulator

    lea         rsi, [GLOBAL(xmm_bi_rd)] ; rounding
    movdqa      xmm4, XMMWORD PTR [rsi]

    lea         rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
    movsxd      rax, dword ptr arg(5)   ; xoffset

    test        rax, rax                ; skip first_pass filter if xoffset=0
    je          filter_block2d_bil_var_sse2_sp_only

    shl         rax, 5                  ; point to filter coeff with xoffset
    lea         rax, [rax + rcx]        ; HFilter

    movsxd      rdx, dword ptr arg(6)   ; yoffset

    test        rdx, rdx                ; skip second_pass filter if yoffset=0
    je          filter_block2d_bil_var_sse2_fp_only

    shl         rdx, 5
    lea         rdx, [rdx + rcx]        ; VFilter

    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height

    ; horizontally filter row 0; it seeds the vertical pass
    pxor        xmm0, xmm0
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift
    movdqa      xmm5, xmm1              ; xmm5 = previous filtered row

    ; rbx is free from here on: no GLOBAL() reference follows
    movsxd      rbx, dword ptr arg(1)   ;ref_pixels_per_line
    lea         rsi, [rsi + rbx]
%if ABI_IS_32BIT=0
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

filter_block2d_bil_var_sse2_loop:
    ; horizontal pass on the current row
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movdqa      xmm3, xmm5              ; previous row
    movdqa      xmm5, xmm1              ; keep current row for next iteration

    ; vertical pass across previous and current rows
    pmullw      xmm3, [rdx]
    pmullw      xmm1, [rdx+16]
    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rbx]        ;ref_pixels_per_line
%if ABI_IS_32BIT
    ; no spare register on 32-bit: add the stride straight from the stack
    add         rdi, dword ptr arg(3)   ;src_pixels_per_line
%else
    lea         rdi, [rdi + r9]
%endif

    sub         rcx, 1
    jnz         filter_block2d_bil_var_sse2_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_var_sse2_sp_only:
    ; xoffset == 0: vertical (second pass) filter only
    movsxd      rdx, dword ptr arg(6)   ; yoffset

    test        rdx, rdx                ; skip all if both xoffset=0 and yoffset=0
    je          filter_block2d_bil_var_sse2_full_pixel

    shl         rdx, 5
    lea         rdx, [rdx + rcx]        ; VFilter

    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line

    pxor        xmm0, xmm0
    movq        xmm1, QWORD PTR [rsi]
    punpcklbw   xmm1, xmm0              ; xmm1 = previous row as words

    movsxd      rbx, dword ptr arg(3)   ;src_pixels_per_line
    lea         rsi, [rsi + rax]

filter_block2d_bil_sp_only_loop:
    movq        xmm3, QWORD PTR [rsi]
    punpcklbw   xmm3, xmm0
    movdqa      xmm5, xmm3              ; keep current row for next iteration

    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]
    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    movdqa      xmm1, xmm5
    lea         rsi, [rsi + rax]        ;ref_pixels_per_line
    lea         rdi, [rdi + rbx]        ;src_pixels_per_line

    sub         rcx, 1
    jnz         filter_block2d_bil_sp_only_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_var_sse2_full_pixel:
    ; xoffset == 0 and yoffset == 0: plain difference, no filtering
    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rbx, dword ptr arg(3)   ;src_pixels_per_line
    pxor        xmm0, xmm0

filter_block2d_bil_full_pixel_loop:
    movq        xmm1, QWORD PTR [rsi]
    punpcklbw   xmm1, xmm0

    movq        xmm2, QWORD PTR [rdi]
    punpcklbw   xmm2, xmm0

    psubw       xmm1, xmm2
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rax]        ;ref_pixels_per_line
    lea         rdi, [rdi + rbx]        ;src_pixels_per_line

    sub         rcx, 1
    jnz         filter_block2d_bil_full_pixel_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_var_sse2_fp_only:
    ; yoffset == 0: horizontal (first pass) filter only; rax = HFilter
    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rdx, dword ptr arg(1)   ;ref_pixels_per_line

    pxor        xmm0, xmm0
    movsxd      rbx, dword ptr arg(3)   ;src_pixels_per_line

filter_block2d_bil_fp_only_loop:
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1
    lea         rsi, [rsi + rdx]        ;ref_pixels_per_line
    lea         rdi, [rdi + rbx]        ;src_pixels_per_line

    sub         rcx, 1
    jnz         filter_block2d_bil_fp_only_loop

    ; falls through to the shared reduction

filter_block2d_bil_variance:
    ; Horizontal reduction done entirely in SSE2 (same sequence as the 16x_h
    ; functions in this file), so no MMX state is left behind for the caller.
    ; Every path above leaves xmm0 == 0; the unpacks rely on that to place
    ; each signed word of xmm6 in the upper half of a dword.
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16                ; arithmetic shift sign-extends
    psrad       xmm1, 16
    paddd       xmm0, xmm1              ; four 32-bit difference partials
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7              ; low dword = xxsum
    paddd       xmm0, xmm1              ; low dword = xsum

    mov         rsi, arg(7)             ; sum
    mov         rdi, arg(8)             ; sumsquared

    movd        [rsi], xmm0             ; xsum
    movd        [rdi], xmm6             ; xxsum

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
global sym(vp8_half_horiz_vert_variance8x_h_sse2)
sym(vp8_half_horiz_vert_variance8x_h_sse2):
    ; Averages each ref pixel with its right neighbour (pavgb), then averages
    ; that row with the next one, and accumulates the difference against
    ; src_ptr over an 8-pixel-wide, Height-row block.
    ;   xmm0 = zero, xmm5 = previous horizontally averaged row,
    ;   xmm6 = eight 16-bit difference sums, xmm7 = four 32-bit sse sums.
    ; Reads 9 bytes per ref row and Height+1 ref rows.
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM                            ; xmm6/xmm7 are clobbered below
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    pxor        xmm0, xmm0

    movq        xmm5, QWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s7
    movq        xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8
    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3) horizontal line 1

    add         rsi, rax                ; next ref row

vp8_half_horiz_vert_variance8x_h_1:

    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rsi+1]
    pavgb       xmm1, xmm2              ; xmm1 = avg(xmm1,xmm2) horizontal line i+1

    pavgb       xmm5, xmm1              ; xmm5 = vertical average of the above
    punpcklbw   xmm5, xmm0              ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences

    movdqa      xmm5, xmm1              ; save xmm1 for use on the next row

    add         rsi, rax                ; next ref row
    add         rdi, rdx                ; next src row

    sub         rcx, 1
    jnz         vp8_half_horiz_vert_variance8x_h_1

    ; Horizontal reduction in SSE2 (same sequence as the 16x_h functions in
    ; this file), so no MMX state is left behind.  xmm0 is still zero; the
    ; unpacks rely on that to place each signed word of xmm6 in the upper
    ; half of a dword.
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16                ; arithmetic shift sign-extends
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7              ; low dword = sumsquared
    paddd       xmm0, xmm1              ; low dword = sum

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], xmm0
    movd        [rdi], xmm6


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_half_horiz_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-pixel-wide version of the horizontal+vertical half-pixel variance:
; each ref pixel is averaged with its right neighbour and then with the row
; below, and the difference against src_ptr is accumulated.
;   xmm0 = zero, xmm5 = previous horizontally averaged row,
;   xmm6 = eight 16-bit difference sums, xmm7 = four 32-bit sse sums.
; Reads 17 bytes per ref row and Height+1 ref rows.
global sym(vp8_half_horiz_vert_variance16x_h_sse2)
sym(vp8_half_horiz_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    pxor        xmm0, xmm0

    movdqu      xmm5, XMMWORD PTR [rsi]
    movdqu      xmm3, XMMWORD PTR [rsi+1]
    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3) horizontal line 1

    lea         rsi, [rsi + rax]

vp8_half_horiz_vert_variance16x_h_1:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rsi+1]
    pavgb       xmm1, xmm2              ; xmm1 = avg(xmm1,xmm2) horizontal line i+1

    pavgb       xmm5, xmm1              ; xmm5 = vertical average of the above

    movdqa      xmm4, xmm5
    punpcklbw   xmm5, xmm0              ; xmm5 = words of pixels 0..7
    punpckhbw   xmm4, xmm0              ; xmm4 = words of pixels 8..15

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above
    psubw       xmm5, xmm3              ; xmm5 -= xmm3

    movq        xmm3, QWORD PTR [rdi+8] ; xmm3 = d8,d9,d10..d15
    punpcklbw   xmm3, xmm0
    psubw       xmm4, xmm3

    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    paddw       xmm6, xmm4
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    pmaddwd     xmm4, xmm4
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences
    paddd       xmm7, xmm4

    movdqa      xmm5, xmm1              ; save xmm1 for use on the next row

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         vp8_half_horiz_vert_variance16x_h_1

    ; xmm0 is still zero: the unpacks place each signed word of xmm6 in the
    ; upper half of a dword, and psrad sign-extends it.
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7              ; low dword = SSE
    paddd       xmm0, xmm1              ; low dword = Sum

    mov         rsi, arg(5)             ;[Sum]
    mov         rdi, arg(6)             ;[SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Averages each ref row with the row below it (pavgb) and accumulates the
; difference against src_ptr over an 8-pixel-wide, Height-row block.
;   xmm0 = zero, xmm6 = eight 16-bit difference sums,
;   xmm7 = four 32-bit sse sums.
; Reads Height+1 ref rows.
global sym(vp8_half_vert_variance8x_h_sse2)
sym(vp8_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM                            ; xmm6/xmm7 are clobbered below
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    pxor        xmm0, xmm0
vp8_half_vert_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]     ; xmm5 = s0,s1,s2..s7 of row i
    movq        xmm3, QWORD PTR [rsi+rax] ; xmm3 = s0,s1,s2..s7 of row i+1

    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
    punpcklbw   xmm5, xmm0              ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences

    add         rsi, rax                ; next ref row
    add         rdi, rdx                ; next src row

    sub         rcx, 1
    jnz         vp8_half_vert_variance8x_h_1

    ; Horizontal reduction in SSE2 (same sequence as the 16x_h functions in
    ; this file), so no MMX state is left behind.  xmm0 is still zero; the
    ; unpacks rely on that to place each signed word of xmm6 in the upper
    ; half of a dword.
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16                ; arithmetic shift sign-extends
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7              ; low dword = sumsquared
    paddd       xmm0, xmm1              ; low dword = sum

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], xmm0
    movd        [rdi], xmm6


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_half_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Averages each 16-byte ref row with the row below it (pavgb) and accumulates
; the difference against src_ptr over Height rows.
;   xmm0 = zero, xmm5 = previous ref row,
;   xmm6 = eight 16-bit difference sums, xmm7 = four 32-bit sse sums.
; Reads Height+1 ref rows.
global sym(vp8_half_vert_variance16x_h_sse2)
sym(vp8_half_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    movdqu      xmm5, XMMWORD PTR [rsi] ; row 0
    lea         rsi, [rsi + rax]
    pxor        xmm0, xmm0

vp8_half_vert_variance16x_h_1:
    movdqu      xmm3, XMMWORD PTR [rsi] ; row i+1

    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
    movdqa      xmm4, xmm5
    punpcklbw   xmm5, xmm0              ; words of pixels 0..7
    punpckhbw   xmm4, xmm0              ; words of pixels 8..15

    movq        xmm2, QWORD PTR [rdi]   ; xmm2 = d0,d1,d2..d7
    punpcklbw   xmm2, xmm0
    psubw       xmm5, xmm2
    movq        xmm2, QWORD PTR [rdi+8] ; xmm2 = d8,d9,d10..d15
    punpcklbw   xmm2, xmm0
    psubw       xmm4, xmm2

    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    paddw       xmm6, xmm4
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    pmaddwd     xmm4, xmm4
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences
    paddd       xmm7, xmm4

    movdqa      xmm5, xmm3              ; current row becomes the previous row

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         vp8_half_vert_variance16x_h_1

    ; xmm0 is still zero: the unpacks place each signed word of xmm6 in the
    ; upper half of a dword, and psrad sign-extends it.
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7              ; low dword = SSE
    paddd       xmm0, xmm1              ; low dword = Sum

    mov         rsi, arg(5)             ;[Sum]
    mov         rdi, arg(6)             ;[SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Averages each ref pixel with its right neighbour (pavgb) and accumulates
; the difference against src_ptr over an 8-pixel-wide, Height-row block.
;   xmm0 = zero, xmm6 = eight 16-bit difference sums,
;   xmm7 = four 32-bit sse sums.
; Reads 9 bytes per ref row.
global sym(vp8_half_horiz_variance8x_h_sse2)
sym(vp8_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM                            ; xmm6/xmm7 are clobbered below
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    pxor        xmm0, xmm0
vp8_half_horiz_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s7
    movq        xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8

    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
    punpcklbw   xmm5, xmm0              ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences

    add         rsi, rax                ; next ref row
    add         rdi, rdx                ; next src row

    sub         rcx, 1
    jnz         vp8_half_horiz_variance8x_h_1

    ; Horizontal reduction in SSE2 (same sequence as the 16x_h functions in
    ; this file), so no MMX state is left behind.  xmm0 is still zero; the
    ; unpacks rely on that to place each signed word of xmm6 in the upper
    ; half of a dword.
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16                ; arithmetic shift sign-extends
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7              ; low dword = sumsquared
    paddd       xmm0, xmm1              ; low dword = sum

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], xmm0
    movd        [rdi], xmm6


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_half_horiz_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-pixel-wide version: averages each ref pixel with its right neighbour
; and accumulates the difference against src_ptr over Height rows.
;   xmm0 = zero, xmm6 = eight 16-bit difference sums,
;   xmm7 = four 32-bit sse sums.
; Reads 17 bytes per ref row.
global sym(vp8_half_horiz_variance16x_h_sse2)
sym(vp8_half_horiz_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    pxor        xmm0, xmm0

vp8_half_horiz_variance16x_h_1:
    movdqu      xmm5, XMMWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s15
    movdqu      xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16

    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
    movdqa      xmm1, xmm5
    punpcklbw   xmm5, xmm0              ; xmm5 = words of pixels 0..7
    punpckhbw   xmm1, xmm0              ; xmm1 = words of pixels 8..15

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above
    movq        xmm2, QWORD PTR [rdi+8] ; xmm2 = d8,d9,d10..d15
    punpcklbw   xmm2, xmm0

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    psubw       xmm1, xmm2
    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    paddw       xmm6, xmm1
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         vp8_half_horiz_variance16x_h_1

    ; xmm0 is still zero: the unpacks place each signed word of xmm6 in the
    ; upper half of a dword, and psrad sign-extends it.
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7              ; low dword = SSE
    paddd       xmm0, xmm1              ; low dword = Sum

    mov         rsi, arg(5)             ;[Sum]
    mov         rdi, arg(6)             ;[SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
; Rounding term added before the xmm_filter_shift right shift.
align 16
xmm_bi_rd:
    times 8 dw 64
; One 32-byte row per offset 0..7: eight copies of the first tap followed by
; eight copies of the second tap.  Rows are indexed by (offset << 5).
align 16
vp8_bilinear_filters_sse2:
    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112