; variance_impl_mmx.asm, revision 1b362b15af34006e6a11974088a46d42b903418e
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of the squares of the 256 consecutive 16-bit values that
; start at src_ptr (16 loop iterations, 16 values per iteration).
global sym(vp8_get_mb_ss_mmx) PRIVATE
sym(vp8_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8              ; 8 bytes of scratch for the final reduction
    ; end prolog

    mov         rax, arg(0)         ;src_ptr
    mov         rcx, 16             ; iteration count
    pxor        mm4, mm4            ; mm4: two 32-bit partial sums of squares

.NEXTROW:
    movq        mm0, [rax]
    movq        mm1, [rax+8]
    movq        mm2, [rax+16]
    movq        mm3, [rax+24]

    ; pmaddwd x, x squares each word and adds adjacent pairs into dwords
    pmaddwd     mm0, mm0
    pmaddwd     mm1, mm1
    pmaddwd     mm2, mm2
    pmaddwd     mm3, mm3

    paddd       mm4, mm0
    paddd       mm4, mm1
    paddd       mm4, mm2
    paddd       mm4, mm3

    add         rax, 32
    dec         rcx
    ; DEC does not write CF, so branch on ZF only.  The previous JA also
    ; tested whatever carry the ADD above happened to leave behind.
    jnz         .NEXTROW

    movq        QWORD PTR [rsp], mm4

    ;return sum[0]+sum[1];
    movsxd      rax, dword ptr [rsp]
    movsxd      rcx, dword ptr [rsp+4]
    add         rax, rcx

    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Processes 8 rows of 8 bytes from src_ptr and ref_ptr.
;   *Sum = sum over the block of (src - ref)
;   *SSE = sum over the block of (src - ref)^2
; Always returns 0.
global sym(vp8_get8x8var_mmx) PRIVATE
sym(vp8_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16             ; 16 bytes of scratch for the final reduction
    ; end prolog

    pxor        mm5, mm5            ; mm5: four 16-bit running sums of differences
    pxor        mm6, mm6            ; mm6: zero, used to widen bytes to words
    pxor        mm7, mm7            ; mm7: two 32-bit running sums of squares

    mov         rax, arg(0)         ;[src_ptr]  ; Load base addresses
    mov         rbx, arg(2)         ;[ref_ptr]
    movsxd      rcx, dword ptr arg(1) ;[source_stride]
    movsxd      rdx, dword ptr arg(3) ;[recon_stride]

    ; One repetition per row.
%rep 8
    movq        mm0, [rax]          ; eight source bytes
    movq        mm1, [rbx]          ; eight reference bytes
    movq        mm2, mm0            ; copies for the high halves
    movq        mm3, mm1

    punpcklbw   mm0, mm6            ; widen low four bytes to words
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6            ; widen high four bytes to words
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1            ; src - ref (low four pixels)
    psubsw      mm2, mm3            ; src - ref (high four pixels)

    paddw       mm5, mm0            ; accumulate differences
    paddw       mm5, mm2

    pmaddwd     mm0, mm0            ; square and add adjacent pairs
    pmaddwd     mm2, mm2
    paddd       mm7, mm0            ; accumulate squares
    paddd       mm7, mm2

    add         rax, rcx            ; next source row
    add         rbx, rdx            ; next reference row
%endrep

    ; Now accumulate the final results.
    movq        QWORD PTR [rsp+8], mm5  ; spill the four word sums
    movq        QWORD PTR [rsp], mm7    ; spill the two dword sums of squares
    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx            ;XSum
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx            ;XXSum
    mov         rsi, arg(4)         ;SSE
    mov         rdi, arg(5)         ;Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax            ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret



;unsigned int
;vp8_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Processes 4 rows of 4 bytes from src_ptr and ref_ptr.
;   *Sum = sum over the block of (src - ref)
;   *SSE = sum over the block of (src - ref)^2
; Always returns 0.
global sym(vp8_get4x4var_mmx) PRIVATE
sym(vp8_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16             ; 16 bytes of scratch for the final reduction
    ; end prolog

    pxor        mm5, mm5            ; mm5: four 16-bit running sums of differences
    pxor        mm6, mm6            ; mm6: zero, used to widen bytes to words
    pxor        mm7, mm7            ; mm7: two 32-bit running sums of squares

    mov         rax, arg(0)         ;[src_ptr]  ; Load base addresses
    mov         rbx, arg(2)         ;[ref_ptr]
    movsxd      rcx, dword ptr arg(1) ;[source_stride]
    movsxd      rdx, dword ptr arg(3) ;[recon_stride]

    ; One repetition per row.  Only the low four bytes of each row feed
    ; punpcklbw, so load exactly four bytes (movd) instead of eight (movq)
    ; to avoid reading past the end of the row.
%rep 4
    movd        mm0, [rax]          ; four source bytes
    movd        mm1, [rbx]          ; four reference bytes
    punpcklbw   mm0, mm6            ; widen to words
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1            ; src - ref
    paddw       mm5, mm0            ; accumulate differences
    pmaddwd     mm0, mm0            ; square and add adjacent pairs
    paddd       mm7, mm0            ; accumulate squares
    add         rax, rcx            ; next source row
    add         rbx, rdx            ; next reference row
%endrep

    ; Now accumulate the final results.
    movq        QWORD PTR [rsp+8], mm5  ; spill the four word sums
    movq        QWORD PTR [rsp], mm7    ; spill the two dword sums of squares
    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx            ;XSum
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx            ;XXSum
    mov         rsi, arg(4)         ;SSE
    mov         rdi, arg(5)         ;Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax            ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret



;unsigned int
;vp8_get4x4sse_cs_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride
;)
;
; Returns the sum of (src - ref)^2 over 4 rows of 4 bytes.
global sym(vp8_get4x4sse_cs_mmx) PRIVATE
sym(vp8_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    pxor        mm6, mm6            ; mm6: zero, used to widen bytes to words
    pxor        mm7, mm7            ; mm7: two 32-bit running sums of squares

    mov         rax, arg(0)         ;[src_ptr]  ; Load base addresses
    mov         rbx, arg(2)         ;[ref_ptr]
    movsxd      rcx, dword ptr arg(1) ;[source_stride]
    movsxd      rdx, dword ptr arg(3) ;[recon_stride]

    ; One repetition per row.
%rep 4
    movd        mm0, [rax]          ; four source bytes
    movd        mm1, [rbx]          ; four reference bytes
    punpcklbw   mm0, mm6            ; widen to words
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1            ; src - ref
    pmaddwd     mm0, mm0            ; square and add adjacent pairs
    paddd       mm7, mm0            ; accumulate squares
    add         rax, rcx            ; next source row
    add         rbx, rdx            ; next reference row
%endrep

    ; Fold the two dword lanes; the result is in the low dword of mm0.
    movq        mm0, mm7
    psrlq       mm7, 32

    paddd       mm0, mm7
    movq        rax, mm0

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

%define mmx_filter_shift            7

;void vp8_filter_block2d_bil4x4_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Applies a two-tap horizontal then two-tap vertical filter to a 4-wide,
; 4-row block of ref_ptr, subtracts the matching src_ptr pixels, and writes
;   *sum        = sum of the differences
;   *sumsquared = sum of the squared differences
; Each filter is read as two groups of four words: [f] and [f+8].
; Five reference rows are read, at byte offsets 0 and 1 of each row.
global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil4x4_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    pxor        mm6, mm6            ; mm6: four 16-bit running sums of differences
    pxor        mm7, mm7            ; mm7: two 32-bit running sums of squares

    mov         rax, arg(4)         ;HFilter
    mov         rdx, arg(5)         ;VFilter

    mov         rsi, arg(0)         ;ref_ptr
    mov         rdi, arg(2)         ;src_ptr

    mov         rcx, 4              ; output row count
    pxor        mm0, mm0            ; mm0: zero, used to widen bytes to words

    ; Horizontal pass on the first reference row; result kept in mm5.
    movd        mm1, [rsi]
    movd        mm3, [rsi+1]

    punpcklbw   mm1, mm0
    pmullw      mm1, [rax]

    punpcklbw   mm3, mm0
    pmullw      mm3, [rax+8]

    paddw       mm1, mm3
    paddw       mm1, [GLOBAL(mmx_bi_rd)]    ; rounding term

    psraw       mm1, mmx_filter_shift
    movq        mm5, mm1

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
    movsxd      r8, dword ptr arg(1) ;ref_pixels_per_line
    add         rsi, r8
%endif

.filter_block2d_bil4x4_var_mmx_loop:

    ; Horizontal pass on the next reference row.
    movd        mm1, [rsi]
    movd        mm3, [rsi+1]

    punpcklbw   mm1, mm0
    pmullw      mm1, [rax]

    punpcklbw   mm3, mm0
    pmullw      mm3, [rax+8]

    paddw       mm1, mm3
    paddw       mm1, [GLOBAL(mmx_bi_rd)]

    psraw       mm1, mmx_filter_shift
    movq        mm3, mm5            ; previous horizontally filtered row

    movq        mm5, mm1            ; keep this row for the next iteration

    ; Vertical pass: previous row * [VFilter] + current row * [VFilter+8].
    pmullw      mm3, [rdx]

    pmullw      mm1, [rdx+8]
    paddw       mm1, mm3

    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    psraw       mm1, mmx_filter_shift

    ; Difference against the source row, then accumulate.
    movd        mm3, [rdi]
    punpcklbw   mm3, mm0

    psubw       mm1, mm3
    paddw       mm6, mm1

    pmaddwd     mm1, mm1
    paddd       mm7, mm1

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1) ;ref_pixels_per_line
    add         rdi, dword ptr arg(3) ;src_pixels_per_line
%else
    movsxd      r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3) ;src_pixels_per_line
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .filter_block2d_bil4x4_var_mmx_loop

    ; Reduce the four word sums in mm6.  Each word is placed in the upper
    ; half of a dword, the dwords are added, and an arithmetic shift by 16
    ; yields the sign-extended total.
    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16

    ; Fold the two dword lanes of the squared-difference accumulator.
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7

    mov         rdi, arg(6)         ;sum
    mov         rsi, arg(7)         ;sumsquared

    movd        dword ptr [rdi], mm2
    movd        dword ptr [rsi], mm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret




;void vp8_filter_block2d_bil_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Same computation as vp8_filter_block2d_bil4x4_var_mmx, but for a block that
; is 8 pixels wide and Height rows tall.  Height + 1 reference rows are read,
; 8 bytes at byte offsets 0 and 1 of each row.
; Height must be at least 1: the loop body runs before the counter is tested.
global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    pxor        mm6, mm6            ; mm6: four 16-bit running sums of differences
    pxor        mm7, mm7            ; mm7: two 32-bit running sums of squares
    mov         rax, arg(5)         ;HFilter

    mov         rdx, arg(6)         ;VFilter
    mov         rsi, arg(0)         ;ref_ptr

    mov         rdi, arg(2)         ;src_ptr
    movsxd      rcx, dword ptr arg(4) ;Height

    pxor        mm0, mm0            ; mm0: zero, used to widen bytes to words

    ; Horizontal pass on the first reference row:
    ; mm1 = low four pixels, mm2 = high four pixels.
    movq        mm1, [rsi]

    movq        mm3, [rsi+1]
    movq        mm2, mm1

    movq        mm4, mm3
    punpcklbw   mm1, mm0

    punpckhbw   mm2, mm0
    pmullw      mm1, [rax]

    pmullw      mm2, [rax]
    punpcklbw   mm3, mm0

    punpckhbw   mm4, mm0
    pmullw      mm3, [rax+8]

    pmullw      mm4, [rax+8]
    paddw       mm1, mm3

    paddw       mm2, mm4
    paddw       mm1, [GLOBAL(mmx_bi_rd)]    ; rounding term

    psraw       mm1, mmx_filter_shift
    paddw       mm2, [GLOBAL(mmx_bi_rd)]

    psraw       mm2, mmx_filter_shift
    movq        mm5, mm1

    packuswb    mm5, mm2            ; keep the filtered row packed as 8 bytes
%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
    movsxd      r8, dword ptr arg(1) ;ref_pixels_per_line
    add         rsi, r8
%endif

.filter_block2d_bil_var_mmx_loop:

    ; Horizontal pass on the next reference row.
    movq        mm1, [rsi]
    movq        mm3, [rsi+1]

    movq        mm2, mm1
    movq        mm4, mm3

    punpcklbw   mm1, mm0
    punpckhbw   mm2, mm0

    pmullw      mm1, [rax]
    pmullw      mm2, [rax]

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    pmullw      mm3, [rax+8]
    pmullw      mm4, [rax+8]

    paddw       mm1, mm3
    paddw       mm2, mm4

    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    psraw       mm1, mmx_filter_shift

    paddw       mm2, [GLOBAL(mmx_bi_rd)]
    psraw       mm2, mmx_filter_shift

    ; Unpack the previous filtered row into mm3 (low) and mm4 (high).
    movq        mm3, mm5
    movq        mm4, mm5

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    ; Keep the current filtered row, packed, for the next iteration.
    movq        mm5, mm1
    packuswb    mm5, mm2

    ; Vertical pass: previous row * [VFilter] + current row * [VFilter+8].
    pmullw      mm3, [rdx]
    pmullw      mm4, [rdx]

    pmullw      mm1, [rdx+8]
    pmullw      mm2, [rdx+8]

    paddw       mm1, mm3
    paddw       mm2, mm4

    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    paddw       mm2, [GLOBAL(mmx_bi_rd)]

    psraw       mm1, mmx_filter_shift
    psraw       mm2, mmx_filter_shift

    ; Difference against the source row, then accumulate.
    movq        mm3, [rdi]
    movq        mm4, mm3

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    psubw       mm1, mm3
    psubw       mm2, mm4

    paddw       mm6, mm1
    pmaddwd     mm1, mm1

    paddw       mm6, mm2
    pmaddwd     mm2, mm2

    paddd       mm7, mm1
    paddd       mm7, mm2

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1) ;ref_pixels_per_line
    add         rdi, dword ptr arg(3) ;src_pixels_per_line
%else
    movsxd      r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3) ;src_pixels_per_line
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .filter_block2d_bil_var_mmx_loop

    ; Reduce the four word sums in mm6.  Each word is placed in the upper
    ; half of a dword, the dwords are added, and an arithmetic shift by 16
    ; yields the sign-extended total.
    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16

    ; Fold the two dword lanes of the squared-difference accumulator.
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7

    mov         rdi, arg(7)         ;sum
    mov         rsi, arg(8)         ;sumsquared

    movd        dword ptr [rdi], mm2
    movd        dword ptr [rsi], mm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
; Rounding term added before each shift by mmx_filter_shift:
; 64 == 1 << (mmx_filter_shift - 1).
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
align 16
mmx_bi_rd:
    times 4 dw 64