;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Rounding constant for the >>4 at the end of each bilinear filter step.
pw_8: times  8 dw  8
; Bilinear filter taps, indexed by (offset << filter_idx_shift).
; SSE2 variant: two separate 8x16-bit vectors per offset (tap a, tap b),
; consumed by pmullw; taps always sum to 16.
bilin_filter_m_sse2: times  8 dw 16
                     times  8 dw  0
                     times  8 dw 14
                     times  8 dw  2
                     times  8 dw 12
                     times  8 dw  4
                     times  8 dw 10
                     times  8 dw  6
                     times 16 dw  8
                     times  8 dw  6
                     times  8 dw 10
                     times  8 dw  4
                     times  8 dw 12
                     times  8 dw  2
                     times  8 dw 14

; SSSE3 variant: interleaved byte pairs (a, b) per offset, consumed by
; pmaddubsw on interleaved pixel pairs; one 16-byte row per offset.
bilin_filter_m_ssse3: times  8 db 16,  0
                      times  8 db 14,  2
                      times  8 db 12,  4
                      times  8 db 10,  6
                      times 16 db  8
                      times  8 db  6, 10
                      times  8 db  4, 12
                      times  8 db  2, 14

SECTION .text

; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the SE and stores SSE in the given pointer.

; Accumulate one row-pair of differences:
;   %5 (sum, words)  += (%1 - %2) + (%3 - %4)
;   %6 (sse, dwords) += (%1 - %2)^2 + (%3 - %4)^2
; Inputs %1..%4 hold 8x16-bit pixel values; %1 and %3 are clobbered.
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw                %3, %4
  psubw                %1, %2
  paddw                %5, %3
  pmaddwd              %3, %3
  paddw                %5, %1
  pmaddwd              %1, %1
  paddd                %6, %3
  paddd                %6, %1
%endmacro

; Horizontally reduce the running word sum (m6) and dword SSE (m7),
; store SSE through the sse pointer, and return the sum in raxd.
; %1 = block width (selects the 4-wide vs >4-wide reduction path).
%macro STORE_AND_RET 1
%if %1 > 4
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputing to a dword.
  pcmpgtw              m5, m6           ; mask for 0 > x
  movhlps              m3, m7
  punpcklwd            m4, m6, m5
  punpckhwd            m6, m5           ; sign-extend m6 word->dword
  paddd                m7, m3
  paddd                m6, m4
  pshufd               m3, m7, 0x1
  movhlps              m4, m6
  paddd                m7, m3
  paddd                m6, m4
  mov                  r1, ssem         ; r1 = unsigned int *sse
  pshufd               m4, m6, 0x1
  movd               [r1], m7           ; store sse
  paddd                m6, m4
  movd               raxd, m6           ; store sum as return value
%else ; 4xh
  ; Only the low 4 words/dwords are populated for 4-wide blocks.
  pshuflw              m4, m6, 0xe
  pshuflw              m3, m7, 0xe
  paddw                m6, m4
  paddd                m7, m3
  pcmpgtw              m5, m6           ; mask for 0 > x
  mov                  r1, ssem         ; r1 = unsigned int *sse
  punpcklwd            m6, m5           ; sign-extend m6 word->dword
  movd               [r1], m7           ; store sse
  pshuflw              m4, m6, 0xe
  paddd                m6, m4
  movd               raxd, m6           ; store sum as return value
%endif
  RET
%endmacro

; Advance srcq by one source stride. On x86-32 PIC builds the stride
; register was repurposed, so the stride is reloaded from the stack slot
; (src_stridemp) instead.
%macro INC_SRC_BY_SRC_STRIDE  0
%if ARCH_X86=1 && CONFIG_PIC=1
  add                srcq, src_stridemp
%else
  add                srcq, src_strideq
%endif
%endmacro

; Emit one sub-pixel variance function for block width %1 (4, 8 or 16).
; %2 == 1 additionally averages the filtered prediction with a second
; predictor (secq/sec_stride) before the variance computation, producing
; the vpx_sub_pixel_avg_variance%1xh flavor.
; The code dispatches on (x_offset, y_offset) into nine specializations:
; each offset is either 0 (copy), 4 (i.e. 0.5 -> pavgb) or an arbitrary
; eighth-pel position (full bilinear filter).
%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%else
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
%endif
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64

%ifdef PIC    ; 64bit PIC
  %if %2 == 1 ; avg
    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                        x_offset, y_offset, \
                                        dst, dst_stride, \
                                        sec, sec_stride, height, sse
    %define sec_str sec_strideq
  %else
    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
                                    y_offset, dst, dst_stride, height, sse
  %endif
  %define block_height heightd
  %define bilin_filter sseq
%else
  %if ARCH_X86=1 && CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                          x_offset, y_offset, \
                                          dst, dst_stride, \
                                          sec, sec_stride, \
                                          height, sse, g_bilin_filter, g_pw_8
      %define block_height dword heightm
      %define sec_str sec_stridemp

      ;Store bilin_filter and pw_8 location in stack
      %if GET_GOT_DEFINED == 1
        GET_GOT eax
        add esp, 4                ; restore esp
      %endif

      lea ecx, [GLOBAL(bilin_filter_m)]
      mov g_bilin_filterm, ecx

      lea ecx, [GLOBAL(pw_8)]
      mov g_pw_8m, ecx

      LOAD_IF_USED 0, 1         ; load eax, ecx back
    %else
      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                      y_offset, dst, dst_stride, height, sse, \
                                      g_bilin_filter, g_pw_8
      %define block_height heightd

      ;Store bilin_filter and pw_8 location in stack
      %if GET_GOT_DEFINED == 1
        GET_GOT eax
        add esp, 4                ; restore esp
      %endif

      lea ecx, [GLOBAL(bilin_filter_m)]
      mov g_bilin_filterm, ecx

      lea ecx, [GLOBAL(pw_8)]
      mov g_pw_8m, ecx

      LOAD_IF_USED 0, 1         ; load eax, ecx back
    %endif
  %else
    %if %2 == 1 ; avg
      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
                                          7 + 2 * ARCH_X86_64, 13, src, src_stride, \
                                          x_offset, y_offset, \
                                          dst, dst_stride, \
                                          sec, sec_stride, \
                                          height, sse
      %if ARCH_X86_64
        %define block_height heightd
        %define sec_str sec_strideq
      %else
        %define block_height dword heightm
        %define sec_str sec_stridemp
      %endif
    %else
      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                      y_offset, dst, dst_stride, height, sse
      %define block_height heightd
    %endif

    %define bilin_filter bilin_filter_m
  %endif
%endif

; 4-wide rows fit in a dword load; wider rows use a qword (movh) load.
%if %1 == 4
  %define movx movd
%else
  %define movx movh
%endif

  ASSERT               %1 <= 16         ; m6 overflows if w > 16
  pxor                 m6, m6           ; sum
  pxor                 m7, m7           ; sse
  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
  ; could perhaps use it for something more productive then
  pxor                 m5, m5           ; dedicated zero register
  ; Narrow blocks process two rows per iteration, so halve the loop count
  ; (and double the sec stride to match).
%if %1 < 16
  sar                  block_height, 1
%if %2 == 1 ; avg
  shl                  sec_str, 1
%endif
%endif

  ; FIXME(rbultje) replace by jumptable?
  test          x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test          y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  mova                 m1, [dstq]
%if %2 == 1 ; avg
  pavgb                m0, [secq]
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%endif
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5

%if %2 == 0 ; !avg
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m0, [srcq+src_strideq]
%else ; 4xh
  movx                 m1, [srcq+src_strideq]
  punpckldq            m0, m1
%endif
%else ; !avg
  movx                 m2, [srcq+src_strideq]
%endif

  movx                 m1, [dstq]
  movx                 m3, [dstq+dst_strideq]

%if %2 == 1 ; avg
%if %1 > 4
  pavgb                m0, [secq]
%else
  movh                 m2, [secq]
  pavgb                m0, m2
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%if %1 > 4
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET %1

.x_zero_y_nonzero:
  cmp           y_offsetd, 4
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+src_strideq]
  mova                 m1, [dstq]
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
%if %2 == 1 ; avg
  pavgb                m0, [secq]
%endif
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m2, [srcq+src_strideq*2]
%else ; 4xh
  movx                 m1, [srcq+src_strideq*2]
  punpckldq            m2, m1
%endif
  movx                 m1, [dstq]
%if %1 > 4
  movlhps              m0, m2
%else ; 4xh
  punpckldq            m0, m2
%endif
  movx                 m3, [dstq+dst_strideq]
  pavgb                m0, m2
  punpcklbw            m1, m5
%if %1 > 4
  pavgb                m0, [secq]
  punpcklbw            m3, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  movh                 m4, [secq]
  pavgb                m0, m4
  punpcklbw            m3, m5
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  movx                 m4, [srcq+src_strideq*2]
  movx                 m1, [dstq]
  pavgb                m0, m2
  movx                 m3, [dstq+dst_strideq]
  pavgb                m2, m4
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET %1

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+y_offsetq+16]
%endif
  mova                m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add           y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov               tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+src_strideq]
  mova                 m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  punpcklbw            m0, m5
  punpcklbw            m4, m5
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
  pmullw               m2, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m2, m3
  paddw                m0, m4
%endif
  psraw                m2, 4
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m2, [srcq+src_strideq]
  movx                 m4, [srcq+src_strideq*2]
  movx                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movx                 m1, [dstq]
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_y_a
  pmullw               m1, m2, filter_y_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, m1
  paddw                m2, filter_rnd
  movx                 m1, [dstq]
  paddw                m2, m4
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET %1

.x_nonzero:
  cmp           x_offsetd, 4
  jne .x_nonhalf
  ; x_offset == 0.5
  test          y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
%if %2 == 1 ; avg
  pavgb                m0, [secq]
%endif
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m4, [srcq+1]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m0, [srcq+src_strideq]
  movhps               m4, [srcq+src_strideq+1]
%else ; 4xh
  movx                 m1, [srcq+src_strideq]
  punpckldq            m0, m1
  movx                 m2, [srcq+src_strideq+1]
  punpckldq            m4, m2
%endif
  movx                 m1, [dstq]
  movx                 m3, [dstq+dst_strideq]
  pavgb                m0, m4
  punpcklbw            m3, m5
%if %1 > 4
  pavgb                m0, [secq]
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m1, m5
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  movx                 m2, [srcq+src_strideq]
  movx                 m1, [dstq]
  pavgb                m0, m4
  movx                 m4, [srcq+src_strideq+1]
  movx                 m3, [dstq+dst_strideq]
  pavgb                m2, m4
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET %1

.x_half_y_nonzero:
  cmp           y_offsetd, 4
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
  ; The first row is filtered before the loop; each iteration filters the
  ; next row and averages it with the previous one carried in m0.
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
.x_half_y_half_loop:
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m4, m3
  punpckhbw            m3, m1, m5
  pavgb                m0, m4
%if %2 == 1 ; avg
  punpcklbw            m1, m5
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
.x_half_y_half_loop:
  movx                 m2, [srcq]
  movx                 m3, [srcq+1]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m2, [srcq+src_strideq]
  movhps               m3, [srcq+src_strideq+1]
%else
  movx                 m1, [srcq+src_strideq]
  punpckldq            m2, m1
  movx                 m1, [srcq+src_strideq+1]
  punpckldq            m3, m1
%endif
  pavgb                m2, m3
%if %1 > 4
  movlhps              m0, m2
  movhlps              m4, m2
%else ; 4xh
  punpckldq            m0, m2
  pshuflw              m4, m2, 0xe
%endif
  movx                 m1, [dstq]
  pavgb                m0, m2
  movx                 m3, [dstq+dst_strideq]
%if %1 > 4
  pavgb                m0, [secq]
%else
  movh                 m2, [secq]
  pavgb                m0, m2
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%if %1 > 4
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  movx                 m4, [srcq+src_strideq]
  movx                 m1, [srcq+src_strideq+1]
  pavgb                m2, m3
  pavgb                m4, m1
  pavgb                m0, m2
  pavgb                m2, m4
  movx                 m1, [dstq]
  movx                 m3, [dstq+dst_strideq]
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   block_height
  jg .x_half_y_half_loop
  STORE_AND_RET %1

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+y_offsetq+16]
%endif
  mova                m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ;x86_32
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add           y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov               tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
.x_half_y_other_loop:
  movu                 m4, [srcq]
  movu                 m2, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m4, m2
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
  psraw                m2, 4
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  pmullw               m2, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, filter_rnd
  punpcklbw            m0, m5
  paddw                m2, m3
  punpcklbw            m3, m4, m5
  pmullw               m0, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, filter_rnd
  psraw                m2, 4
  paddw                m0, m3
%endif
  punpckhbw            m3, m1, m5
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
%if notcpuflag(ssse3)
  punpcklbw            m0, m5
%endif
.x_half_y_other_loop:
  movx                 m2, [srcq]
  movx                 m1, [srcq+1]
  movx                 m4, [srcq+src_strideq]
  movx                 m3, [srcq+src_strideq+1]
  pavgb                m2, m1
  pavgb                m4, m3
  movx                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movx                 m1, [dstq]
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
%else
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_y_a
  pmullw               m1, m2, filter_y_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  paddw                m0, m1
  pmullw               m1, m4, filter_y_b
  paddw                m2, filter_rnd
  paddw                m2, m1
  movx                 m1, [dstq]
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET %1

.x_nonhalf:
  test          y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           x_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
;y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add           x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov               tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+1]
  mova                 m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  punpcklbw            m0, m5
  punpcklbw            m4, m5
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m0, filter_rnd
  paddw                m2, m3
  paddw                m0, m4
%endif
  psraw                m2, 4
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m1, [srcq+1]
  movx                 m2, [srcq+src_strideq]
  movx                 m4, [srcq+src_strideq+1]
  movx                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  movx                 m1, [dstq]
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_x_a
  pmaddubsw            m2, filter_x_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m0, m1
  paddw                m2, filter_rnd
  movx                 m1, [dstq]
  paddw                m2, m4
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET %1

.x_nonhalf_y_nonzero:
  cmp           y_offsetd, 4
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           x_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add           x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov               tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

  ; Horizontally filter the first row before the loop; each iteration
  ; filters the next row and vertically averages with the carried row.
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4
  add                srcq, src_strideq
  packuswb             m0, m2
.x_other_y_half_loop:
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
%if cpuflag(ssse3)
  mova                 m1, [dstq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%else
  punpckhbw            m2, m4, m5
  punpckhbw            m1, m3, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  paddw                m4, m3
  paddw                m2, m1
  mova                 m1, [dstq]
  psraw                m4, 4
  psraw                m2, 4
  punpckhbw            m3, m1, m5
  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
  ; have a 1-register shortage to be able to store the backup of the bilin
  ; filtered second line as words as cache for the next line. Packing into
  ; a byte costs 1 pack and 2 unpacks, but saves a register.
  packuswb             m4, m2
  punpcklbw            m1, m5
  pavgb                m0, m4
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  pavgb                m0, [secq]
%endif
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  add                srcq, src_strideq
  psraw                m0, 4
.x_other_y_half_loop:
  movx                 m2, [srcq]
  movx                 m1, [srcq+1]
  movx                 m4, [srcq+src_strideq]
  movx                 m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movx                 m1, [dstq]
  movx                 m3, [dstq+dst_strideq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  movx                 m1, [dstq]
  paddw                m4, m3
  movx                 m3, [dstq+dst_strideq]
%endif
  psraw                m2, 4
  psraw                m4, 4
  pavgw                m0, m2
  pavgw                m2, m4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline - also consider going to bytes here
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET %1

.x_nonhalf_y_nonhalf:
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           x_offsetd, filter_idx_shift
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                m11, [bilin_filter+y_offsetq+16]
%endif
  mova                m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; In this case, there is NO unused register. Used src_stride register. Later,
; src_stride has to be loaded from stack when it is needed.
%define tempq src_strideq
  mov               tempq, g_bilin_filterm
  add           x_offsetq, tempq
  add           y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov               tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
  add           y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4

  INC_SRC_BY_SRC_STRIDE

  packuswb             m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
  mova                 m1, [dstq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  punpckhbw            m3, m1, m5
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  punpcklbw            m1, m5
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
  psraw                m2, 4
  psraw                m0, 4
%else
  movu                 m3, [srcq]
  movu                 m4, [srcq+1]
  punpckhbw            m1, m3, m5
  punpckhbw            m2, m4, m5
  punpcklbw            m3, m5
  punpcklbw            m4, m5
  pmullw               m3, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m3, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m1, filter_rnd
  paddw                m3, m4
  paddw                m1, m2
  psraw                m3, 4
  psraw                m1, 4
  packuswb             m4, m3, m1
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  pmullw               m2, filter_y_a
  pmullw               m1, filter_y_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, m1
  mova                 m1, [dstq]
  paddw                m0, filter_rnd
  psraw                m2, 4
  paddw                m0, m3
  punpckhbw            m3, m1, m5
  psraw                m0, 4
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  psraw                m0, 4
%if cpuflag(ssse3)
  packuswb             m0, m0
%endif

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movx                 m2, [srcq]
  movx                 m1, [srcq+1]

  INC_SRC_BY_SRC_STRIDE
  movx                 m4, [srcq]
  movx                 m3, [srcq+1]

%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movx                 m3, [dstq+dst_strideq]
  movx                 m1, [dstq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m2, m2
  packuswb             m4, m4
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m1, m5
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  paddw                m4, m3
  psraw                m2, 4
  psraw                m4, 4
  pmullw               m0, filter_y_a
  pmullw               m3, m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  pmullw               m1, m4, filter_y_b
  paddw                m2, filter_rnd
  paddw                m0, m3
  movx                 m3, [dstq+dst_strideq]
  paddw                m2, m1
  movx                 m1, [dstq]
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                   block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
%undef movx
  STORE_AND_RET %1
%endmacro

; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
; between the ssse3 and non-ssse3 version. It may make sense to merge their
; code in the sense that the ssse3 version would jump to the appropriate
; location in the sse/2 version, rather than duplicating that code in the
; binary.
1467 1468INIT_XMM sse2 1469SUBPEL_VARIANCE 4 1470SUBPEL_VARIANCE 8 1471SUBPEL_VARIANCE 16 1472 1473INIT_XMM ssse3 1474SUBPEL_VARIANCE 4 1475SUBPEL_VARIANCE 8 1476SUBPEL_VARIANCE 16 1477 1478INIT_XMM sse2 1479SUBPEL_VARIANCE 4, 1 1480SUBPEL_VARIANCE 8, 1 1481SUBPEL_VARIANCE 16, 1 1482 1483INIT_XMM ssse3 1484SUBPEL_VARIANCE 4, 1 1485SUBPEL_VARIANCE 8, 1 1486SUBPEL_VARIANCE 16, 1 1487