;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Rounding constant for the >>4 shifts below: the bilinear taps sum to 16,
; so adding 8 before shifting right by 4 rounds to nearest.
pw_8: times 8 dw 8
; Bilinear filter table: 8 entries (offsets 0..7 in 1/8-pel steps), each entry
; is a pair of 16-byte rows of broadcast words (tap_a = 16-x, tap_b = x), i.e.
; 32 bytes per entry — hence filter_idx_shift = 5 below.
bilin_filter_m_sse2: times  8 dw 16
                     times  8 dw  0
                     times  8 dw 14
                     times  8 dw  2
                     times  8 dw 12
                     times  8 dw  4
                     times  8 dw 10
                     times  8 dw  6
                     times 16 dw  8
                     times  8 dw  6
                     times  8 dw 10
                     times  8 dw  4
                     times  8 dw 12
                     times  8 dw  2
                     times  8 dw 14

SECTION .text

; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the SE and stores SSE in the given pointer.
; NOTE(review): these are the high bit-depth variants (functions emitted below
; are highbd_sub_pixel_*); all pixel loads/averages here are on 16-bit words
; (pavgw, +16 byte offsets for 8 pixels), so the src/dst pointers are
; presumably uint16_t* despite the copied 8-bit prototype above — confirm
; against the C-side declarations.

; Accumulate sum and SSE of two (src - dst) word-difference vectors.
; %1/%3 = src regs (clobbered), %2/%4 = dst regs (clobbered),
; %5 = dword sum accumulator, %6 = dword SSE accumulator.
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw                %3, %4
  psubw                %1, %2
  mova                 %4, %3       ; make copies to manipulate to calc sum
  mova                 %2, %1       ; use originals for calc sse
  pmaddwd              %3, %3
  paddw                %4, %2       ; word sums of both row-pairs
  pmaddwd              %1, %1
  movhlps              %2, %4
  paddd                %6, %3
  paddw                %4, %2       ; fold high half into low half
  pxor                 %2, %2
  pcmpgtw              %2, %4       ; mask for 0 > %4 (sum)
  punpcklwd            %4, %2       ; sign-extend word to dword
  paddd                %6, %1
  paddd                %5, %4

%endmacro

; Horizontally reduce the dword accumulators m6 (sum) and m7 (sse),
; store SSE through the sse pointer and return the sum in rax.
; Clobbers m3/m4 as reduction scratch.
%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputing to a dword.
  movhlps              m3, m7
  movhlps              m4, m6
  paddd                m7, m3
  paddd                m6, m4
  pshufd               m3, m7, 0x1
  pshufd               m4, m6, 0x1
  paddd                m7, m3
  paddd                m6, m4
  mov                  r1, ssem         ; r1 = unsigned int *sse
  movd               [r1], m7           ; store sse
  movd                rax, m6           ; store sum as return value
%endif
  RET
%endmacro

; Advance srcq by one row. The stride argument is in 16-bit pixels, so the
; byte step is stride*2. On x86-32 PIC there is no spare register to hold a
; scaled stride, so the memory operand is simply added twice.
%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
  add                srcq, src_stridemp
  add                srcq, src_stridemp
%else
  lea                srcq, [srcq + src_strideq*2]
%endif
%endmacro

; SUBPEL_VARIANCE W [, AVG]
; Emits highbd_sub_pixel_variance<W>xh (or, with AVG=1,
; highbd_sub_pixel_avg_variance<W>xh): bilinear sub-pel interpolation of src
; at (x_offset, y_offset) eighth-pel phases, followed by sum/SSE accumulation
; against dst (and optional averaging with a second prediction, secq).
; The code dispatches on offset == 0 / 8 (half) / other, so each case uses
; the cheapest possible filter (copy, pavgw, or pmullw-based bilinear).
%macro SUBPEL_VARIANCE 1-2 0 ; W
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5   ; 32 bytes per filter-table entry


%ifdef PIC    ; 64bit PIC
  %if %2 == 1 ; avg
    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                               x_offset, y_offset, \
                                               dst, dst_stride, \
                                               sec, sec_stride, height, sse
    %define sec_str sec_strideq
  %else
    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
                                           y_offset, dst, dst_stride, height, sse
  %endif
  %define block_height heightd
  %define bilin_filter sseq   ; sse reg is free until STORE_AND_RET; reuse it
%else
  %if ARCH_X86=1 && CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst, dst_stride, \
                                                 sec, sec_stride, \
                                                 height, sse, g_bilin_filter, g_pw_8
      %define block_height dword heightm
      %define sec_str sec_stridemp

      ; Store bilin_filter and pw_8 location in stack
      %if GET_GOT_DEFINED == 1
        GET_GOT eax
        add esp, 4                ; restore esp
      %endif

      lea ecx, [GLOBAL(bilin_filter_m)]
      mov g_bilin_filterm, ecx

      lea ecx, [GLOBAL(pw_8)]
      mov g_pw_8m, ecx

      LOAD_IF_USED 0, 1         ; load eax, ecx back
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                             x_offset, y_offset, dst, dst_stride, height, \
                                             sse, g_bilin_filter, g_pw_8
      %define block_height heightd

      ; Store bilin_filter and pw_8 location in stack
      %if GET_GOT_DEFINED == 1
        GET_GOT eax
        add esp, 4                ; restore esp
      %endif

      lea ecx, [GLOBAL(bilin_filter_m)]
      mov g_bilin_filterm, ecx

      lea ecx, [GLOBAL(pw_8)]
      mov g_pw_8m, ecx

      LOAD_IF_USED 0, 1         ; load eax, ecx back
    %endif
  %else
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
                                                 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst, dst_stride, \
                                                 sec, sec_stride, \
                                                 height, sse
      %if ARCH_X86_64
        %define block_height heightd
        %define sec_str sec_strideq
      %else
        %define block_height dword heightm
        %define sec_str sec_stridemp
      %endif
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                             x_offset, y_offset, dst, dst_stride, height, sse
      %define block_height heightd
    %endif

    %define bilin_filter bilin_filter_m
  %endif
%endif

  ASSERT               %1 <= 16         ; m6 overflows if w > 16
  pxor                 m6, m6           ; sum
  pxor                 m7, m7           ; sse

%if %1 < 16
  sar                   block_height, 1 ; narrow blocks process 2 rows/iteration
%endif
%if %2 == 1 ; avg
  shl             sec_str, 1            ; sec stride: pixels -> bytes (16-bit pixels)
%endif

  ; FIXME(rbultje) replace by jumptable?
  test          x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test          y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m2, [srcq + 16]
  mova                 m1, [dstq]
  mova                 m3, [dstq + 16]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m2, [secq+16]
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq + src_strideq*2]
  mova                 m1, [dstq]
  mova                 m3, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m2, [secq]
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5  (vertical half-pel: pavgw of two rows)
.x_zero_y_half_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m4, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*2+16]
  mova                 m2, [dstq]
  mova                 m3, [dstq+16]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*4]
  mova                 m2, [dstq]
  mova                 m3, [dstq+dst_strideq*2]
  pavgw                m0, m1           ; row0 avg row1
  pavgw                m1, m5           ; row1 avg row2
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m1, [secq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           y_offsetd, filter_idx_shift   ; index -> 32-byte table offset
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
  mova                 m9, [bilin_filter+y_offsetq+16]
  mova                m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add           y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov               tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + 16]
  movu                 m4, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*2+16]
  mova                 m2, [dstq]
  mova                 m3, [dstq+16]
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
  pmullw               m1, filter_y_a
  pmullw               m5, filter_y_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m1, m5
  paddw                m0, m4
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*4]
  mova                 m4, m1           ; keep row1 for the second output row
  mova                 m2, [dstq]
  mova                 m3, [dstq+dst_strideq*2]
  pmullw               m1, filter_y_a
  pmullw               m5, filter_y_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m1, m5
  paddw                m0, m4
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m1, [secq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp           x_offsetd, 8
  jne .x_nonhalf
  ; x_offset == 0.5
  test          y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0  (horizontal half-pel: pavgw with +1 pixel)
.x_half_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + 16]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + 18]
  mova                 m2, [dstq]
  mova                 m3, [dstq + 16]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + src_strideq*2]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + src_strideq*2 + 2]
  mova                 m2, [dstq]
  mova                 m3, [dstq + dst_strideq*2]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m1, [secq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
  ; Pipeline: the previous row's horizontal average is kept live across
  ; iterations (m0/m1) so each loop body only filters one new row.
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
  pavgw                m1, m3
.x_half_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq + 16]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + 18]
  pavgw                m2, m4
  pavgw                m3, m5
  pavgw                m0, m2           ; vertical avg with previous row
  pavgw                m1, m3
  mova                 m4, [dstq]
  mova                 m5, [dstq + 16]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7
  mova                 m0, m2           ; carry current row to next iteration
  mova                 m1, m3

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
.x_half_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq + src_strideq*2]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + src_strideq*2 + 2]
  pavgw                m2, m4
  pavgw                m3, m5
  pavgw                m0, m2           ; out row 0
  pavgw                m2, m3           ; out row 1
  mova                 m4, [dstq]
  mova                 m5, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m2, [secq]
%endif
  SUM_SSE              m0, m4, m2, m5, m6, m7
  mova                 m0, m3           ; carry last filtered row

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
  mova                 m9, [bilin_filter+y_offsetq+16]
  mova                m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86_32
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add           y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov               tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
  pavgw                m1, m3
.x_half_y_other_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+16]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+18]
  pavgw                m2, m4
  pavgw                m3, m5
  mova                 m4, m2           ; save h-avg row for next iteration
  mova                 m5, m3
  pmullw               m1, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m1, filter_rnd
  paddw                m1, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  psrlw                m1, 4
  paddw                m0, m2
  mova                 m2, [dstq]
  psrlw                m0, 4
  mova                 m3, [dstq+16]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7
  mova                 m0, m4
  mova                 m1, m5

  lea                srcq, [srcq + src_strideq*2]
  lea                dstq, [dstq + dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
.x_half_y_other_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+src_strideq*2]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+src_strideq*2+2]
  pavgw                m2, m4
  pavgw                m3, m5
  mova                 m4, m2
  mova                 m5, m3           ; save last h-avg row for next iteration
  pmullw               m4, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m4, filter_rnd
  paddw                m4, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  psrlw                m4, 4
  paddw                m0, m2
  mova                 m2, [dstq]
  psrlw                m0, 4
  mova                 m3, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m4, [secq]
%endif
  SUM_SSE              m0, m2, m4, m3, m6, m7
  mova                 m0, m5

  lea                srcq, [srcq + src_strideq*4]
  lea                dstq, [dstq + dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test          y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add           x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov               tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  mova                 m4, [dstq]
  mova                 m5, [dstq+16]
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m1, m3
  paddw                m0, m2
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+src_strideq*2+2]
  mova                 m4, [dstq]
  mova                 m5, [dstq+dst_strideq*2]
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m1, m3
  paddw                m0, m2
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m1, [secq]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7

  lea                srcq, [srcq+src_strideq*4]
  lea                dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add           x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov               tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

  ; Prime the pipeline with the horizontally-filtered first row, then each
  ; loop iteration h-filters one new row and pavgw's it with the previous one.
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  paddw                m0, m2
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4
  lea                srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+16]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+18]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  mova                 m4, [dstq]
  mova                 m5, [dstq+16]
  psrlw                m2, 4
  psrlw                m3, 4
  pavgw                m0, m2
  pavgw                m1, m3
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7
  mova                 m0, m2
  mova                 m1, m3

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m2
  psrlw                m0, 4
  lea                srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+src_strideq*2]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+src_strideq*2+2]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  mova                 m4, [dstq]
  mova                 m5, [dstq+dst_strideq*2]
  psrlw                m2, 4
  psrlw                m3, 4
  pavgw                m0, m2           ; out row 0
  pavgw                m2, m3           ; out row 1
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m2, [secq]
%endif
  SUM_SSE              m0, m4, m2, m5, m6, m7
  mova                 m0, m3           ; carry last h-filtered row

  lea                srcq, [srcq+src_strideq*4]
  lea                dstq, [dstq+dst_strideq*4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
; loading filter - this is same as in 8-bit depth
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                m10, [bilin_filter+y_offsetq]
  mova                m11, [bilin_filter+y_offsetq+16]
  mova                m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; In this case, there is NO unused register. Used src_stride register. Later,
; src_stride has to be loaded from stack when it is needed.
%define tempq src_strideq
  mov               tempq, g_bilin_filterm
  add           x_offsetq, tempq
  add           y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov               tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
  add           y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif
; end of load filter

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
  ; Two-pass: horizontal bilin first (primed before the loop), then vertical
  ; bilin between the previous and current h-filtered rows.
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  movu                 m1, [srcq+16]
  movu                 m3, [srcq+18]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  paddw                m0, m2
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu                 m2, [srcq]
  movu                 m4, [srcq+2]
  movu                 m3, [srcq+16]
  movu                 m5, [srcq+18]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  psrlw                m2, 4
  psrlw                m3, 4
  mova                 m4, m2           ; save h-filtered row for next iteration
  mova                 m5, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, m2
  paddw                m1, filter_rnd
  mova                 m2, [dstq]
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4
  mova                 m3, [dstq+16]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  pavgw                m1, [secq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7
  mova                 m0, m4
  mova                 m1, m5

  INC_SRC_BY_SRC_STRIDE
  lea                dstq, [dstq + dst_strideq * 2]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m2
  psrlw                m0, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu                 m2, [srcq]
  movu                 m4, [srcq+2]
  INC_SRC_BY_SRC_STRIDE
  movu                 m3, [srcq]
  movu                 m5, [srcq+2]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  psrlw                m2, 4
  psrlw                m3, 4
  mova                 m4, m2
  mova                 m5, m3           ; save last h-filtered row
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m4, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, m2
  paddw                m4, filter_rnd
  mova                 m2, [dstq]
  paddw                m4, m3
  psrlw                m0, 4
  psrlw                m4, 4
  mova                 m3, [dstq+dst_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [secq]
  add                secq, sec_str
  pavgw                m4, [secq]
%endif
  SUM_SSE              m0, m2, m4, m3, m6, m7
  mova                 m0, m5

  INC_SRC_BY_SRC_STRIDE
  lea                dstq, [dstq + dst_strideq * 4]
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
%endif
  dec                   block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

; Instantiate the plain variance variants...
INIT_XMM sse2
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

; ...and the second-prediction (avg) variants.
INIT_XMM sse2
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1