;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times 8 dw 8
bilin_filter_m_sse2: times  8 dw 16
                     times  8 dw  0
                     times  8 dw 15
                     times  8 dw  1
                     times  8 dw 14
                     times  8 dw  2
                     times  8 dw 13
                     times  8 dw  3
                     times  8 dw 12
                     times  8 dw  4
                     times  8 dw 11
                     times  8 dw  5
                     times  8 dw 10
                     times  8 dw  6
                     times  8 dw  9
                     times  8 dw  7
                     times 16 dw  8
                     times  8 dw  7
                     times  8 dw  9
                     times  8 dw  6
                     times  8 dw 10
                     times  8 dw  5
                     times  8 dw 11
                     times  8 dw  4
                     times  8 dw 12
                     times  8 dw  3
                     times  8 dw 13
                     times  8 dw  2
                     times  8 dw 14
                     times  8 dw  1
                     times  8 dw 15

bilin_filter_m_ssse3: times  8 db 16,  0
                      times  8 db 15,  1
                      times  8 db 14,  2
                      times  8 db 13,  3
                      times  8 db 12,  4
                      times  8 db 11,  5
                      times  8 db 10,  6
                      times  8 db  9,  7
                      times 16 db  8
                      times  8 db  7,  9
                      times  8 db  6, 10
                      times  8 db  5, 11
                      times  8 db  4, 12
                      times  8 db  3, 13
                      times  8 db  2, 14
                      times  8 db  1, 15

SECTION .text

; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the sum of errors (SE) as its return value and
; stores the sum of squared errors (SSE) in the given pointer; a usage
; sketch appears at the end of this file.

%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw      %3, %4
  psubw      %1, %2
  paddw      %5, %3
  pmaddwd    %3, %3
  paddw      %5, %1
  pmaddwd    %1, %1
  paddd      %6, %3
  paddd      %6, %1
%endmacro

%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
  pcmpgtw    m5, m6       ; mask for 0 > x
  movhlps    m3, m7
  punpcklwd  m4, m6, m5
  punpckhwd  m6, m5       ; sign-extend m6 word->dword
  paddd      m7, m3
  paddd      m6, m4
  pshufd     m3, m7, 0x1
  movhlps    m4, m6
  paddd      m7, m3
  paddd      m6, m4
  mov        r1, ssem     ; r1 = unsigned int *sse
  pshufd     m4, m6, 0x1
  movd       [r1], m7     ; store sse
  paddd      m6, m4
  movd       rax, m6      ; store sum as return value
%else ; mmsize == 8
  pshufw     m4, m6, 0xe
  pshufw     m3, m7, 0xe
  paddw      m6, m4
  paddd      m7, m3
  pcmpgtw    m5, m6       ; mask for 0 > x
  mov        r1, ssem     ; r1 = unsigned int *sse
  punpcklwd  m6, m5       ; sign-extend m6 word->dword
  movd       [r1], m7     ; store sse
  pshufw     m4, m6, 0xe
  paddd      m6, m4
  movd       rax, m6      ; store sum as return value
%endif
  RET
%endmacro

%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
  add        srcq, src_stridemp
%else
  add        srcq, src_strideq
%endif
%endmacro

%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%else
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
%endif
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64.
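; Three prologue variants follow: 64-bit PIC keeps the bilinear filter table
; pointer in the register otherwise holding the sse argument, 32-bit PIC
; resolves the table addresses through GET_GOT and spills them to stack
; slots (g_bilin_filterm/g_pw_8m), and non-PIC addresses the tables
; directly.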
%ifdef PIC ; 64bit PIC
  %if %2 == 1 ; avg
    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                        x_offset, y_offset, \
                                        dst, dst_stride, \
                                        sec, sec_stride, height, sse
    %define sec_str sec_strideq
  %else
    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
                                    y_offset, dst, dst_stride, height, sse
  %endif
  %define h heightd
  %define bilin_filter sseq
%else
  %if ARCH_X86=1 && CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                          x_offset, y_offset, \
                                          dst, dst_stride, \
                                          sec, sec_stride, \
                                          height, sse, g_bilin_filter, g_pw_8
      %define h dword heightm
      %define sec_str sec_stridemp

      ; Store the bilin_filter and pw_8 locations on the stack
      GET_GOT eax
      add esp, 4  ; restore esp

      lea ecx, [GLOBAL(bilin_filter_m)]
      mov g_bilin_filterm, ecx

      lea ecx, [GLOBAL(pw_8)]
      mov g_pw_8m, ecx

      LOAD_IF_USED 0, 1  ; load eax, ecx back
    %else
      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                      y_offset, dst, dst_stride, height, \
                                      sse, g_bilin_filter, g_pw_8
      %define h heightd

      ; Store the bilin_filter and pw_8 locations on the stack
      GET_GOT eax
      add esp, 4  ; restore esp

      lea ecx, [GLOBAL(bilin_filter_m)]
      mov g_bilin_filterm, ecx

      lea ecx, [GLOBAL(pw_8)]
      mov g_pw_8m, ecx

      LOAD_IF_USED 0, 1  ; load eax, ecx back
    %endif
  %else
    %if %2 == 1 ; avg
      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
                                          7 + 2 * ARCH_X86_64, 13, \
                                          src, src_stride, \
                                          x_offset, y_offset, \
                                          dst, dst_stride, \
                                          sec, sec_stride, \
                                          height, sse
      %if ARCH_X86_64
        %define h heightd
        %define sec_str sec_strideq
      %else
        %define h dword heightm
        %define sec_str sec_stridemp
      %endif
    %else
      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                      y_offset, dst, dst_stride, height, sse
      %define h heightd
    %endif

    %define bilin_filter bilin_filter_m
  %endif
%endif

  ASSERT %1 <= 16  ; m6 overflows if w > 16
  pxor       m6, m6       ; sum
  pxor       m7, m7       ; sse
  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
  ; could perhaps use it for something more productive then.
  pxor       m5, m5       ; dedicated zero register
%if %1 < 16
  sar        h, 1
%if %2 == 1 ; avg
  shl        sec_str, 1
%endif
%endif

  ; FIXME(rbultje) replace with a jump table?
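  ; Each offset is either 0 (no filtering), 8 (a half-pel step, handled with
  ; pavgb) or some other 1/16th-pel step (handled with the bilinear filter
  ; tables), so the dispatch below selects one of nine specialized paths.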
  test       x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test       y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu       m0, [srcq]
  mova       m1, [dstq]
%if %2 == 1 ; avg
  pavgb      m0, [secq]
  punpckhbw  m3, m1, m5
  punpcklbw  m1, m5
%endif
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%if %2 == 0 ; !avg
  punpckhbw  m3, m1, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps     m0, [srcq+src_strideq]
%else ; mmsize == 8
  punpckldq  m0, [srcq+src_strideq]
%endif
%else ; !avg
  movh       m2, [srcq+src_strideq]
%endif
  movh       m1, [dstq]
  movh       m3, [dstq+dst_strideq]
%if %2 == 1 ; avg
  pavgb      m0, [secq]
  punpcklbw  m3, m5
  punpcklbw  m1, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else ; !avg
  punpcklbw  m0, m5
  punpcklbw  m2, m5
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp        y_offsetd, 8
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu       m0, [srcq]
  movu       m4, [srcq+src_strideq]
  mova       m1, [dstq]
  pavgb      m0, m4
  punpckhbw  m3, m1, m5
%if %2 == 1 ; avg
  pavgb      m0, [secq]
%endif
  punpcklbw  m1, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps     m2, [srcq+src_strideq*2]
%else ; mmsize == 8
%if %1 == 4
  movh       m1, [srcq+src_strideq*2]
  punpckldq  m2, m1
%else
  punpckldq  m2, [srcq+src_strideq*2]
%endif
%endif
  movh       m1, [dstq]
%if mmsize == 16
  movlhps    m0, m2
%else ; mmsize == 8
  punpckldq  m0, m2
%endif
  movh       m3, [dstq+dst_strideq]
  pavgb      m0, m2
  punpcklbw  m1, m5
  pavgb      m0, [secq]
  punpcklbw  m3, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else ; !avg
  movh       m4, [srcq+src_strideq*2]
  movh       m1, [dstq]
  pavgb      m0, m2
  movh       m3, [dstq+dst_strideq]
  pavgb      m2, m4
  punpcklbw  m0, m5
  punpcklbw  m2, m5
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl        y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova       m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m9, [bilin_filter+y_offsetq+16]
%endif
  mova       m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, so we can reuse the x_offset reg
%define tempq x_offsetq
  add        y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov        tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add        y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif
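; A worked example of the filtering below (assuming the sse2 table and
; y_offset == 4, i.e. a 4/16th-pel step): filter_y_a holds eight copies of
; 12 and filter_y_b eight copies of 4, so a top pixel of 100 over a bottom
; pixel of 20 yields (12*100 + 4*20 + 8) >> 4 == 80.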
.x_zero_y_other_loop:
%if %1 == 16
  movu       m0, [srcq]
  movu       m4, [srcq+src_strideq]
  mova       m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw  m2, m0, m4
  punpcklbw  m0, m4
  pmaddubsw  m2, filter_y_a
  pmaddubsw  m0, filter_y_a
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
%else
  punpckhbw  m2, m0, m5
  punpckhbw  m3, m4, m5
  punpcklbw  m0, m5
  punpcklbw  m4, m5
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). The total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so it might
  ; be slightly faster because of pmullw latency. It would also cut our
  ; rodata tables in half for this function, and save 1-2 registers on
  ; x86-64.
  pmullw     m2, filter_y_a
  pmullw     m3, filter_y_b
  paddw      m2, filter_rnd
  pmullw     m0, filter_y_a
  pmullw     m4, filter_y_b
  paddw      m0, filter_rnd
  paddw      m2, m3
  paddw      m0, m4
%endif
  psraw      m2, 4
  psraw      m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpckhbw  m3, m1, m5
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m2, [srcq+src_strideq]
  movh       m4, [srcq+src_strideq*2]
  movh       m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movh       m1, [dstq]
  punpcklbw  m0, m2
  punpcklbw  m2, m4
  pmaddubsw  m0, filter_y_a
  pmaddubsw  m2, filter_y_a
  punpcklbw  m3, m5
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
%else
  punpcklbw  m0, m5
  punpcklbw  m2, m5
  punpcklbw  m4, m5
  pmullw     m0, filter_y_a
  pmullw     m1, m2, filter_y_b
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  pmullw     m2, filter_y_a
  pmullw     m4, filter_y_b
  paddw      m0, m1
  paddw      m2, filter_rnd
  movh       m1, [dstq]
  paddw      m2, m4
%endif
  psraw      m0, 4
  psraw      m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp        x_offsetd, 8
  jne .x_nonhalf
  ; x_offset == 0.5
  test       y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu       m0, [srcq]
  movu       m4, [srcq+1]
  mova       m1, [dstq]
  pavgb      m0, m4
  punpckhbw  m3, m1, m5
%if %2 == 1 ; avg
  pavgb      m0, [secq]
%endif
  punpcklbw  m1, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m4, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps     m0, [srcq+src_strideq]
  movhps     m4, [srcq+src_strideq+1]
%else ; mmsize == 8
  punpckldq  m0, [srcq+src_strideq]
  punpckldq  m4, [srcq+src_strideq+1]
%endif
  movh       m1, [dstq]
  movh       m3, [dstq+dst_strideq]
  pavgb      m0, m4
  punpcklbw  m3, m5
  pavgb      m0, [secq]
  punpcklbw  m1, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else ; !avg
  movh       m2, [srcq+src_strideq]
  movh       m1, [dstq]
  pavgb      m0, m4
  movh       m4, [srcq+src_strideq+1]
  movh       m3, [dstq+dst_strideq]
  pavgb      m2, m4
  punpcklbw  m0, m5
  punpcklbw  m2, m5
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg .x_half_y_zero_loop
  STORE_AND_RET
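  ; The half-pel cases above and below rely on pavgb, which computes the
  ; rounded average (a + b + 1) >> 1 per byte; this matches the 8/16
  ; bilinear tap exactly, since (8*a + 8*b + 8) >> 4 == (a + b + 1) >> 1.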
.x_half_y_nonzero:
  cmp        y_offsetd, 8
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  movu       m0, [srcq]
  movu       m3, [srcq+1]
  add        srcq, src_strideq
  pavgb      m0, m3
.x_half_y_half_loop:
  movu       m4, [srcq]
  movu       m3, [srcq+1]
  mova       m1, [dstq]
  pavgb      m4, m3
  punpckhbw  m3, m1, m5
  pavgb      m0, m4
%if %2 == 1 ; avg
  punpcklbw  m1, m5
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m3, [srcq+1]
  add        srcq, src_strideq
  pavgb      m0, m3
.x_half_y_half_loop:
  movh       m2, [srcq]
  movh       m3, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
  movhps     m2, [srcq+src_strideq]
  movhps     m3, [srcq+src_strideq+1]
%else
%if %1 == 4
  movh       m1, [srcq+src_strideq]
  punpckldq  m2, m1
  movh       m1, [srcq+src_strideq+1]
  punpckldq  m3, m1
%else
  punpckldq  m2, [srcq+src_strideq]
  punpckldq  m3, [srcq+src_strideq+1]
%endif
%endif
  pavgb      m2, m3
%if mmsize == 16
  movlhps    m0, m2
  movhlps    m4, m2
%else ; mmsize == 8
  punpckldq  m0, m2
  pshufw     m4, m2, 0xe
%endif
  movh       m1, [dstq]
  pavgb      m0, m2
  movh       m3, [dstq+dst_strideq]
  pavgb      m0, [secq]
  punpcklbw  m3, m5
  punpcklbw  m1, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else ; !avg
  movh       m4, [srcq+src_strideq]
  movh       m1, [srcq+src_strideq+1]
  pavgb      m2, m3
  pavgb      m4, m1
  pavgb      m0, m2
  pavgb      m2, m4
  movh       m1, [dstq]
  movh       m3, [dstq+dst_strideq]
  punpcklbw  m0, m5
  punpcklbw  m2, m5
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg .x_half_y_half_loop
  STORE_AND_RET
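  ; Note the row reuse in the y-filtering loops: the second source row of
  ; one iteration is kept in m4 and copied back to m0 (mova m0, m4) so the
  ; next iteration can use it as its first row, halving the loads and
  ; horizontal averaging work.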
.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl        y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova       m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m9, [bilin_filter+y_offsetq+16]
%endif
  mova       m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5, so we can reuse the x_offset reg
%define tempq x_offsetq
  add        y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov        tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add        y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu       m0, [srcq]
  movu       m3, [srcq+1]
  add        srcq, src_strideq
  pavgb      m0, m3
.x_half_y_other_loop:
  movu       m4, [srcq]
  movu       m2, [srcq+1]
  mova       m1, [dstq]
  pavgb      m4, m2
%if cpuflag(ssse3)
  punpckhbw  m2, m0, m4
  punpcklbw  m0, m4
  pmaddubsw  m2, filter_y_a
  pmaddubsw  m0, filter_y_a
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
  psraw      m2, 4
%else
  punpckhbw  m2, m0, m5
  punpckhbw  m3, m4, m5
  pmullw     m2, filter_y_a
  pmullw     m3, filter_y_b
  paddw      m2, filter_rnd
  punpcklbw  m0, m5
  paddw      m2, m3
  punpcklbw  m3, m4, m5
  pmullw     m0, filter_y_a
  pmullw     m3, filter_y_b
  paddw      m0, filter_rnd
  psraw      m2, 4
  paddw      m0, m3
%endif
  punpckhbw  m3, m1, m5
  psraw      m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m3, [srcq+1]
  add        srcq, src_strideq
  pavgb      m0, m3
%if notcpuflag(ssse3)
  punpcklbw  m0, m5
%endif
.x_half_y_other_loop:
  movh       m2, [srcq]
  movh       m1, [srcq+1]
  movh       m4, [srcq+src_strideq]
  movh       m3, [srcq+src_strideq+1]
  pavgb      m2, m1
  pavgb      m4, m3
  movh       m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movh       m1, [dstq]
  punpcklbw  m0, m2
  punpcklbw  m2, m4
  pmaddubsw  m0, filter_y_a
  pmaddubsw  m2, filter_y_a
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  paddw      m2, filter_rnd
%else
  punpcklbw  m2, m5
  punpcklbw  m4, m5
  pmullw     m0, filter_y_a
  pmullw     m1, m2, filter_y_b
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  pmullw     m2, filter_y_a
  paddw      m0, m1
  pmullw     m1, m4, filter_y_b
  paddw      m2, filter_rnd
  paddw      m2, m1
  movh       m1, [dstq]
%endif
  psraw      m0, 4
  psraw      m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test       y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl        x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova       m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m9, [bilin_filter+x_offsetq+16]
%endif
  mova       m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0, so we can reuse the y_offset reg
%define tempq y_offsetq
  add        x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov        tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add        x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu       m0, [srcq]
  movu       m4, [srcq+1]
  mova       m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw  m2, m0, m4
  punpcklbw  m0, m4
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m0, filter_x_a
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
%else
  punpckhbw  m2, m0, m5
  punpckhbw  m3, m4, m5
  punpcklbw  m0, m5
  punpcklbw  m4, m5
  pmullw     m2, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m2, filter_rnd
  pmullw     m0, filter_x_a
  pmullw     m4, filter_x_b
  paddw      m0, filter_rnd
  paddw      m2, m3
  paddw      m0, m4
%endif
  psraw      m2, 4
  psraw      m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpckhbw  m3, m1, m5
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m1, [srcq+1]
  movh       m2, [srcq+src_strideq]
  movh       m4, [srcq+src_strideq+1]
  movh       m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  punpcklbw  m0, m1
  movh       m1, [dstq]
  punpcklbw  m2, m4
  pmaddubsw  m0, filter_x_a
  pmaddubsw  m2, filter_x_a
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  paddw      m2, filter_rnd
%else
  punpcklbw  m0, m5
  punpcklbw  m1, m5
  punpcklbw  m2, m5
  punpcklbw  m4, m5
  pmullw     m0, filter_x_a
  pmullw     m1, filter_x_b
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  pmullw     m2, filter_x_a
  pmullw     m4, filter_x_b
  paddw      m0, m1
  paddw      m2, filter_rnd
  movh       m1, [dstq]
  paddw      m2, m4
%endif
  psraw      m0, 4
  psraw      m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET
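  ; On ssse3, the horizontal filter interleaves the two inputs bytewise and
  ; uses one pmaddubsw against the packed (16-x, x) byte pairs, so each word
  ; lane directly computes (16-x)*a + x*b; the sse2 path gets the same
  ; result by unpacking both inputs to words and using two pmullw plus a
  ; paddw.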
.x_nonhalf_y_nonzero:
  cmp        y_offsetd, 8
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl        x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova       m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m9, [bilin_filter+x_offsetq+16]
%endif
  mova       m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5, so we can reuse the y_offset reg
%define tempq y_offsetq
  add        x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov        tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add        x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  movu       m0, [srcq]
  movu       m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw  m2, m0, m1
  punpcklbw  m0, m1
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m0, filter_x_a
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
%else
  punpckhbw  m2, m0, m5
  punpckhbw  m3, m1, m5
  punpcklbw  m0, m5
  punpcklbw  m1, m5
  pmullw     m0, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m0, filter_rnd
  pmullw     m2, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m2, filter_rnd
  paddw      m0, m1
  paddw      m2, m3
%endif
  psraw      m0, 4
  psraw      m2, 4
  add        srcq, src_strideq
  packuswb   m0, m2
.x_other_y_half_loop:
  movu       m4, [srcq]
  movu       m3, [srcq+1]
%if cpuflag(ssse3)
  mova       m1, [dstq]
  punpckhbw  m2, m4, m3
  punpcklbw  m4, m3
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m4, filter_x_a
  paddw      m2, filter_rnd
  paddw      m4, filter_rnd
  psraw      m2, 4
  psraw      m4, 4
  packuswb   m4, m2
  pavgb      m0, m4
  punpckhbw  m3, m1, m5
  punpcklbw  m1, m5
%else
  punpckhbw  m2, m4, m5
  punpckhbw  m1, m3, m5
  punpcklbw  m4, m5
  punpcklbw  m3, m5
  pmullw     m4, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m4, filter_rnd
  pmullw     m2, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m2, filter_rnd
  paddw      m4, m3
  paddw      m2, m1
  mova       m1, [dstq]
  psraw      m4, 4
  psraw      m2, 4
  punpckhbw  m3, m1, m5
  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
  ; have a 1-register shortage to be able to store the backup of the bilin
  ; filtered second line as words as cache for the next line. Packing into
  ; a byte costs 1 pack and 2 unpacks, but saves a register.
  packuswb   m4, m2
  punpcklbw  m1, m5
  pavgb      m0, m4
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  pavgb      m0, [secq]
%endif
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw  m0, m1
  pmaddubsw  m0, filter_x_a
  paddw      m0, filter_rnd
%else
  punpcklbw  m0, m5
  punpcklbw  m1, m5
  pmullw     m0, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m0, filter_rnd
  paddw      m0, m1
%endif
  add        srcq, src_strideq
  psraw      m0, 4
.x_other_y_half_loop:
  movh       m2, [srcq]
  movh       m1, [srcq+1]
  movh       m4, [srcq+src_strideq]
  movh       m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
  punpcklbw  m2, m1
  punpcklbw  m4, m3
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m4, filter_x_a
  movh       m1, [dstq]
  movh       m3, [dstq+dst_strideq]
  paddw      m2, filter_rnd
  paddw      m4, filter_rnd
%else
  punpcklbw  m2, m5
  punpcklbw  m1, m5
  punpcklbw  m4, m5
  punpcklbw  m3, m5
  pmullw     m2, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m2, filter_rnd
  pmullw     m4, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m4, filter_rnd
  paddw      m2, m1
  movh       m1, [dstq]
  paddw      m4, m3
  movh       m3, [dstq+dst_strideq]
%endif
  psraw      m2, 4
  psraw      m4, 4
  pavgw      m0, m2
  pavgw      m2, m4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline - also consider going to bytes here
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpcklbw  m3, m5
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET
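  ; In the final case below, the x86-32 PIC build has no register left for
  ; a table pointer, so src_strideq itself is repurposed; this is why the
  ; loops advance the source pointer via INC_SRC_BY_SRC_STRIDE (defined at
  ; the top of the file), which reloads the stride from its stack slot
  ; (src_stridemp) on such builds.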
.x_nonhalf_y_nonhalf:
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl        x_offsetd, filter_idx_shift
  shl        y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova       m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m9, [bilin_filter+x_offsetq+16]
%endif
  mova       m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m11, [bilin_filter+y_offsetq+16]
%endif
  mova       m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; In this case there is no unused register left, so the src_stride register
; is reused; src_stride then has to be reloaded from the stack whenever it
; is needed.
%define tempq src_strideq
  mov        tempq, g_bilin_filterm
  add        x_offsetq, tempq
  add        y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov        tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add        x_offsetq, bilin_filter
  add        y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  movu       m0, [srcq]
  movu       m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw  m2, m0, m1
  punpcklbw  m0, m1
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m0, filter_x_a
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
%else
  punpckhbw  m2, m0, m5
  punpckhbw  m3, m1, m5
  punpcklbw  m0, m5
  punpcklbw  m1, m5
  pmullw     m0, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m0, filter_rnd
  pmullw     m2, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m2, filter_rnd
  paddw      m0, m1
  paddw      m2, m3
%endif
  psraw      m0, 4
  psraw      m2, 4

  INC_SRC_BY_SRC_STRIDE

  packuswb   m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
  movu       m4, [srcq]
  movu       m3, [srcq+1]
  mova       m1, [dstq]
  punpckhbw  m2, m4, m3
  punpcklbw  m4, m3
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m4, filter_x_a
  punpckhbw  m3, m1, m5
  paddw      m2, filter_rnd
  paddw      m4, filter_rnd
  psraw      m2, 4
  psraw      m4, 4
  packuswb   m4, m2
  punpckhbw  m2, m0, m4
  punpcklbw  m0, m4
  pmaddubsw  m2, filter_y_a
  pmaddubsw  m0, filter_y_a
  punpcklbw  m1, m5
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
  psraw      m2, 4
  psraw      m0, 4
%else
  movu       m3, [srcq]
  movu       m4, [srcq+1]
  punpckhbw  m1, m3, m5
  punpckhbw  m2, m4, m5
  punpcklbw  m3, m5
  punpcklbw  m4, m5
  pmullw     m3, filter_x_a
  pmullw     m4, filter_x_b
  paddw      m3, filter_rnd
  pmullw     m1, filter_x_a
  pmullw     m2, filter_x_b
  paddw      m1, filter_rnd
  paddw      m3, m4
  paddw      m1, m2
  psraw      m3, 4
  psraw      m1, 4
  packuswb   m4, m3, m1
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
  pmullw     m2, filter_y_a
  pmullw     m1, filter_y_b
  paddw      m2, filter_rnd
  pmullw     m0, filter_y_a
  pmullw     m3, filter_y_b
  paddw      m2, m1
  mova       m1, [dstq]
  paddw      m0, filter_rnd
  psraw      m2, 4
  paddw      m0, m3
  punpckhbw  m3, m1, m5
  psraw      m0, 4
  punpcklbw  m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  INC_SRC_BY_SRC_STRIDE
  add        dstq, dst_strideq
%else ; %1 < 16
  movh       m0, [srcq]
  movh       m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw  m0, m1
  pmaddubsw  m0, filter_x_a
  paddw      m0, filter_rnd
%else
  punpcklbw  m0, m5
  punpcklbw  m1, m5
  pmullw     m0, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m0, filter_rnd
  paddw      m0, m1
%endif
  psraw      m0, 4
%if cpuflag(ssse3)
  packuswb   m0, m0
%endif

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movh       m2, [srcq]
  movh       m1, [srcq+1]

  INC_SRC_BY_SRC_STRIDE
  movh       m4, [srcq]
  movh       m3, [srcq+1]

%if cpuflag(ssse3)
  punpcklbw  m2, m1
  punpcklbw  m4, m3
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m4, filter_x_a
  movh       m3, [dstq+dst_strideq]
  movh       m1, [dstq]
  paddw      m2, filter_rnd
  paddw      m4, filter_rnd
  psraw      m2, 4
  psraw      m4, 4
  packuswb   m2, m2
  packuswb   m4, m4
  punpcklbw  m0, m2
  punpcklbw  m2, m4
  pmaddubsw  m0, filter_y_a
  pmaddubsw  m2, filter_y_a
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  paddw      m2, filter_rnd
  psraw      m0, 4
  psraw      m2, 4
  punpcklbw  m1, m5
%else
  punpcklbw  m2, m5
  punpcklbw  m1, m5
  punpcklbw  m4, m5
  punpcklbw  m3, m5
  pmullw     m2, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m2, filter_rnd
  pmullw     m4, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m4, filter_rnd
  paddw      m2, m1
  paddw      m4, m3
  psraw      m2, 4
  psraw      m4, 4
  pmullw     m0, filter_y_a
  pmullw     m3, m2, filter_y_b
  paddw      m0, filter_rnd
  pmullw     m2, filter_y_a
  pmullw     m1, m4, filter_y_b
  paddw      m2, filter_rnd
  paddw      m0, m3
  movh       m3, [dstq+dst_strideq]
  paddw      m2, m1
  movh       m1, [dstq]
  psraw      m0, 4
  psraw      m2, 4
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4

  INC_SRC_BY_SRC_STRIDE
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        h
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8 && y=0,8) are
; identical between the ssse3 and non-ssse3 versions. It may make sense to
; merge their code in the sense that the ssse3 version would jump to the
; appropriate location in the sse/2 version, rather than duplicating that
; code in the binary.

INIT_MMX sse
SUBPEL_VARIANCE  4
INIT_XMM sse2
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_MMX ssse3
SUBPEL_VARIANCE  4
INIT_XMM ssse3
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_MMX sse
SUBPEL_VARIANCE  4, 1
INIT_XMM sse2
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1

INIT_MMX ssse3
SUBPEL_VARIANCE  4, 1
INIT_XMM ssse3
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1
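; For reference, a minimal usage sketch from the C side. This is an
; illustrative assumption, not part of this file: the exact symbol name
; depends on the x86inc name mangling, and the caller is assumed to derive
; the variance from the two outputs as variance = SSE - SE^2 / (W*H),
; here for a 16x16 block:
;
;   unsigned int sse;
;   int se = vp9_sub_pixel_variance16xh_sse2(src, src_stride,
;                                            x_offset, y_offset,
;                                            dst, dst_stride, 16, &sse);
;   unsigned int variance = sse - (unsigned int)(((int64_t)se * se) >> 8);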