191037db265ecdd914a26e056cf69207b4f50924ehkuang; 291037db265ecdd914a26e056cf69207b4f50924ehkuang; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 391037db265ecdd914a26e056cf69207b4f50924ehkuang; 491037db265ecdd914a26e056cf69207b4f50924ehkuang; Use of this source code is governed by a BSD-style license 591037db265ecdd914a26e056cf69207b4f50924ehkuang; that can be found in the LICENSE file in the root of the source 691037db265ecdd914a26e056cf69207b4f50924ehkuang; tree. An additional intellectual property rights grant can be found 791037db265ecdd914a26e056cf69207b4f50924ehkuang; in the file PATENTS. All contributing project authors may 891037db265ecdd914a26e056cf69207b4f50924ehkuang; be found in the AUTHORS file in the root of the source tree. 991037db265ecdd914a26e056cf69207b4f50924ehkuang; 1091037db265ecdd914a26e056cf69207b4f50924ehkuang 1191037db265ecdd914a26e056cf69207b4f50924ehkuang%include "third_party/x86inc/x86inc.asm" 1291037db265ecdd914a26e056cf69207b4f50924ehkuang 1391037db265ecdd914a26e056cf69207b4f50924ehkuangSECTION_RODATA 1491037db265ecdd914a26e056cf69207b4f50924ehkuangpw_8: times 8 dw 8 1591037db265ecdd914a26e056cf69207b4f50924ehkuangbilin_filter_m_sse2: times 8 dw 16 1691037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 dw 0 1791037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 dw 14 1891037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 dw 2 1991037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 dw 12 2091037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 dw 4 2191037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 dw 10 2291037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 dw 6 2391037db265ecdd914a26e056cf69207b4f50924ehkuang times 16 dw 8 2491037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 dw 6 2591037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 dw 10 2691037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 dw 4 2791037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 dw 12 
2891037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 dw 2 2991037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 dw 14 3091037db265ecdd914a26e056cf69207b4f50924ehkuang 3191037db265ecdd914a26e056cf69207b4f50924ehkuangbilin_filter_m_ssse3: times 8 db 16, 0 3291037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 db 14, 2 3391037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 db 12, 4 3491037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 db 10, 6 3591037db265ecdd914a26e056cf69207b4f50924ehkuang times 16 db 8 3691037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 db 6, 10 3791037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 db 4, 12 3891037db265ecdd914a26e056cf69207b4f50924ehkuang times 8 db 2, 14 3991037db265ecdd914a26e056cf69207b4f50924ehkuang 4091037db265ecdd914a26e056cf69207b4f50924ehkuangSECTION .text 4191037db265ecdd914a26e056cf69207b4f50924ehkuang 427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, 4391037db265ecdd914a26e056cf69207b4f50924ehkuang; int x_offset, int y_offset, 4491037db265ecdd914a26e056cf69207b4f50924ehkuang; const uint8_t *dst, ptrdiff_t dst_stride, 4591037db265ecdd914a26e056cf69207b4f50924ehkuang; int height, unsigned int *sse); 4691037db265ecdd914a26e056cf69207b4f50924ehkuang; 4791037db265ecdd914a26e056cf69207b4f50924ehkuang; This function returns the SE and stores SSE in the given pointer. 
4891037db265ecdd914a26e056cf69207b4f50924ehkuang 4991037db265ecdd914a26e056cf69207b4f50924ehkuang%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse 5091037db265ecdd914a26e056cf69207b4f50924ehkuang psubw %3, %4 5191037db265ecdd914a26e056cf69207b4f50924ehkuang psubw %1, %2 5291037db265ecdd914a26e056cf69207b4f50924ehkuang paddw %5, %3 5391037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddwd %3, %3 5491037db265ecdd914a26e056cf69207b4f50924ehkuang paddw %5, %1 5591037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddwd %1, %1 5691037db265ecdd914a26e056cf69207b4f50924ehkuang paddd %6, %3 5791037db265ecdd914a26e056cf69207b4f50924ehkuang paddd %6, %1 5891037db265ecdd914a26e056cf69207b4f50924ehkuang%endmacro 5991037db265ecdd914a26e056cf69207b4f50924ehkuang 6068e1c830ade592be74773e249bf94e2bbfb50de7Johann%macro STORE_AND_RET 1 6168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 6291037db265ecdd914a26e056cf69207b4f50924ehkuang ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit 6391037db265ecdd914a26e056cf69207b4f50924ehkuang ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. 6491037db265ecdd914a26e056cf69207b4f50924ehkuang ; We have to sign-extend it before adding the words within the register 6591037db265ecdd914a26e056cf69207b4f50924ehkuang ; and outputing to a dword. 
6691037db265ecdd914a26e056cf69207b4f50924ehkuang pcmpgtw m5, m6 ; mask for 0 > x 6791037db265ecdd914a26e056cf69207b4f50924ehkuang movhlps m3, m7 6891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklwd m4, m6, m5 6991037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhwd m6, m5 ; sign-extend m6 word->dword 7091037db265ecdd914a26e056cf69207b4f50924ehkuang paddd m7, m3 7191037db265ecdd914a26e056cf69207b4f50924ehkuang paddd m6, m4 7291037db265ecdd914a26e056cf69207b4f50924ehkuang pshufd m3, m7, 0x1 7391037db265ecdd914a26e056cf69207b4f50924ehkuang movhlps m4, m6 7491037db265ecdd914a26e056cf69207b4f50924ehkuang paddd m7, m3 7591037db265ecdd914a26e056cf69207b4f50924ehkuang paddd m6, m4 7691037db265ecdd914a26e056cf69207b4f50924ehkuang mov r1, ssem ; r1 = unsigned int *sse 7791037db265ecdd914a26e056cf69207b4f50924ehkuang pshufd m4, m6, 0x1 7891037db265ecdd914a26e056cf69207b4f50924ehkuang movd [r1], m7 ; store sse 7991037db265ecdd914a26e056cf69207b4f50924ehkuang paddd m6, m4 807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movd raxd, m6 ; store sum as return value 8168e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh 8268e1c830ade592be74773e249bf94e2bbfb50de7Johann pshuflw m4, m6, 0xe 8368e1c830ade592be74773e249bf94e2bbfb50de7Johann pshuflw m3, m7, 0xe 8491037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m6, m4 8591037db265ecdd914a26e056cf69207b4f50924ehkuang paddd m7, m3 8691037db265ecdd914a26e056cf69207b4f50924ehkuang pcmpgtw m5, m6 ; mask for 0 > x 8791037db265ecdd914a26e056cf69207b4f50924ehkuang mov r1, ssem ; r1 = unsigned int *sse 8891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklwd m6, m5 ; sign-extend m6 word->dword 8991037db265ecdd914a26e056cf69207b4f50924ehkuang movd [r1], m7 ; store sse 9068e1c830ade592be74773e249bf94e2bbfb50de7Johann pshuflw m4, m6, 0xe 9191037db265ecdd914a26e056cf69207b4f50924ehkuang paddd m6, m4 927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movd raxd, m6 ; store sum as return value 
9391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 9491037db265ecdd914a26e056cf69207b4f50924ehkuang RET 9591037db265ecdd914a26e056cf69207b4f50924ehkuang%endmacro 9691037db265ecdd914a26e056cf69207b4f50924ehkuang 979b35249446b07f40ac5fcc3205f2c048616efacchkuang%macro INC_SRC_BY_SRC_STRIDE 0 989b35249446b07f40ac5fcc3205f2c048616efacchkuang%if ARCH_X86=1 && CONFIG_PIC=1 999b35249446b07f40ac5fcc3205f2c048616efacchkuang add srcq, src_stridemp 1009b35249446b07f40ac5fcc3205f2c048616efacchkuang%else 1019b35249446b07f40ac5fcc3205f2c048616efacchkuang add srcq, src_strideq 1029b35249446b07f40ac5fcc3205f2c048616efacchkuang%endif 1039b35249446b07f40ac5fcc3205f2c048616efacchkuang%endmacro 1049b35249446b07f40ac5fcc3205f2c048616efacchkuang 10591037db265ecdd914a26e056cf69207b4f50924ehkuang%macro SUBPEL_VARIANCE 1-2 0 ; W 10691037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 10791037db265ecdd914a26e056cf69207b4f50924ehkuang%define bilin_filter_m bilin_filter_m_ssse3 10891037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_idx_shift 4 10991037db265ecdd914a26e056cf69207b4f50924ehkuang%else 11091037db265ecdd914a26e056cf69207b4f50924ehkuang%define bilin_filter_m bilin_filter_m_sse2 11191037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_idx_shift 5 11291037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 11391037db265ecdd914a26e056cf69207b4f50924ehkuang; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses 11491037db265ecdd914a26e056cf69207b4f50924ehkuang; 11, not 13, if the registers are ordered correctly. 
May make a minor speed 11591037db265ecdd914a26e056cf69207b4f50924ehkuang; difference on Win64 1169b35249446b07f40ac5fcc3205f2c048616efacchkuang 1179b35249446b07f40ac5fcc3205f2c048616efacchkuang%ifdef PIC ; 64bit PIC 1189b35249446b07f40ac5fcc3205f2c048616efacchkuang %if %2 == 1 ; avg 1199b35249446b07f40ac5fcc3205f2c048616efacchkuang cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ 1209b35249446b07f40ac5fcc3205f2c048616efacchkuang x_offset, y_offset, \ 1219b35249446b07f40ac5fcc3205f2c048616efacchkuang dst, dst_stride, \ 1229b35249446b07f40ac5fcc3205f2c048616efacchkuang sec, sec_stride, height, sse 1239b35249446b07f40ac5fcc3205f2c048616efacchkuang %define sec_str sec_strideq 1249b35249446b07f40ac5fcc3205f2c048616efacchkuang %else 1259b35249446b07f40ac5fcc3205f2c048616efacchkuang cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ 1269b35249446b07f40ac5fcc3205f2c048616efacchkuang y_offset, dst, dst_stride, height, sse 1279b35249446b07f40ac5fcc3205f2c048616efacchkuang %endif 1287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian %define block_height heightd 1299b35249446b07f40ac5fcc3205f2c048616efacchkuang %define bilin_filter sseq 13091037db265ecdd914a26e056cf69207b4f50924ehkuang%else 1319b35249446b07f40ac5fcc3205f2c048616efacchkuang %if ARCH_X86=1 && CONFIG_PIC=1 1329b35249446b07f40ac5fcc3205f2c048616efacchkuang %if %2 == 1 ; avg 1339b35249446b07f40ac5fcc3205f2c048616efacchkuang cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ 1349b35249446b07f40ac5fcc3205f2c048616efacchkuang x_offset, y_offset, \ 1359b35249446b07f40ac5fcc3205f2c048616efacchkuang dst, dst_stride, \ 1369b35249446b07f40ac5fcc3205f2c048616efacchkuang sec, sec_stride, \ 1379b35249446b07f40ac5fcc3205f2c048616efacchkuang height, sse, g_bilin_filter, g_pw_8 1387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian %define block_height dword heightm 1399b35249446b07f40ac5fcc3205f2c048616efacchkuang %define sec_str sec_stridemp 
1409b35249446b07f40ac5fcc3205f2c048616efacchkuang 1419b35249446b07f40ac5fcc3205f2c048616efacchkuang ;Store bilin_filter and pw_8 location in stack 14268e1c830ade592be74773e249bf94e2bbfb50de7Johann %if GET_GOT_DEFINED == 1 14368e1c830ade592be74773e249bf94e2bbfb50de7Johann GET_GOT eax 14468e1c830ade592be74773e249bf94e2bbfb50de7Johann add esp, 4 ; restore esp 14568e1c830ade592be74773e249bf94e2bbfb50de7Johann %endif 1469b35249446b07f40ac5fcc3205f2c048616efacchkuang 1479b35249446b07f40ac5fcc3205f2c048616efacchkuang lea ecx, [GLOBAL(bilin_filter_m)] 1489b35249446b07f40ac5fcc3205f2c048616efacchkuang mov g_bilin_filterm, ecx 1499b35249446b07f40ac5fcc3205f2c048616efacchkuang 1509b35249446b07f40ac5fcc3205f2c048616efacchkuang lea ecx, [GLOBAL(pw_8)] 1519b35249446b07f40ac5fcc3205f2c048616efacchkuang mov g_pw_8m, ecx 1529b35249446b07f40ac5fcc3205f2c048616efacchkuang 1539b35249446b07f40ac5fcc3205f2c048616efacchkuang LOAD_IF_USED 0, 1 ; load eax, ecx back 1549b35249446b07f40ac5fcc3205f2c048616efacchkuang %else 1559b35249446b07f40ac5fcc3205f2c048616efacchkuang cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ 1569b35249446b07f40ac5fcc3205f2c048616efacchkuang y_offset, dst, dst_stride, height, sse, \ 1579b35249446b07f40ac5fcc3205f2c048616efacchkuang g_bilin_filter, g_pw_8 1587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian %define block_height heightd 1599b35249446b07f40ac5fcc3205f2c048616efacchkuang 1609b35249446b07f40ac5fcc3205f2c048616efacchkuang ;Store bilin_filter and pw_8 location in stack 16168e1c830ade592be74773e249bf94e2bbfb50de7Johann %if GET_GOT_DEFINED == 1 16268e1c830ade592be74773e249bf94e2bbfb50de7Johann GET_GOT eax 16368e1c830ade592be74773e249bf94e2bbfb50de7Johann add esp, 4 ; restore esp 16468e1c830ade592be74773e249bf94e2bbfb50de7Johann %endif 1659b35249446b07f40ac5fcc3205f2c048616efacchkuang 1669b35249446b07f40ac5fcc3205f2c048616efacchkuang lea ecx, [GLOBAL(bilin_filter_m)] 1679b35249446b07f40ac5fcc3205f2c048616efacchkuang mov 
g_bilin_filterm, ecx 1689b35249446b07f40ac5fcc3205f2c048616efacchkuang 1699b35249446b07f40ac5fcc3205f2c048616efacchkuang lea ecx, [GLOBAL(pw_8)] 1709b35249446b07f40ac5fcc3205f2c048616efacchkuang mov g_pw_8m, ecx 1719b35249446b07f40ac5fcc3205f2c048616efacchkuang 1729b35249446b07f40ac5fcc3205f2c048616efacchkuang LOAD_IF_USED 0, 1 ; load eax, ecx back 1739b35249446b07f40ac5fcc3205f2c048616efacchkuang %endif 1749b35249446b07f40ac5fcc3205f2c048616efacchkuang %else 1759b35249446b07f40ac5fcc3205f2c048616efacchkuang %if %2 == 1 ; avg 1769b35249446b07f40ac5fcc3205f2c048616efacchkuang cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ 1779b35249446b07f40ac5fcc3205f2c048616efacchkuang 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ 1789b35249446b07f40ac5fcc3205f2c048616efacchkuang x_offset, y_offset, \ 1799b35249446b07f40ac5fcc3205f2c048616efacchkuang dst, dst_stride, \ 1809b35249446b07f40ac5fcc3205f2c048616efacchkuang sec, sec_stride, \ 1819b35249446b07f40ac5fcc3205f2c048616efacchkuang height, sse 1829b35249446b07f40ac5fcc3205f2c048616efacchkuang %if ARCH_X86_64 1837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian %define block_height heightd 1849b35249446b07f40ac5fcc3205f2c048616efacchkuang %define sec_str sec_strideq 1859b35249446b07f40ac5fcc3205f2c048616efacchkuang %else 1867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian %define block_height dword heightm 1879b35249446b07f40ac5fcc3205f2c048616efacchkuang %define sec_str sec_stridemp 1889b35249446b07f40ac5fcc3205f2c048616efacchkuang %endif 1899b35249446b07f40ac5fcc3205f2c048616efacchkuang %else 1909b35249446b07f40ac5fcc3205f2c048616efacchkuang cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ 1919b35249446b07f40ac5fcc3205f2c048616efacchkuang y_offset, dst, dst_stride, height, sse 1927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian %define block_height heightd 1939b35249446b07f40ac5fcc3205f2c048616efacchkuang %endif 
1949b35249446b07f40ac5fcc3205f2c048616efacchkuang 1959b35249446b07f40ac5fcc3205f2c048616efacchkuang %define bilin_filter bilin_filter_m 1969b35249446b07f40ac5fcc3205f2c048616efacchkuang %endif 19791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 1989b35249446b07f40ac5fcc3205f2c048616efacchkuang 19968e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 == 4 20068e1c830ade592be74773e249bf94e2bbfb50de7Johann %define movx movd 20168e1c830ade592be74773e249bf94e2bbfb50de7Johann%else 20268e1c830ade592be74773e249bf94e2bbfb50de7Johann %define movx movh 20368e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 20468e1c830ade592be74773e249bf94e2bbfb50de7Johann 20591037db265ecdd914a26e056cf69207b4f50924ehkuang ASSERT %1 <= 16 ; m6 overflows if w > 16 20691037db265ecdd914a26e056cf69207b4f50924ehkuang pxor m6, m6 ; sum 20791037db265ecdd914a26e056cf69207b4f50924ehkuang pxor m7, m7 ; sse 20891037db265ecdd914a26e056cf69207b4f50924ehkuang ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we 20991037db265ecdd914a26e056cf69207b4f50924ehkuang ; could perhaps use it for something more productive then 21091037db265ecdd914a26e056cf69207b4f50924ehkuang pxor m5, m5 ; dedicated zero register 21191037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 < 16 2127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sar block_height, 1 21391037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 21491037db265ecdd914a26e056cf69207b4f50924ehkuang shl sec_str, 1 21591037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 21691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 21791037db265ecdd914a26e056cf69207b4f50924ehkuang 21891037db265ecdd914a26e056cf69207b4f50924ehkuang ; FIXME(rbultje) replace by jumptable? 
21991037db265ecdd914a26e056cf69207b4f50924ehkuang test x_offsetd, x_offsetd 22091037db265ecdd914a26e056cf69207b4f50924ehkuang jnz .x_nonzero 22191037db265ecdd914a26e056cf69207b4f50924ehkuang ; x_offset == 0 22291037db265ecdd914a26e056cf69207b4f50924ehkuang test y_offsetd, y_offsetd 22391037db265ecdd914a26e056cf69207b4f50924ehkuang jnz .x_zero_y_nonzero 22491037db265ecdd914a26e056cf69207b4f50924ehkuang 22591037db265ecdd914a26e056cf69207b4f50924ehkuang ; x_offset == 0 && y_offset == 0 22691037db265ecdd914a26e056cf69207b4f50924ehkuang.x_zero_y_zero_loop: 22791037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16 22891037db265ecdd914a26e056cf69207b4f50924ehkuang movu m0, [srcq] 22991037db265ecdd914a26e056cf69207b4f50924ehkuang mova m1, [dstq] 23091037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 23191037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 23291037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m1, m5 23391037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 23491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 23591037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 23691037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 23768e1c830ade592be74773e249bf94e2bbfb50de7Johann 23891037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 0 ; !avg 23991037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m1, m5 24091037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 24191037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 24291037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 24391037db265ecdd914a26e056cf69207b4f50924ehkuang 24491037db265ecdd914a26e056cf69207b4f50924ehkuang add srcq, src_strideq 24591037db265ecdd914a26e056cf69207b4f50924ehkuang add dstq, dst_strideq 24691037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16 24768e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m0, [srcq] 24891037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; 
avg 24968e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 25091037db265ecdd914a26e056cf69207b4f50924ehkuang movhps m0, [srcq+src_strideq] 25168e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh 25268e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [srcq+src_strideq] 25368e1c830ade592be74773e249bf94e2bbfb50de7Johann punpckldq m0, m1 25491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 25591037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; !avg 25668e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m2, [srcq+src_strideq] 25791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 25868e1c830ade592be74773e249bf94e2bbfb50de7Johann 25968e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 26068e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [dstq+dst_strideq] 26168e1c830ade592be74773e249bf94e2bbfb50de7Johann 26291037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 26368e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 26491037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 26568e1c830ade592be74773e249bf94e2bbfb50de7Johann%else 26668e1c830ade592be74773e249bf94e2bbfb50de7Johann movh m2, [secq] 26768e1c830ade592be74773e249bf94e2bbfb50de7Johann pavgb m0, m2 26868e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 26991037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 27091037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 27168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 27291037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 27391037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 27468e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh 27568e1c830ade592be74773e249bf94e2bbfb50de7Johann punpcklbw m0, m5 27668e1c830ade592be74773e249bf94e2bbfb50de7Johann movhlps m2, m0 27768e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 27891037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; !avg 27991037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 
28091037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m2, m5 28191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 28291037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 28391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 28491037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 28591037db265ecdd914a26e056cf69207b4f50924ehkuang 28691037db265ecdd914a26e056cf69207b4f50924ehkuang lea srcq, [srcq+src_strideq*2] 28791037db265ecdd914a26e056cf69207b4f50924ehkuang lea dstq, [dstq+dst_strideq*2] 28891037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 28991037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 29091037db265ecdd914a26e056cf69207b4f50924ehkuang add secq, sec_str 29191037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 2927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian dec block_height 29391037db265ecdd914a26e056cf69207b4f50924ehkuang jg .x_zero_y_zero_loop 29468e1c830ade592be74773e249bf94e2bbfb50de7Johann STORE_AND_RET %1 29591037db265ecdd914a26e056cf69207b4f50924ehkuang 29691037db265ecdd914a26e056cf69207b4f50924ehkuang.x_zero_y_nonzero: 29768e1c830ade592be74773e249bf94e2bbfb50de7Johann cmp y_offsetd, 4 29891037db265ecdd914a26e056cf69207b4f50924ehkuang jne .x_zero_y_nonhalf 29991037db265ecdd914a26e056cf69207b4f50924ehkuang 30091037db265ecdd914a26e056cf69207b4f50924ehkuang ; x_offset == 0 && y_offset == 0.5 30191037db265ecdd914a26e056cf69207b4f50924ehkuang.x_zero_y_half_loop: 30291037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16 30391037db265ecdd914a26e056cf69207b4f50924ehkuang movu m0, [srcq] 30491037db265ecdd914a26e056cf69207b4f50924ehkuang movu m4, [srcq+src_strideq] 30591037db265ecdd914a26e056cf69207b4f50924ehkuang mova m1, [dstq] 30691037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, m4 30791037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m1, m5 30891037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 
30991037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 31091037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 31191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 31291037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 31391037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 31491037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 31591037db265ecdd914a26e056cf69207b4f50924ehkuang 31691037db265ecdd914a26e056cf69207b4f50924ehkuang add srcq, src_strideq 31791037db265ecdd914a26e056cf69207b4f50924ehkuang add dstq, dst_strideq 31891037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16 31968e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m0, [srcq] 32068e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m2, [srcq+src_strideq] 32191037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 32268e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 32391037db265ecdd914a26e056cf69207b4f50924ehkuang movhps m2, [srcq+src_strideq*2] 32468e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh 32568e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [srcq+src_strideq*2] 3261184aebb761cbeac9124c37189a80a1a58f04b6bhkuang punpckldq m2, m1 3271184aebb761cbeac9124c37189a80a1a58f04b6bhkuang%endif 32868e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 32968e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 33091037db265ecdd914a26e056cf69207b4f50924ehkuang movlhps m0, m2 33168e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh 33291037db265ecdd914a26e056cf69207b4f50924ehkuang punpckldq m0, m2 33391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 33468e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [dstq+dst_strideq] 33591037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, m2 33691037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 33768e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 33891037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 
33991037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 34091037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 34191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 34268e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh 34368e1c830ade592be74773e249bf94e2bbfb50de7Johann movh m4, [secq] 34468e1c830ade592be74773e249bf94e2bbfb50de7Johann pavgb m0, m4 34568e1c830ade592be74773e249bf94e2bbfb50de7Johann punpcklbw m3, m5 34668e1c830ade592be74773e249bf94e2bbfb50de7Johann punpcklbw m0, m5 34768e1c830ade592be74773e249bf94e2bbfb50de7Johann movhlps m2, m0 34868e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 34991037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; !avg 35068e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m4, [srcq+src_strideq*2] 35168e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 35291037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, m2 35368e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [dstq+dst_strideq] 35491037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m2, m4 35591037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 35691037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m2, m5 35791037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 35891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 35991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 36091037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 36191037db265ecdd914a26e056cf69207b4f50924ehkuang 36291037db265ecdd914a26e056cf69207b4f50924ehkuang lea srcq, [srcq+src_strideq*2] 36391037db265ecdd914a26e056cf69207b4f50924ehkuang lea dstq, [dstq+dst_strideq*2] 36491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 36591037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 36691037db265ecdd914a26e056cf69207b4f50924ehkuang add secq, sec_str 36791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 3687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian dec 
block_height 36991037db265ecdd914a26e056cf69207b4f50924ehkuang jg .x_zero_y_half_loop 37068e1c830ade592be74773e249bf94e2bbfb50de7Johann STORE_AND_RET %1 37191037db265ecdd914a26e056cf69207b4f50924ehkuang 37291037db265ecdd914a26e056cf69207b4f50924ehkuang.x_zero_y_nonhalf: 37391037db265ecdd914a26e056cf69207b4f50924ehkuang ; x_offset == 0 && y_offset == bilin interpolation 37491037db265ecdd914a26e056cf69207b4f50924ehkuang%ifdef PIC 37591037db265ecdd914a26e056cf69207b4f50924ehkuang lea bilin_filter, [bilin_filter_m] 37691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 37791037db265ecdd914a26e056cf69207b4f50924ehkuang shl y_offsetd, filter_idx_shift 37868e1c830ade592be74773e249bf94e2bbfb50de7Johann%if ARCH_X86_64 && %1 > 4 37991037db265ecdd914a26e056cf69207b4f50924ehkuang mova m8, [bilin_filter+y_offsetq] 38091037db265ecdd914a26e056cf69207b4f50924ehkuang%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 38191037db265ecdd914a26e056cf69207b4f50924ehkuang mova m9, [bilin_filter+y_offsetq+16] 38291037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 38391037db265ecdd914a26e056cf69207b4f50924ehkuang mova m10, [pw_8] 38491037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_a m8 38591037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_b m9 38691037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd m10 38791037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; x86-32 or mmx 3889b35249446b07f40ac5fcc3205f2c048616efacchkuang%if ARCH_X86=1 && CONFIG_PIC=1 3899b35249446b07f40ac5fcc3205f2c048616efacchkuang; x_offset == 0, reuse x_offset reg 3909b35249446b07f40ac5fcc3205f2c048616efacchkuang%define tempq x_offsetq 3919b35249446b07f40ac5fcc3205f2c048616efacchkuang add y_offsetq, g_bilin_filterm 3929b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_y_a [y_offsetq] 3939b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_y_b [y_offsetq+16] 3949b35249446b07f40ac5fcc3205f2c048616efacchkuang mov tempq, g_pw_8m 
3959b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_rnd [tempq] 3969b35249446b07f40ac5fcc3205f2c048616efacchkuang%else 39791037db265ecdd914a26e056cf69207b4f50924ehkuang add y_offsetq, bilin_filter 39891037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_a [y_offsetq] 39991037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_b [y_offsetq+16] 40091037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd [pw_8] 40191037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 4029b35249446b07f40ac5fcc3205f2c048616efacchkuang%endif 4039b35249446b07f40ac5fcc3205f2c048616efacchkuang 40491037db265ecdd914a26e056cf69207b4f50924ehkuang.x_zero_y_other_loop: 40591037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16 40691037db265ecdd914a26e056cf69207b4f50924ehkuang movu m0, [srcq] 40791037db265ecdd914a26e056cf69207b4f50924ehkuang movu m4, [srcq+src_strideq] 40891037db265ecdd914a26e056cf69207b4f50924ehkuang mova m1, [dstq] 40991037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 41091037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m4 41191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m4 41291037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m2, filter_y_a 41391037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m0, filter_y_a 41491037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 41591037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 41691037db265ecdd914a26e056cf69207b4f50924ehkuang%else 41791037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 41891037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m4, m5 41991037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 42091037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m4, m5 42191037db265ecdd914a26e056cf69207b4f50924ehkuang ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can 42291037db265ecdd914a26e056cf69207b4f50924ehkuang ; also do 
out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of 42391037db265ecdd914a26e056cf69207b4f50924ehkuang ; instructions is the same (5), but it is 1 mul instead of 2, so might be 42491037db265ecdd914a26e056cf69207b4f50924ehkuang ; slightly faster because of pmullw latency. It would also cut our rodata 42591037db265ecdd914a26e056cf69207b4f50924ehkuang ; tables in half for this function, and save 1-2 registers on x86-64. 42691037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m2, filter_y_a 42791037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m3, filter_y_b 42891037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 42991037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m0, filter_y_a 43091037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m4, filter_y_b 43191037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 43291037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, m3 43391037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, m4 43491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 43591037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 43691037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m0, 4 43791037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 43891037db265ecdd914a26e056cf69207b4f50924ehkuang ; FIXME(rbultje) pipeline 43991037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m0, m2 44091037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 44191037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 44291037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 44391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 44491037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m1, m5 44591037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 44691037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 44791037db265ecdd914a26e056cf69207b4f50924ehkuang 44891037db265ecdd914a26e056cf69207b4f50924ehkuang add srcq, src_strideq 
44991037db265ecdd914a26e056cf69207b4f50924ehkuang add dstq, dst_strideq 45091037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16 45168e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m0, [srcq] 45268e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m2, [srcq+src_strideq] 45368e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m4, [srcq+src_strideq*2] 45468e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [dstq+dst_strideq] 45591037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 45668e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 45791037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m2 45891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m2, m4 45991037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m0, filter_y_a 46091037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m2, filter_y_a 46191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 46291037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 46391037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 46491037db265ecdd914a26e056cf69207b4f50924ehkuang%else 46591037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 46691037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m2, m5 46791037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m4, m5 46891037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m0, filter_y_a 46991037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m1, m2, filter_y_b 47091037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 47191037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 47291037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m2, filter_y_a 47391037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m4, filter_y_b 47491037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, m1 47591037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 47668e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 
47791037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, m4 47891037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 47991037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m0, 4 48091037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 48191037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 48291037db265ecdd914a26e056cf69207b4f50924ehkuang ; FIXME(rbultje) pipeline 48368e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 == 4 48468e1c830ade592be74773e249bf94e2bbfb50de7Johann movlhps m0, m2 48568e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 48691037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m0, m2 48768e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 48891037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 48991037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 49091037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 49168e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh 49268e1c830ade592be74773e249bf94e2bbfb50de7Johann movh m2, [secq] 49368e1c830ade592be74773e249bf94e2bbfb50de7Johann pavgb m0, m2 49468e1c830ade592be74773e249bf94e2bbfb50de7Johann punpcklbw m0, m5 49568e1c830ade592be74773e249bf94e2bbfb50de7Johann movhlps m2, m0 49668e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 49791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 49891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 49991037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 50091037db265ecdd914a26e056cf69207b4f50924ehkuang 50191037db265ecdd914a26e056cf69207b4f50924ehkuang lea srcq, [srcq+src_strideq*2] 50291037db265ecdd914a26e056cf69207b4f50924ehkuang lea dstq, [dstq+dst_strideq*2] 50391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 50491037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 50591037db265ecdd914a26e056cf69207b4f50924ehkuang add secq, sec_str 50691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 5077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh 
Venkatasubramanian dec block_height 50891037db265ecdd914a26e056cf69207b4f50924ehkuang jg .x_zero_y_other_loop 50991037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_y_a 51091037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_y_b 51191037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_rnd 51268e1c830ade592be74773e249bf94e2bbfb50de7Johann STORE_AND_RET %1 51391037db265ecdd914a26e056cf69207b4f50924ehkuang 51491037db265ecdd914a26e056cf69207b4f50924ehkuang.x_nonzero: 51568e1c830ade592be74773e249bf94e2bbfb50de7Johann cmp x_offsetd, 4 51691037db265ecdd914a26e056cf69207b4f50924ehkuang jne .x_nonhalf 51791037db265ecdd914a26e056cf69207b4f50924ehkuang ; x_offset == 0.5 51891037db265ecdd914a26e056cf69207b4f50924ehkuang test y_offsetd, y_offsetd 51991037db265ecdd914a26e056cf69207b4f50924ehkuang jnz .x_half_y_nonzero 52091037db265ecdd914a26e056cf69207b4f50924ehkuang 52191037db265ecdd914a26e056cf69207b4f50924ehkuang ; x_offset == 0.5 && y_offset == 0 52291037db265ecdd914a26e056cf69207b4f50924ehkuang.x_half_y_zero_loop: 52391037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16 52491037db265ecdd914a26e056cf69207b4f50924ehkuang movu m0, [srcq] 52591037db265ecdd914a26e056cf69207b4f50924ehkuang movu m4, [srcq+1] 52691037db265ecdd914a26e056cf69207b4f50924ehkuang mova m1, [dstq] 52791037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, m4 52891037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m1, m5 52991037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 53091037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 53191037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 53291037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 53391037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 53491037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 53591037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 53691037db265ecdd914a26e056cf69207b4f50924ehkuang 
53791037db265ecdd914a26e056cf69207b4f50924ehkuang add srcq, src_strideq 53891037db265ecdd914a26e056cf69207b4f50924ehkuang add dstq, dst_strideq 53991037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16 54068e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m0, [srcq] 54168e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m4, [srcq+1] 54291037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 54368e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 54491037db265ecdd914a26e056cf69207b4f50924ehkuang movhps m0, [srcq+src_strideq] 54591037db265ecdd914a26e056cf69207b4f50924ehkuang movhps m4, [srcq+src_strideq+1] 54668e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh 54768e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [srcq+src_strideq] 54868e1c830ade592be74773e249bf94e2bbfb50de7Johann punpckldq m0, m1 54968e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m2, [srcq+src_strideq+1] 55068e1c830ade592be74773e249bf94e2bbfb50de7Johann punpckldq m4, m2 55168e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 55268e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 55368e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [dstq+dst_strideq] 55491037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, m4 55591037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 55668e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 55791037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 55891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 55991037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 56091037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 56168e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh 56268e1c830ade592be74773e249bf94e2bbfb50de7Johann movh m2, [secq] 56368e1c830ade592be74773e249bf94e2bbfb50de7Johann pavgb m0, m2 56468e1c830ade592be74773e249bf94e2bbfb50de7Johann punpcklbw m1, m5 56568e1c830ade592be74773e249bf94e2bbfb50de7Johann punpcklbw m0, m5 
56668e1c830ade592be74773e249bf94e2bbfb50de7Johann movhlps m2, m0 56768e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 56891037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; !avg 56968e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m2, [srcq+src_strideq] 57068e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 57191037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, m4 57268e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m4, [srcq+src_strideq+1] 57368e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [dstq+dst_strideq] 57491037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m2, m4 57591037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 57691037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m2, m5 57791037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 57891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 57991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 58091037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 58191037db265ecdd914a26e056cf69207b4f50924ehkuang 58291037db265ecdd914a26e056cf69207b4f50924ehkuang lea srcq, [srcq+src_strideq*2] 58391037db265ecdd914a26e056cf69207b4f50924ehkuang lea dstq, [dstq+dst_strideq*2] 58491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 58591037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 58691037db265ecdd914a26e056cf69207b4f50924ehkuang add secq, sec_str 58791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 5887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian dec block_height 58991037db265ecdd914a26e056cf69207b4f50924ehkuang jg .x_half_y_zero_loop 59068e1c830ade592be74773e249bf94e2bbfb50de7Johann STORE_AND_RET %1 59191037db265ecdd914a26e056cf69207b4f50924ehkuang 59291037db265ecdd914a26e056cf69207b4f50924ehkuang.x_half_y_nonzero: 59368e1c830ade592be74773e249bf94e2bbfb50de7Johann cmp y_offsetd, 4 59491037db265ecdd914a26e056cf69207b4f50924ehkuang jne .x_half_y_nonhalf 
59591037db265ecdd914a26e056cf69207b4f50924ehkuang 59691037db265ecdd914a26e056cf69207b4f50924ehkuang ; x_offset == 0.5 && y_offset == 0.5 59791037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16 59891037db265ecdd914a26e056cf69207b4f50924ehkuang movu m0, [srcq] 59991037db265ecdd914a26e056cf69207b4f50924ehkuang movu m3, [srcq+1] 60091037db265ecdd914a26e056cf69207b4f50924ehkuang add srcq, src_strideq 60191037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, m3 60291037db265ecdd914a26e056cf69207b4f50924ehkuang.x_half_y_half_loop: 60391037db265ecdd914a26e056cf69207b4f50924ehkuang movu m4, [srcq] 60491037db265ecdd914a26e056cf69207b4f50924ehkuang movu m3, [srcq+1] 60591037db265ecdd914a26e056cf69207b4f50924ehkuang mova m1, [dstq] 60691037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m4, m3 60791037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m1, m5 60891037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, m4 60991037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 61091037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 61191037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 61291037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 61391037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 61491037db265ecdd914a26e056cf69207b4f50924ehkuang%else 61591037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 61691037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 61791037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 61891037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 61991037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 62091037db265ecdd914a26e056cf69207b4f50924ehkuang mova m0, m4 62191037db265ecdd914a26e056cf69207b4f50924ehkuang 62291037db265ecdd914a26e056cf69207b4f50924ehkuang add srcq, src_strideq 62391037db265ecdd914a26e056cf69207b4f50924ehkuang add dstq, dst_strideq 62491037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16 
62568e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m0, [srcq] 62668e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [srcq+1] 62791037db265ecdd914a26e056cf69207b4f50924ehkuang add srcq, src_strideq 62891037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, m3 62991037db265ecdd914a26e056cf69207b4f50924ehkuang.x_half_y_half_loop: 63068e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m2, [srcq] 63168e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [srcq+1] 63291037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 63368e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 63491037db265ecdd914a26e056cf69207b4f50924ehkuang movhps m2, [srcq+src_strideq] 63591037db265ecdd914a26e056cf69207b4f50924ehkuang movhps m3, [srcq+src_strideq+1] 63691037db265ecdd914a26e056cf69207b4f50924ehkuang%else 63768e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [srcq+src_strideq] 6381184aebb761cbeac9124c37189a80a1a58f04b6bhkuang punpckldq m2, m1 63968e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [srcq+src_strideq+1] 6401184aebb761cbeac9124c37189a80a1a58f04b6bhkuang punpckldq m3, m1 6411184aebb761cbeac9124c37189a80a1a58f04b6bhkuang%endif 64291037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m2, m3 64368e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 64491037db265ecdd914a26e056cf69207b4f50924ehkuang movlhps m0, m2 64591037db265ecdd914a26e056cf69207b4f50924ehkuang movhlps m4, m2 64668e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh 64791037db265ecdd914a26e056cf69207b4f50924ehkuang punpckldq m0, m2 64868e1c830ade592be74773e249bf94e2bbfb50de7Johann pshuflw m4, m2, 0xe 64991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 65068e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 65191037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, m2 65268e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [dstq+dst_strideq] 65368e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 65491037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, 
[secq] 65568e1c830ade592be74773e249bf94e2bbfb50de7Johann%else 65668e1c830ade592be74773e249bf94e2bbfb50de7Johann movh m2, [secq] 65768e1c830ade592be74773e249bf94e2bbfb50de7Johann pavgb m0, m2 65868e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 65991037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 66091037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 66168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 66291037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 66391037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 66468e1c830ade592be74773e249bf94e2bbfb50de7Johann%else 66568e1c830ade592be74773e249bf94e2bbfb50de7Johann punpcklbw m0, m5 66668e1c830ade592be74773e249bf94e2bbfb50de7Johann movhlps m2, m0 66768e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 66891037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; !avg 66968e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m4, [srcq+src_strideq] 67068e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [srcq+src_strideq+1] 67191037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m2, m3 67291037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m4, m1 67391037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, m2 67491037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m2, m4 67568e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 67668e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [dstq+dst_strideq] 67791037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 67891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m2, m5 67991037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 68091037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 68191037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 68291037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 68391037db265ecdd914a26e056cf69207b4f50924ehkuang mova m0, m4 68491037db265ecdd914a26e056cf69207b4f50924ehkuang 
68591037db265ecdd914a26e056cf69207b4f50924ehkuang lea srcq, [srcq+src_strideq*2] 68691037db265ecdd914a26e056cf69207b4f50924ehkuang lea dstq, [dstq+dst_strideq*2] 68791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 68891037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 68991037db265ecdd914a26e056cf69207b4f50924ehkuang add secq, sec_str 69091037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 6917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian dec block_height 69291037db265ecdd914a26e056cf69207b4f50924ehkuang jg .x_half_y_half_loop 69368e1c830ade592be74773e249bf94e2bbfb50de7Johann STORE_AND_RET %1 69491037db265ecdd914a26e056cf69207b4f50924ehkuang 69591037db265ecdd914a26e056cf69207b4f50924ehkuang.x_half_y_nonhalf: 69691037db265ecdd914a26e056cf69207b4f50924ehkuang ; x_offset == 0.5 && y_offset == bilin interpolation 69791037db265ecdd914a26e056cf69207b4f50924ehkuang%ifdef PIC 69891037db265ecdd914a26e056cf69207b4f50924ehkuang lea bilin_filter, [bilin_filter_m] 69991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 70091037db265ecdd914a26e056cf69207b4f50924ehkuang shl y_offsetd, filter_idx_shift 70168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if ARCH_X86_64 && %1 > 4 70291037db265ecdd914a26e056cf69207b4f50924ehkuang mova m8, [bilin_filter+y_offsetq] 70391037db265ecdd914a26e056cf69207b4f50924ehkuang%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 70491037db265ecdd914a26e056cf69207b4f50924ehkuang mova m9, [bilin_filter+y_offsetq+16] 70591037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 70691037db265ecdd914a26e056cf69207b4f50924ehkuang mova m10, [pw_8] 70791037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_a m8 70891037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_b m9 70991037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd m10 7109b35249446b07f40ac5fcc3205f2c048616efacchkuang%else ;x86_32 7119b35249446b07f40ac5fcc3205f2c048616efacchkuang%if ARCH_X86=1 && CONFIG_PIC=1 
7129b35249446b07f40ac5fcc3205f2c048616efacchkuang; x_offset == 0.5. We can reuse x_offset reg 7139b35249446b07f40ac5fcc3205f2c048616efacchkuang%define tempq x_offsetq 7149b35249446b07f40ac5fcc3205f2c048616efacchkuang add y_offsetq, g_bilin_filterm 7159b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_y_a [y_offsetq] 7169b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_y_b [y_offsetq+16] 7179b35249446b07f40ac5fcc3205f2c048616efacchkuang mov tempq, g_pw_8m 7189b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_rnd [tempq] 71991037db265ecdd914a26e056cf69207b4f50924ehkuang%else 72091037db265ecdd914a26e056cf69207b4f50924ehkuang add y_offsetq, bilin_filter 72191037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_a [y_offsetq] 72291037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_b [y_offsetq+16] 72391037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd [pw_8] 72491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 7259b35249446b07f40ac5fcc3205f2c048616efacchkuang%endif 7269b35249446b07f40ac5fcc3205f2c048616efacchkuang 72791037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16 72891037db265ecdd914a26e056cf69207b4f50924ehkuang movu m0, [srcq] 72991037db265ecdd914a26e056cf69207b4f50924ehkuang movu m3, [srcq+1] 73091037db265ecdd914a26e056cf69207b4f50924ehkuang add srcq, src_strideq 73191037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, m3 73291037db265ecdd914a26e056cf69207b4f50924ehkuang.x_half_y_other_loop: 73391037db265ecdd914a26e056cf69207b4f50924ehkuang movu m4, [srcq] 73491037db265ecdd914a26e056cf69207b4f50924ehkuang movu m2, [srcq+1] 73591037db265ecdd914a26e056cf69207b4f50924ehkuang mova m1, [dstq] 73691037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m4, m2 73791037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 73891037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m4 73991037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m4 
74091037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m2, filter_y_a 74191037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m0, filter_y_a 74291037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 74391037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 74491037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 74591037db265ecdd914a26e056cf69207b4f50924ehkuang%else 74691037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 74791037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m4, m5 74891037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m2, filter_y_a 74991037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m3, filter_y_b 75091037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 75191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 75291037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, m3 75391037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m4, m5 75491037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m0, filter_y_a 75591037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m3, filter_y_b 75691037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 75791037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 75891037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, m3 75991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 76091037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m1, m5 76191037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m0, 4 76291037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 76391037db265ecdd914a26e056cf69207b4f50924ehkuang ; FIXME(rbultje) pipeline 76491037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m0, m2 76591037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 76691037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 76791037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 76891037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 
76991037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 77091037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 77191037db265ecdd914a26e056cf69207b4f50924ehkuang mova m0, m4 77291037db265ecdd914a26e056cf69207b4f50924ehkuang 77391037db265ecdd914a26e056cf69207b4f50924ehkuang add srcq, src_strideq 77491037db265ecdd914a26e056cf69207b4f50924ehkuang add dstq, dst_strideq 77591037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16 77668e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m0, [srcq] 77768e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [srcq+1] 77891037db265ecdd914a26e056cf69207b4f50924ehkuang add srcq, src_strideq 77991037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, m3 78091037db265ecdd914a26e056cf69207b4f50924ehkuang%if notcpuflag(ssse3) 78191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 78291037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 78391037db265ecdd914a26e056cf69207b4f50924ehkuang.x_half_y_other_loop: 78468e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m2, [srcq] 78568e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [srcq+1] 78668e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m4, [srcq+src_strideq] 78768e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [srcq+src_strideq+1] 78891037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m2, m1 78991037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m4, m3 79068e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [dstq+dst_strideq] 79191037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 79268e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 79391037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m2 79491037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m2, m4 79591037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m0, filter_y_a 79691037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m2, filter_y_a 79791037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 
79891037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 79991037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 80091037db265ecdd914a26e056cf69207b4f50924ehkuang%else 80191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m2, m5 80291037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m4, m5 80391037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m0, filter_y_a 80491037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m1, m2, filter_y_b 80591037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 80691037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 80791037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m2, filter_y_a 80891037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, m1 80991037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m1, m4, filter_y_b 81091037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 81191037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, m1 81268e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 81391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 81491037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m0, 4 81591037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 81691037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 81791037db265ecdd914a26e056cf69207b4f50924ehkuang ; FIXME(rbultje) pipeline 81868e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 == 4 81968e1c830ade592be74773e249bf94e2bbfb50de7Johann movlhps m0, m2 82068e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 82191037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m0, m2 82268e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 82391037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 82491037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 82591037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 82668e1c830ade592be74773e249bf94e2bbfb50de7Johann%else 82768e1c830ade592be74773e249bf94e2bbfb50de7Johann movh m2, [secq] 
82868e1c830ade592be74773e249bf94e2bbfb50de7Johann pavgb m0, m2 82968e1c830ade592be74773e249bf94e2bbfb50de7Johann punpcklbw m0, m5 83068e1c830ade592be74773e249bf94e2bbfb50de7Johann movhlps m2, m0 83168e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 83291037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 83391037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 83491037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 83591037db265ecdd914a26e056cf69207b4f50924ehkuang mova m0, m4 83691037db265ecdd914a26e056cf69207b4f50924ehkuang 83791037db265ecdd914a26e056cf69207b4f50924ehkuang lea srcq, [srcq+src_strideq*2] 83891037db265ecdd914a26e056cf69207b4f50924ehkuang lea dstq, [dstq+dst_strideq*2] 83991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 84091037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 84191037db265ecdd914a26e056cf69207b4f50924ehkuang add secq, sec_str 84291037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 8437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian dec block_height 84491037db265ecdd914a26e056cf69207b4f50924ehkuang jg .x_half_y_other_loop 84591037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_y_a 84691037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_y_b 84791037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_rnd 84868e1c830ade592be74773e249bf94e2bbfb50de7Johann STORE_AND_RET %1 84991037db265ecdd914a26e056cf69207b4f50924ehkuang 85091037db265ecdd914a26e056cf69207b4f50924ehkuang.x_nonhalf: 85191037db265ecdd914a26e056cf69207b4f50924ehkuang test y_offsetd, y_offsetd 85291037db265ecdd914a26e056cf69207b4f50924ehkuang jnz .x_nonhalf_y_nonzero 85391037db265ecdd914a26e056cf69207b4f50924ehkuang 85491037db265ecdd914a26e056cf69207b4f50924ehkuang ; x_offset == bilin interpolation && y_offset == 0 85591037db265ecdd914a26e056cf69207b4f50924ehkuang%ifdef PIC 85691037db265ecdd914a26e056cf69207b4f50924ehkuang lea bilin_filter, [bilin_filter_m] 
85791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 85891037db265ecdd914a26e056cf69207b4f50924ehkuang shl x_offsetd, filter_idx_shift 85968e1c830ade592be74773e249bf94e2bbfb50de7Johann%if ARCH_X86_64 && %1 > 4 86091037db265ecdd914a26e056cf69207b4f50924ehkuang mova m8, [bilin_filter+x_offsetq] 86191037db265ecdd914a26e056cf69207b4f50924ehkuang%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 86291037db265ecdd914a26e056cf69207b4f50924ehkuang mova m9, [bilin_filter+x_offsetq+16] 86391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 86491037db265ecdd914a26e056cf69207b4f50924ehkuang mova m10, [pw_8] 86591037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_a m8 86691037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_b m9 86791037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd m10 8689b35249446b07f40ac5fcc3205f2c048616efacchkuang%else ; x86-32 8699b35249446b07f40ac5fcc3205f2c048616efacchkuang%if ARCH_X86=1 && CONFIG_PIC=1 8709b35249446b07f40ac5fcc3205f2c048616efacchkuang;y_offset == 0. We can reuse y_offset reg. 
8719b35249446b07f40ac5fcc3205f2c048616efacchkuang%define tempq y_offsetq 8729b35249446b07f40ac5fcc3205f2c048616efacchkuang add x_offsetq, g_bilin_filterm 8739b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_x_a [x_offsetq] 8749b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_x_b [x_offsetq+16] 8759b35249446b07f40ac5fcc3205f2c048616efacchkuang mov tempq, g_pw_8m 8769b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_rnd [tempq] 87791037db265ecdd914a26e056cf69207b4f50924ehkuang%else 87891037db265ecdd914a26e056cf69207b4f50924ehkuang add x_offsetq, bilin_filter 87991037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_a [x_offsetq] 88091037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_b [x_offsetq+16] 88191037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd [pw_8] 88291037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 8839b35249446b07f40ac5fcc3205f2c048616efacchkuang%endif 8849b35249446b07f40ac5fcc3205f2c048616efacchkuang 88591037db265ecdd914a26e056cf69207b4f50924ehkuang.x_other_y_zero_loop: 88691037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16 88791037db265ecdd914a26e056cf69207b4f50924ehkuang movu m0, [srcq] 88891037db265ecdd914a26e056cf69207b4f50924ehkuang movu m4, [srcq+1] 88991037db265ecdd914a26e056cf69207b4f50924ehkuang mova m1, [dstq] 89091037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 89191037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m4 89291037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m4 89391037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m2, filter_x_a 89491037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m0, filter_x_a 89591037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 89691037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 89791037db265ecdd914a26e056cf69207b4f50924ehkuang%else 89891037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 
89991037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m4, m5 90091037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 90191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m4, m5 90291037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m2, filter_x_a 90391037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m3, filter_x_b 90491037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 90591037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m0, filter_x_a 90691037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m4, filter_x_b 90791037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 90891037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, m3 90991037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, m4 91091037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 91191037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 91291037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m0, 4 91391037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 91491037db265ecdd914a26e056cf69207b4f50924ehkuang ; FIXME(rbultje) pipeline 91591037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m0, m2 91691037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 91791037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 91891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 91991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 92091037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m1, m5 92191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 92291037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 92391037db265ecdd914a26e056cf69207b4f50924ehkuang 92491037db265ecdd914a26e056cf69207b4f50924ehkuang add srcq, src_strideq 92591037db265ecdd914a26e056cf69207b4f50924ehkuang add dstq, dst_strideq 92691037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16 92768e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m0, [srcq] 
92868e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [srcq+1] 92968e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m2, [srcq+src_strideq] 93068e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m4, [srcq+src_strideq+1] 93168e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [dstq+dst_strideq] 93291037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 93391037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m1 93468e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 93591037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m2, m4 93691037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m0, filter_x_a 93791037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m2, filter_x_a 93891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 93991037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 94091037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 94191037db265ecdd914a26e056cf69207b4f50924ehkuang%else 94291037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 94391037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 94491037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m2, m5 94591037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m4, m5 94691037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m0, filter_x_a 94791037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m1, filter_x_b 94891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 94991037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 95091037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m2, filter_x_a 95191037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m4, filter_x_b 95291037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, m1 95391037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 95468e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 95591037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, m4 95691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 
95791037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m0, 4 95891037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 95991037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 96091037db265ecdd914a26e056cf69207b4f50924ehkuang ; FIXME(rbultje) pipeline 96168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 == 4 96268e1c830ade592be74773e249bf94e2bbfb50de7Johann movlhps m0, m2 96368e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 96491037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m0, m2 96568e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 96691037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 96791037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 96891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 96968e1c830ade592be74773e249bf94e2bbfb50de7Johann%else 97068e1c830ade592be74773e249bf94e2bbfb50de7Johann movh m2, [secq] 97168e1c830ade592be74773e249bf94e2bbfb50de7Johann pavgb m0, m2 97268e1c830ade592be74773e249bf94e2bbfb50de7Johann punpcklbw m0, m5 97368e1c830ade592be74773e249bf94e2bbfb50de7Johann movhlps m2, m0 97468e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 97591037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 97691037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 97791037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 97891037db265ecdd914a26e056cf69207b4f50924ehkuang 97991037db265ecdd914a26e056cf69207b4f50924ehkuang lea srcq, [srcq+src_strideq*2] 98091037db265ecdd914a26e056cf69207b4f50924ehkuang lea dstq, [dstq+dst_strideq*2] 98191037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 98291037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 98391037db265ecdd914a26e056cf69207b4f50924ehkuang add secq, sec_str 98491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 9857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian dec block_height 98691037db265ecdd914a26e056cf69207b4f50924ehkuang jg .x_other_y_zero_loop 
98791037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_x_a 98891037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_x_b 98991037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_rnd 99068e1c830ade592be74773e249bf94e2bbfb50de7Johann STORE_AND_RET %1 99191037db265ecdd914a26e056cf69207b4f50924ehkuang 99291037db265ecdd914a26e056cf69207b4f50924ehkuang.x_nonhalf_y_nonzero: 99368e1c830ade592be74773e249bf94e2bbfb50de7Johann cmp y_offsetd, 4 99491037db265ecdd914a26e056cf69207b4f50924ehkuang jne .x_nonhalf_y_nonhalf 99591037db265ecdd914a26e056cf69207b4f50924ehkuang 99691037db265ecdd914a26e056cf69207b4f50924ehkuang ; x_offset == bilin interpolation && y_offset == 0.5 99791037db265ecdd914a26e056cf69207b4f50924ehkuang%ifdef PIC 99891037db265ecdd914a26e056cf69207b4f50924ehkuang lea bilin_filter, [bilin_filter_m] 99991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 100091037db265ecdd914a26e056cf69207b4f50924ehkuang shl x_offsetd, filter_idx_shift 100168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if ARCH_X86_64 && %1 > 4 100291037db265ecdd914a26e056cf69207b4f50924ehkuang mova m8, [bilin_filter+x_offsetq] 100391037db265ecdd914a26e056cf69207b4f50924ehkuang%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 100491037db265ecdd914a26e056cf69207b4f50924ehkuang mova m9, [bilin_filter+x_offsetq+16] 100591037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 100691037db265ecdd914a26e056cf69207b4f50924ehkuang mova m10, [pw_8] 100791037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_a m8 100891037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_b m9 100991037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd m10 10109b35249446b07f40ac5fcc3205f2c048616efacchkuang%else ; x86-32 10119b35249446b07f40ac5fcc3205f2c048616efacchkuang%if ARCH_X86=1 && CONFIG_PIC=1 10129b35249446b07f40ac5fcc3205f2c048616efacchkuang; y_offset == 0.5. We can reuse y_offset reg. 
10139b35249446b07f40ac5fcc3205f2c048616efacchkuang%define tempq y_offsetq 10149b35249446b07f40ac5fcc3205f2c048616efacchkuang add x_offsetq, g_bilin_filterm 10159b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_x_a [x_offsetq] 10169b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_x_b [x_offsetq+16] 10179b35249446b07f40ac5fcc3205f2c048616efacchkuang mov tempq, g_pw_8m 10189b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_rnd [tempq] 101991037db265ecdd914a26e056cf69207b4f50924ehkuang%else 102091037db265ecdd914a26e056cf69207b4f50924ehkuang add x_offsetq, bilin_filter 102191037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_a [x_offsetq] 102291037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_b [x_offsetq+16] 102391037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd [pw_8] 102491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 10259b35249446b07f40ac5fcc3205f2c048616efacchkuang%endif 10269b35249446b07f40ac5fcc3205f2c048616efacchkuang 102791037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16 102891037db265ecdd914a26e056cf69207b4f50924ehkuang movu m0, [srcq] 102991037db265ecdd914a26e056cf69207b4f50924ehkuang movu m1, [srcq+1] 103091037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 103191037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m1 103291037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m1 103391037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m2, filter_x_a 103491037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m0, filter_x_a 103591037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 103691037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 103791037db265ecdd914a26e056cf69207b4f50924ehkuang%else 103891037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 103991037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m1, m5 104091037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 
104191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 104291037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m0, filter_x_a 104391037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m1, filter_x_b 104491037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 104591037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m2, filter_x_a 104691037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m3, filter_x_b 104791037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 104891037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, m1 104991037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, m3 105091037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 105191037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m0, 4 105291037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 105391037db265ecdd914a26e056cf69207b4f50924ehkuang add srcq, src_strideq 105491037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m0, m2 105591037db265ecdd914a26e056cf69207b4f50924ehkuang.x_other_y_half_loop: 105691037db265ecdd914a26e056cf69207b4f50924ehkuang movu m4, [srcq] 105791037db265ecdd914a26e056cf69207b4f50924ehkuang movu m3, [srcq+1] 105891037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 105991037db265ecdd914a26e056cf69207b4f50924ehkuang mova m1, [dstq] 106091037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m4, m3 106191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m4, m3 106291037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m2, filter_x_a 106391037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m4, filter_x_a 106491037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 106591037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m4, filter_rnd 106691037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 106791037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m4, 4 106891037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m4, m2 106991037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, m4 
107091037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m1, m5 107191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 107291037db265ecdd914a26e056cf69207b4f50924ehkuang%else 107391037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m4, m5 107491037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m1, m3, m5 107591037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m4, m5 107691037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 107791037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m4, filter_x_a 107891037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m3, filter_x_b 107991037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m4, filter_rnd 108091037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m2, filter_x_a 108191037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m1, filter_x_b 108291037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 108391037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m4, m3 108491037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, m1 108591037db265ecdd914a26e056cf69207b4f50924ehkuang mova m1, [dstq] 108691037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m4, 4 108791037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 108891037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m1, m5 108991037db265ecdd914a26e056cf69207b4f50924ehkuang ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we 109091037db265ecdd914a26e056cf69207b4f50924ehkuang ; have a 1-register shortage to be able to store the backup of the bilin 109191037db265ecdd914a26e056cf69207b4f50924ehkuang ; filtered second line as words as cache for the next line. Packing into 109291037db265ecdd914a26e056cf69207b4f50924ehkuang ; a byte costs 1 pack and 2 unpacks, but saves a register. 
109391037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m4, m2 109491037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 109591037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, m4 109691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 109791037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 109891037db265ecdd914a26e056cf69207b4f50924ehkuang ; FIXME(rbultje) pipeline 109991037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 110091037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 110191037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 110291037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 110391037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 110491037db265ecdd914a26e056cf69207b4f50924ehkuang mova m0, m4 110591037db265ecdd914a26e056cf69207b4f50924ehkuang 110691037db265ecdd914a26e056cf69207b4f50924ehkuang add srcq, src_strideq 110791037db265ecdd914a26e056cf69207b4f50924ehkuang add dstq, dst_strideq 110891037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16 110968e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m0, [srcq] 111068e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [srcq+1] 111191037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 111291037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m1 111391037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m0, filter_x_a 111491037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 111591037db265ecdd914a26e056cf69207b4f50924ehkuang%else 111691037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 111791037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 111891037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m0, filter_x_a 111991037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m1, filter_x_b 112091037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 112191037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, m1 
112291037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 112391037db265ecdd914a26e056cf69207b4f50924ehkuang add srcq, src_strideq 112491037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m0, 4 112591037db265ecdd914a26e056cf69207b4f50924ehkuang.x_other_y_half_loop: 112668e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m2, [srcq] 112768e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [srcq+1] 112868e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m4, [srcq+src_strideq] 112968e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [srcq+src_strideq+1] 113091037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 113191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m2, m1 113291037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m4, m3 113391037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m2, filter_x_a 113491037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m4, filter_x_a 113568e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 113668e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [dstq+dst_strideq] 113791037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 113891037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m4, filter_rnd 113991037db265ecdd914a26e056cf69207b4f50924ehkuang%else 114091037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m2, m5 114191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 114291037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m4, m5 114391037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 114491037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m2, filter_x_a 114591037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m1, filter_x_b 114691037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 114791037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m4, filter_x_a 114891037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m3, filter_x_b 114991037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m4, filter_rnd 
115091037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, m1 115168e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 115291037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m4, m3 115368e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [dstq+dst_strideq] 115491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 115591037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 115691037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m4, 4 115791037db265ecdd914a26e056cf69207b4f50924ehkuang pavgw m0, m2 115891037db265ecdd914a26e056cf69207b4f50924ehkuang pavgw m2, m4 115991037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 116091037db265ecdd914a26e056cf69207b4f50924ehkuang ; FIXME(rbultje) pipeline - also consider going to bytes here 116168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 == 4 116268e1c830ade592be74773e249bf94e2bbfb50de7Johann movlhps m0, m2 116368e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 116491037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m0, m2 116568e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 116691037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 116791037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 116891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 116968e1c830ade592be74773e249bf94e2bbfb50de7Johann%else 117068e1c830ade592be74773e249bf94e2bbfb50de7Johann movh m2, [secq] 117168e1c830ade592be74773e249bf94e2bbfb50de7Johann pavgb m0, m2 117268e1c830ade592be74773e249bf94e2bbfb50de7Johann punpcklbw m0, m5 117368e1c830ade592be74773e249bf94e2bbfb50de7Johann movhlps m2, m0 117468e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 117591037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 117691037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 117791037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 117891037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 117991037db265ecdd914a26e056cf69207b4f50924ehkuang mova m0, 
m4 118091037db265ecdd914a26e056cf69207b4f50924ehkuang 118191037db265ecdd914a26e056cf69207b4f50924ehkuang lea srcq, [srcq+src_strideq*2] 118291037db265ecdd914a26e056cf69207b4f50924ehkuang lea dstq, [dstq+dst_strideq*2] 118391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 118491037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 118591037db265ecdd914a26e056cf69207b4f50924ehkuang add secq, sec_str 118691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 11877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian dec block_height 118891037db265ecdd914a26e056cf69207b4f50924ehkuang jg .x_other_y_half_loop 118991037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_x_a 119091037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_x_b 119191037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_rnd 119268e1c830ade592be74773e249bf94e2bbfb50de7Johann STORE_AND_RET %1 119391037db265ecdd914a26e056cf69207b4f50924ehkuang 119491037db265ecdd914a26e056cf69207b4f50924ehkuang.x_nonhalf_y_nonhalf: 119591037db265ecdd914a26e056cf69207b4f50924ehkuang%ifdef PIC 119691037db265ecdd914a26e056cf69207b4f50924ehkuang lea bilin_filter, [bilin_filter_m] 119791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 119891037db265ecdd914a26e056cf69207b4f50924ehkuang shl x_offsetd, filter_idx_shift 119991037db265ecdd914a26e056cf69207b4f50924ehkuang shl y_offsetd, filter_idx_shift 120068e1c830ade592be74773e249bf94e2bbfb50de7Johann%if ARCH_X86_64 && %1 > 4 120191037db265ecdd914a26e056cf69207b4f50924ehkuang mova m8, [bilin_filter+x_offsetq] 120291037db265ecdd914a26e056cf69207b4f50924ehkuang%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 120391037db265ecdd914a26e056cf69207b4f50924ehkuang mova m9, [bilin_filter+x_offsetq+16] 120491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 120591037db265ecdd914a26e056cf69207b4f50924ehkuang mova m10, [bilin_filter+y_offsetq] 120691037db265ecdd914a26e056cf69207b4f50924ehkuang%if notcpuflag(ssse3) ; 
FIXME(rbultje) don't scatter registers on x86-64 120791037db265ecdd914a26e056cf69207b4f50924ehkuang mova m11, [bilin_filter+y_offsetq+16] 120891037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 120991037db265ecdd914a26e056cf69207b4f50924ehkuang mova m12, [pw_8] 121091037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_a m8 121191037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_b m9 121291037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_a m10 121391037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_b m11 121491037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd m12 12159b35249446b07f40ac5fcc3205f2c048616efacchkuang%else ; x86-32 12169b35249446b07f40ac5fcc3205f2c048616efacchkuang%if ARCH_X86=1 && CONFIG_PIC=1 12179b35249446b07f40ac5fcc3205f2c048616efacchkuang; In this case, there is NO unused register. Used src_stride register. Later, 12189b35249446b07f40ac5fcc3205f2c048616efacchkuang; src_stride has to be loaded from stack when it is needed. 
12199b35249446b07f40ac5fcc3205f2c048616efacchkuang%define tempq src_strideq 12209b35249446b07f40ac5fcc3205f2c048616efacchkuang mov tempq, g_bilin_filterm 12219b35249446b07f40ac5fcc3205f2c048616efacchkuang add x_offsetq, tempq 12229b35249446b07f40ac5fcc3205f2c048616efacchkuang add y_offsetq, tempq 12239b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_x_a [x_offsetq] 12249b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_x_b [x_offsetq+16] 12259b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_y_a [y_offsetq] 12269b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_y_b [y_offsetq+16] 12279b35249446b07f40ac5fcc3205f2c048616efacchkuang 12289b35249446b07f40ac5fcc3205f2c048616efacchkuang mov tempq, g_pw_8m 12299b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_rnd [tempq] 123091037db265ecdd914a26e056cf69207b4f50924ehkuang%else 123191037db265ecdd914a26e056cf69207b4f50924ehkuang add x_offsetq, bilin_filter 123291037db265ecdd914a26e056cf69207b4f50924ehkuang add y_offsetq, bilin_filter 123391037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_a [x_offsetq] 123491037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_b [x_offsetq+16] 123591037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_a [y_offsetq] 123691037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_b [y_offsetq+16] 123791037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd [pw_8] 123891037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 12399b35249446b07f40ac5fcc3205f2c048616efacchkuang%endif 12409b35249446b07f40ac5fcc3205f2c048616efacchkuang 124191037db265ecdd914a26e056cf69207b4f50924ehkuang ; x_offset == bilin interpolation && y_offset == bilin interpolation 124291037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16 124391037db265ecdd914a26e056cf69207b4f50924ehkuang movu m0, [srcq] 124491037db265ecdd914a26e056cf69207b4f50924ehkuang movu m1, [srcq+1] 124591037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 
124691037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m1 124791037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m1 124891037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m2, filter_x_a 124991037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m0, filter_x_a 125091037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 125191037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 125291037db265ecdd914a26e056cf69207b4f50924ehkuang%else 125391037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 125491037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m1, m5 125591037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 125691037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 125791037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m0, filter_x_a 125891037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m1, filter_x_b 125991037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 126091037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m2, filter_x_a 126191037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m3, filter_x_b 126291037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 126391037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, m1 126491037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, m3 126591037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 126691037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m0, 4 126791037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 12689b35249446b07f40ac5fcc3205f2c048616efacchkuang 12699b35249446b07f40ac5fcc3205f2c048616efacchkuang INC_SRC_BY_SRC_STRIDE 12709b35249446b07f40ac5fcc3205f2c048616efacchkuang 127191037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m0, m2 127291037db265ecdd914a26e056cf69207b4f50924ehkuang.x_other_y_other_loop: 127391037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 127491037db265ecdd914a26e056cf69207b4f50924ehkuang movu m4, [srcq] 
127591037db265ecdd914a26e056cf69207b4f50924ehkuang movu m3, [srcq+1] 127691037db265ecdd914a26e056cf69207b4f50924ehkuang mova m1, [dstq] 127791037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m4, m3 127891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m4, m3 127991037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m2, filter_x_a 128091037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m4, filter_x_a 128191037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m1, m5 128291037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 128391037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m4, filter_rnd 128491037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 128591037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m4, 4 128691037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m4, m2 128791037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m4 128891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m4 128991037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m2, filter_y_a 129091037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m0, filter_y_a 129191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 129291037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 129391037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 129491037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 129591037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m0, 4 129691037db265ecdd914a26e056cf69207b4f50924ehkuang%else 129791037db265ecdd914a26e056cf69207b4f50924ehkuang movu m3, [srcq] 129891037db265ecdd914a26e056cf69207b4f50924ehkuang movu m4, [srcq+1] 129991037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m1, m3, m5 130091037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m4, m5 130191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 130291037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m4, m5 130391037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw 
m3, filter_x_a 130491037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m4, filter_x_b 130591037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m3, filter_rnd 130691037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m1, filter_x_a 130791037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m2, filter_x_b 130891037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m1, filter_rnd 130991037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m3, m4 131091037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m1, m2 131191037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m3, 4 131291037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m1, 4 131391037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m4, m3, m1 131491037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 131591037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 131691037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m2, filter_y_a 131791037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m1, filter_y_b 131891037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 131991037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m0, filter_y_a 132091037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m3, filter_y_b 132191037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, m1 132291037db265ecdd914a26e056cf69207b4f50924ehkuang mova m1, [dstq] 132391037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 132491037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 132591037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, m3 132691037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m3, m1, m5 132791037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m0, 4 132891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 132991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 133091037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 133191037db265ecdd914a26e056cf69207b4f50924ehkuang ; FIXME(rbultje) pipeline 133291037db265ecdd914a26e056cf69207b4f50924ehkuang 
packuswb m0, m2 133391037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 133491037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 133591037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 133691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 133791037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 133891037db265ecdd914a26e056cf69207b4f50924ehkuang mova m0, m4 133991037db265ecdd914a26e056cf69207b4f50924ehkuang 13409b35249446b07f40ac5fcc3205f2c048616efacchkuang INC_SRC_BY_SRC_STRIDE 134191037db265ecdd914a26e056cf69207b4f50924ehkuang add dstq, dst_strideq 134291037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16 134368e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m0, [srcq] 134468e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [srcq+1] 134591037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 134691037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m1 134791037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m0, filter_x_a 134891037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 134991037db265ecdd914a26e056cf69207b4f50924ehkuang%else 135091037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 135191037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 135291037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m0, filter_x_a 135391037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m1, filter_x_b 135491037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 135591037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, m1 135691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 135791037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m0, 4 135891037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 135991037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m0, m0 136091037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 13619b35249446b07f40ac5fcc3205f2c048616efacchkuang 13629b35249446b07f40ac5fcc3205f2c048616efacchkuang 
INC_SRC_BY_SRC_STRIDE 13639b35249446b07f40ac5fcc3205f2c048616efacchkuang 136491037db265ecdd914a26e056cf69207b4f50924ehkuang.x_other_y_other_loop: 136568e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m2, [srcq] 136668e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [srcq+1] 13679b35249446b07f40ac5fcc3205f2c048616efacchkuang 13689b35249446b07f40ac5fcc3205f2c048616efacchkuang INC_SRC_BY_SRC_STRIDE 136968e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m4, [srcq] 137068e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [srcq+1] 13719b35249446b07f40ac5fcc3205f2c048616efacchkuang 137291037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3) 137391037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m2, m1 137491037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m4, m3 137591037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m2, filter_x_a 137691037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m4, filter_x_a 137768e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [dstq+dst_strideq] 137868e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 137991037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 138091037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m4, filter_rnd 138191037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 138291037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m4, 4 138391037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m2, m2 138491037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m4, m4 138591037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m2 138691037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m2, m4 138791037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m0, filter_y_a 138891037db265ecdd914a26e056cf69207b4f50924ehkuang pmaddubsw m2, filter_y_a 138991037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 139091037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 139191037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 
139291037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m0, 4 139391037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 139491037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 139591037db265ecdd914a26e056cf69207b4f50924ehkuang%else 139691037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m2, m5 139791037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 139891037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m4, m5 139991037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 140091037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m2, filter_x_a 140191037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m1, filter_x_b 140291037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 140391037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m4, filter_x_a 140491037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m3, filter_x_b 140591037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m4, filter_rnd 140691037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, m1 140791037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m4, m3 140891037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 140991037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m4, 4 141091037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m0, filter_y_a 141191037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m3, m2, filter_y_b 141291037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, filter_rnd 141391037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m2, filter_y_a 141491037db265ecdd914a26e056cf69207b4f50924ehkuang pmullw m1, m4, filter_y_b 141591037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, filter_rnd 141691037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m0, m3 141768e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m3, [dstq+dst_strideq] 141891037db265ecdd914a26e056cf69207b4f50924ehkuang paddw m2, m1 141968e1c830ade592be74773e249bf94e2bbfb50de7Johann movx m1, [dstq] 142091037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m0, 4 
142191037db265ecdd914a26e056cf69207b4f50924ehkuang psraw m2, 4 142291037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m3, m5 142391037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m1, m5 142491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 142591037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 142691037db265ecdd914a26e056cf69207b4f50924ehkuang ; FIXME(rbultje) pipeline 142768e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 == 4 142868e1c830ade592be74773e249bf94e2bbfb50de7Johann movlhps m0, m2 142968e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 143091037db265ecdd914a26e056cf69207b4f50924ehkuang packuswb m0, m2 143168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4 143291037db265ecdd914a26e056cf69207b4f50924ehkuang pavgb m0, [secq] 143391037db265ecdd914a26e056cf69207b4f50924ehkuang punpckhbw m2, m0, m5 143491037db265ecdd914a26e056cf69207b4f50924ehkuang punpcklbw m0, m5 143568e1c830ade592be74773e249bf94e2bbfb50de7Johann%else 143668e1c830ade592be74773e249bf94e2bbfb50de7Johann movh m2, [secq] 143768e1c830ade592be74773e249bf94e2bbfb50de7Johann pavgb m0, m2 143868e1c830ade592be74773e249bf94e2bbfb50de7Johann punpcklbw m0, m5 143968e1c830ade592be74773e249bf94e2bbfb50de7Johann movhlps m2, m0 144068e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif 144191037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 144291037db265ecdd914a26e056cf69207b4f50924ehkuang SUM_SSE m0, m1, m2, m3, m6, m7 144391037db265ecdd914a26e056cf69207b4f50924ehkuang mova m0, m4 144491037db265ecdd914a26e056cf69207b4f50924ehkuang 14459b35249446b07f40ac5fcc3205f2c048616efacchkuang INC_SRC_BY_SRC_STRIDE 144691037db265ecdd914a26e056cf69207b4f50924ehkuang lea dstq, [dstq+dst_strideq*2] 144791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 144891037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg 144991037db265ecdd914a26e056cf69207b4f50924ehkuang add secq, sec_str 145091037db265ecdd914a26e056cf69207b4f50924ehkuang%endif 
14517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian dec block_height 145291037db265ecdd914a26e056cf69207b4f50924ehkuang jg .x_other_y_other_loop 145391037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_x_a 145491037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_x_b 145591037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_y_a 145691037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_y_b 145791037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_rnd 145868e1c830ade592be74773e249bf94e2bbfb50de7Johann%undef movx 145968e1c830ade592be74773e249bf94e2bbfb50de7Johann STORE_AND_RET %1 146091037db265ecdd914a26e056cf69207b4f50924ehkuang%endmacro 146191037db265ecdd914a26e056cf69207b4f50924ehkuang 146291037db265ecdd914a26e056cf69207b4f50924ehkuang; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical 146391037db265ecdd914a26e056cf69207b4f50924ehkuang; between the ssse3 and non-ssse3 version. It may make sense to merge their 146491037db265ecdd914a26e056cf69207b4f50924ehkuang; code in the sense that the ssse3 version would jump to the appropriate 146591037db265ecdd914a26e056cf69207b4f50924ehkuang; location in the sse/2 version, rather than duplicating that code in the 146691037db265ecdd914a26e056cf69207b4f50924ehkuang; binary. 
; Instantiations of the SUBPEL_VARIANCE macro.
;
; Four groups follow, each expanding the macro with first argument 4, 8
; and 16:
;   1. after INIT_XMM sse2,  no second argument
;   2. after INIT_XMM ssse3, no second argument
;   3. after INIT_XMM sse2,  second argument 1
;   4. after INIT_XMM ssse3, second argument 1
;
; The macro body branches on cpuflag(ssse3): the ssse3 expansions use the
; pmaddubsw-based filter code, the others the pmullw/paddw code.
; NOTE(review): INIT_XMM is provided by x86inc.asm and is not visible here;
; presumably it selects the instruction set for the expansions that follow
; it -- confirm against x86inc.asm.
;
; The first argument is tested by the macro as %1 (%1 == 4 gets special
; handling); presumably it is the block width N of
; vpx_sub_pixel_varianceNxh -- verify against the macro header.
;
; When the second argument is 1 the macro enables its "avg" path (%2 == 1),
; which averages the filtered result with data loaded from secq (pavgb) and
; advances secq on every loop iteration.
146791037db265ecdd914a26e056cf69207b4f50924ehkuang 146891037db265ecdd914a26e056cf69207b4f50924ehkuangINIT_XMM sse2 146968e1c830ade592be74773e249bf94e2bbfb50de7JohannSUBPEL_VARIANCE 4 147091037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE 8 147191037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE 16 147291037db265ecdd914a26e056cf69207b4f50924ehkuang 147391037db265ecdd914a26e056cf69207b4f50924ehkuangINIT_XMM ssse3 147468e1c830ade592be74773e249bf94e2bbfb50de7JohannSUBPEL_VARIANCE 4 147591037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE 8 147691037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE 16 147791037db265ecdd914a26e056cf69207b4f50924ehkuang 147891037db265ecdd914a26e056cf69207b4f50924ehkuangINIT_XMM sse2 147968e1c830ade592be74773e249bf94e2bbfb50de7JohannSUBPEL_VARIANCE 4, 1 148091037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE 8, 1 148191037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE 16, 1 148291037db265ecdd914a26e056cf69207b4f50924ehkuang 148391037db265ecdd914a26e056cf69207b4f50924ehkuangINIT_XMM ssse3 148468e1c830ade592be74773e249bf94e2bbfb50de7JohannSUBPEL_VARIANCE 4, 1 148591037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE 8, 1 148691037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE 16, 1 1487