191037db265ecdd914a26e056cf69207b4f50924ehkuang;
291037db265ecdd914a26e056cf69207b4f50924ehkuang;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
391037db265ecdd914a26e056cf69207b4f50924ehkuang;
491037db265ecdd914a26e056cf69207b4f50924ehkuang;  Use of this source code is governed by a BSD-style license
591037db265ecdd914a26e056cf69207b4f50924ehkuang;  that can be found in the LICENSE file in the root of the source
691037db265ecdd914a26e056cf69207b4f50924ehkuang;  tree. An additional intellectual property rights grant can be found
791037db265ecdd914a26e056cf69207b4f50924ehkuang;  in the file PATENTS.  All contributing project authors may
891037db265ecdd914a26e056cf69207b4f50924ehkuang;  be found in the AUTHORS file in the root of the source tree.
991037db265ecdd914a26e056cf69207b4f50924ehkuang;
1091037db265ecdd914a26e056cf69207b4f50924ehkuang
1191037db265ecdd914a26e056cf69207b4f50924ehkuang%include "third_party/x86inc/x86inc.asm"
1291037db265ecdd914a26e056cf69207b4f50924ehkuang
; Read-only constants used by the sub-pixel variance code below.
1391037db265ecdd914a26e056cf69207b4f50924ehkuangSECTION_RODATA
; pw_8: eight 16-bit words, each equal to 8. SUBPEL_VARIANCE uses it as
; filter_rnd, the constant added to the filtered sums.
1491037db265ecdd914a26e056cf69207b4f50924ehkuangpw_8: times  8 dw  8
; bilin_filter_m_sse2: eight filter entries of 32 bytes each. Every entry is
; two rows of eight identical words, (a) followed by (b), with a + b == 16:
; (16,0) (14,2) (12,4) (10,6) (8,8) (6,10) (4,12) (2,14).
; The 32-byte entry size matches filter_idx_shift == 5 in the non-SSSE3 path.
1591037db265ecdd914a26e056cf69207b4f50924ehkuangbilin_filter_m_sse2: times  8 dw 16
1691037db265ecdd914a26e056cf69207b4f50924ehkuang                     times  8 dw  0
1791037db265ecdd914a26e056cf69207b4f50924ehkuang                     times  8 dw 14
1891037db265ecdd914a26e056cf69207b4f50924ehkuang                     times  8 dw  2
1991037db265ecdd914a26e056cf69207b4f50924ehkuang                     times  8 dw 12
2091037db265ecdd914a26e056cf69207b4f50924ehkuang                     times  8 dw  4
2191037db265ecdd914a26e056cf69207b4f50924ehkuang                     times  8 dw 10
2291037db265ecdd914a26e056cf69207b4f50924ehkuang                     times  8 dw  6
; The (8, 8) entry: both rows are identical, so they are emitted as one run.
2391037db265ecdd914a26e056cf69207b4f50924ehkuang                     times 16 dw  8
2491037db265ecdd914a26e056cf69207b4f50924ehkuang                     times  8 dw  6
2591037db265ecdd914a26e056cf69207b4f50924ehkuang                     times  8 dw 10
2691037db265ecdd914a26e056cf69207b4f50924ehkuang                     times  8 dw  4
2791037db265ecdd914a26e056cf69207b4f50924ehkuang                     times  8 dw 12
2891037db265ecdd914a26e056cf69207b4f50924ehkuang                     times  8 dw  2
2991037db265ecdd914a26e056cf69207b4f50924ehkuang                     times  8 dw 14
3091037db265ecdd914a26e056cf69207b4f50924ehkuang
; bilin_filter_m_ssse3: the same eight (a, b) coefficient pairs, stored as
; interleaved bytes a, b, a, b, ... so one 16-byte row holds a whole entry.
; The 16-byte entry size matches filter_idx_shift == 4 in the SSSE3 path,
; where the row is used as the pmaddubsw operand.
3191037db265ecdd914a26e056cf69207b4f50924ehkuangbilin_filter_m_ssse3: times  8 db 16,  0
3291037db265ecdd914a26e056cf69207b4f50924ehkuang                      times  8 db 14,  2
3391037db265ecdd914a26e056cf69207b4f50924ehkuang                      times  8 db 12,  4
3491037db265ecdd914a26e056cf69207b4f50924ehkuang                      times  8 db 10,  6
; The (8, 8) entry: sixteen identical bytes.
3591037db265ecdd914a26e056cf69207b4f50924ehkuang                      times 16 db  8
3691037db265ecdd914a26e056cf69207b4f50924ehkuang                      times  8 db  6, 10
3791037db265ecdd914a26e056cf69207b4f50924ehkuang                      times  8 db  4, 12
3891037db265ecdd914a26e056cf69207b4f50924ehkuang                      times  8 db  2, 14
3991037db265ecdd914a26e056cf69207b4f50924ehkuang
4091037db265ecdd914a26e056cf69207b4f50924ehkuangSECTION .text
4191037db265ecdd914a26e056cf69207b4f50924ehkuang
427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
4391037db265ecdd914a26e056cf69207b4f50924ehkuang;                               int x_offset, int y_offset,
4491037db265ecdd914a26e056cf69207b4f50924ehkuang;                               const uint8_t *dst, ptrdiff_t dst_stride,
4591037db265ecdd914a26e056cf69207b4f50924ehkuang;                               int height, unsigned int *sse);
4691037db265ecdd914a26e056cf69207b4f50924ehkuang;
4791037db265ecdd914a26e056cf69207b4f50924ehkuang; This function returns the sum of differences (SE) and stores the sum of
; squared differences (SSE) through the sse pointer.
4891037db265ecdd914a26e056cf69207b4f50924ehkuang
; SUM_SSE src1, dst1, src2, dst2, sum, sse
;
; Accumulates the differences (src - dst) of two vectors of 16-bit words.
;   %1, %3 (src1, src2): word vectors; overwritten with intermediate results.
;   %2, %4 (dst1, dst2): word vectors that are subtracted; not written.
;   %5 (sum): running per-lane 16-bit sum of the differences.
;   %6 (sse): running per-lane 32-bit sum of the squared differences.
; The two halves are interleaved rather than processed one after the other.
4991037db265ecdd914a26e056cf69207b4f50924ehkuang%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
5091037db265ecdd914a26e056cf69207b4f50924ehkuang  psubw                %3, %4           ; %3 = src2 - dst2
5191037db265ecdd914a26e056cf69207b4f50924ehkuang  psubw                %1, %2           ; %1 = src1 - dst1
5291037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                %5, %3           ; sum += (src2 - dst2)
5391037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddwd              %3, %3           ; square words, add adjacent pairs -> dwords
5491037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                %5, %1           ; sum += (src1 - dst1)
5591037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddwd              %1, %1           ; square words, add adjacent pairs -> dwords
5691037db265ecdd914a26e056cf69207b4f50924ehkuang  paddd                %6, %3           ; sse += squares from the second pair
5791037db265ecdd914a26e056cf69207b4f50924ehkuang  paddd                %6, %1           ; sse += squares from the first pair
5891037db265ecdd914a26e056cf69207b4f50924ehkuang%endmacro
5991037db265ecdd914a26e056cf69207b4f50924ehkuang
; STORE_AND_RET W
;
; Reduces the accumulators horizontally, stores the SSE and returns the sum.
;   %1: block width; selects the full-register (> 4) or low-half (4xh) path.
; On entry: m6 = per-lane word sums, m7 = per-lane dword SSE, m5 = zero.
; On exit:  the dword at the address read from ssem holds the total SSE, and
;           raxd holds the sign-extended total sum. m3, m4, m5, m6, m7 and r1
;           are overwritten. The macro ends with RET.
6068e1c830ade592be74773e249bf94e2bbfb50de7Johann%macro STORE_AND_RET 1
6168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
6291037db265ecdd914a26e056cf69207b4f50924ehkuang  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
6391037db265ecdd914a26e056cf69207b4f50924ehkuang  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
6491037db265ecdd914a26e056cf69207b4f50924ehkuang  ; We have to sign-extend it before adding the words within the register
6591037db265ecdd914a26e056cf69207b4f50924ehkuang  ; and outputting to a dword.
6691037db265ecdd914a26e056cf69207b4f50924ehkuang  pcmpgtw              m5, m6           ; mask for 0 > x
6791037db265ecdd914a26e056cf69207b4f50924ehkuang  movhlps              m3, m7           ; m3 low qword = sse high qword
6891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklwd            m4, m6, m5       ; m4 = low 4 sum words, sign-extended
6991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhwd            m6, m5           ; sign-extend m6 word->dword
7091037db265ecdd914a26e056cf69207b4f50924ehkuang  paddd                m7, m3           ; sse: fold high qword into low qword
7191037db265ecdd914a26e056cf69207b4f50924ehkuang  paddd                m6, m4           ; sum: 8 words folded to 4 dwords
7291037db265ecdd914a26e056cf69207b4f50924ehkuang  pshufd               m3, m7, 0x1      ; m3 dword 0 = sse dword 1
7391037db265ecdd914a26e056cf69207b4f50924ehkuang  movhlps              m4, m6           ; m4 low qword = sum high qword
7491037db265ecdd914a26e056cf69207b4f50924ehkuang  paddd                m7, m3           ; sse total now in dword 0
7591037db265ecdd914a26e056cf69207b4f50924ehkuang  paddd                m6, m4           ; sum: 4 dwords folded to 2
7691037db265ecdd914a26e056cf69207b4f50924ehkuang  mov                  r1, ssem         ; r1 = unsigned int *sse
7791037db265ecdd914a26e056cf69207b4f50924ehkuang  pshufd               m4, m6, 0x1      ; m4 dword 0 = sum dword 1
7891037db265ecdd914a26e056cf69207b4f50924ehkuang  movd               [r1], m7           ; store sse
7991037db265ecdd914a26e056cf69207b4f50924ehkuang  paddd                m6, m4           ; sum total now in dword 0
807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  movd               raxd, m6           ; store sum as return value
8168e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh
  ; Only the low 64 bits of m6/m7 are reduced in this branch.
8268e1c830ade592be74773e249bf94e2bbfb50de7Johann  pshuflw              m4, m6, 0xe      ; m4 words 0,1 = sum words 2,3
8368e1c830ade592be74773e249bf94e2bbfb50de7Johann  pshuflw              m3, m7, 0xe      ; m3 dword 0 = sse dword 1
8491037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m6, m4           ; sum: 4 words folded to 2 (still words)
8591037db265ecdd914a26e056cf69207b4f50924ehkuang  paddd                m7, m3           ; sse total now in dword 0
8691037db265ecdd914a26e056cf69207b4f50924ehkuang  pcmpgtw              m5, m6           ; mask for 0 > x
8791037db265ecdd914a26e056cf69207b4f50924ehkuang  mov                  r1, ssem         ; r1 = unsigned int *sse
8891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklwd            m6, m5           ; sign-extend m6 word->dword
8991037db265ecdd914a26e056cf69207b4f50924ehkuang  movd               [r1], m7           ; store sse
9068e1c830ade592be74773e249bf94e2bbfb50de7Johann  pshuflw              m4, m6, 0xe      ; m4 dword 0 = sum dword 1
9191037db265ecdd914a26e056cf69207b4f50924ehkuang  paddd                m6, m4           ; sum total now in dword 0
927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  movd               raxd, m6           ; store sum as return value
9391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
9491037db265ecdd914a26e056cf69207b4f50924ehkuang  RET
9591037db265ecdd914a26e056cf69207b4f50924ehkuang%endmacro
9691037db265ecdd914a26e056cf69207b4f50924ehkuang
; INC_SRC_BY_SRC_STRIDE
;
; Adds the source stride to srcq. Takes no arguments.
; On 32-bit PIC builds (ARCH_X86=1 && CONFIG_PIC=1) the stride is added from
; the src_stridemp operand; all other builds add the src_strideq register.
; NOTE(review): presumably the stride register is not kept live in the 32-bit
; PIC configuration -- confirm against the callers in SUBPEL_VARIANCE.
979b35249446b07f40ac5fcc3205f2c048616efacchkuang%macro INC_SRC_BY_SRC_STRIDE  0
989b35249446b07f40ac5fcc3205f2c048616efacchkuang%if ARCH_X86=1 && CONFIG_PIC=1
999b35249446b07f40ac5fcc3205f2c048616efacchkuang  add                srcq, src_stridemp
1009b35249446b07f40ac5fcc3205f2c048616efacchkuang%else
1019b35249446b07f40ac5fcc3205f2c048616efacchkuang  add                srcq, src_strideq
1029b35249446b07f40ac5fcc3205f2c048616efacchkuang%endif
1039b35249446b07f40ac5fcc3205f2c048616efacchkuang%endmacro
1049b35249446b07f40ac5fcc3205f2c048616efacchkuang
10591037db265ecdd914a26e056cf69207b4f50924ehkuang%macro SUBPEL_VARIANCE 1-2 0 ; W
10691037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
10791037db265ecdd914a26e056cf69207b4f50924ehkuang%define bilin_filter_m bilin_filter_m_ssse3
10891037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_idx_shift 4
10991037db265ecdd914a26e056cf69207b4f50924ehkuang%else
11091037db265ecdd914a26e056cf69207b4f50924ehkuang%define bilin_filter_m bilin_filter_m_sse2
11191037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_idx_shift 5
11291037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
11391037db265ecdd914a26e056cf69207b4f50924ehkuang; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
11491037db265ecdd914a26e056cf69207b4f50924ehkuang; 11, not 13, if the registers are ordered correctly. May make a minor speed
11591037db265ecdd914a26e056cf69207b4f50924ehkuang; difference on Win64
1169b35249446b07f40ac5fcc3205f2c048616efacchkuang
1179b35249446b07f40ac5fcc3205f2c048616efacchkuang%ifdef PIC    ; 64bit PIC
1189b35249446b07f40ac5fcc3205f2c048616efacchkuang  %if %2 == 1 ; avg
1199b35249446b07f40ac5fcc3205f2c048616efacchkuang    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
1209b35249446b07f40ac5fcc3205f2c048616efacchkuang                                      x_offset, y_offset, \
1219b35249446b07f40ac5fcc3205f2c048616efacchkuang                                      dst, dst_stride, \
1229b35249446b07f40ac5fcc3205f2c048616efacchkuang                                      sec, sec_stride, height, sse
1239b35249446b07f40ac5fcc3205f2c048616efacchkuang    %define sec_str sec_strideq
1249b35249446b07f40ac5fcc3205f2c048616efacchkuang  %else
1259b35249446b07f40ac5fcc3205f2c048616efacchkuang    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
1269b35249446b07f40ac5fcc3205f2c048616efacchkuang                                  y_offset, dst, dst_stride, height, sse
1279b35249446b07f40ac5fcc3205f2c048616efacchkuang  %endif
1287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  %define block_height heightd
1299b35249446b07f40ac5fcc3205f2c048616efacchkuang  %define bilin_filter sseq
13091037db265ecdd914a26e056cf69207b4f50924ehkuang%else
1319b35249446b07f40ac5fcc3205f2c048616efacchkuang  %if ARCH_X86=1 && CONFIG_PIC=1
1329b35249446b07f40ac5fcc3205f2c048616efacchkuang    %if %2 == 1 ; avg
1339b35249446b07f40ac5fcc3205f2c048616efacchkuang      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
1349b35249446b07f40ac5fcc3205f2c048616efacchkuang                                  x_offset, y_offset, \
1359b35249446b07f40ac5fcc3205f2c048616efacchkuang                                  dst, dst_stride, \
1369b35249446b07f40ac5fcc3205f2c048616efacchkuang                                  sec, sec_stride, \
1379b35249446b07f40ac5fcc3205f2c048616efacchkuang                                  height, sse, g_bilin_filter, g_pw_8
1387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      %define block_height dword heightm
1399b35249446b07f40ac5fcc3205f2c048616efacchkuang      %define sec_str sec_stridemp
1409b35249446b07f40ac5fcc3205f2c048616efacchkuang
1419b35249446b07f40ac5fcc3205f2c048616efacchkuang      ;Store bilin_filter and pw_8 location in stack
14268e1c830ade592be74773e249bf94e2bbfb50de7Johann      %if GET_GOT_DEFINED == 1
14368e1c830ade592be74773e249bf94e2bbfb50de7Johann        GET_GOT eax
14468e1c830ade592be74773e249bf94e2bbfb50de7Johann        add esp, 4                ; restore esp
14568e1c830ade592be74773e249bf94e2bbfb50de7Johann      %endif
1469b35249446b07f40ac5fcc3205f2c048616efacchkuang
1479b35249446b07f40ac5fcc3205f2c048616efacchkuang      lea ecx, [GLOBAL(bilin_filter_m)]
1489b35249446b07f40ac5fcc3205f2c048616efacchkuang      mov g_bilin_filterm, ecx
1499b35249446b07f40ac5fcc3205f2c048616efacchkuang
1509b35249446b07f40ac5fcc3205f2c048616efacchkuang      lea ecx, [GLOBAL(pw_8)]
1519b35249446b07f40ac5fcc3205f2c048616efacchkuang      mov g_pw_8m, ecx
1529b35249446b07f40ac5fcc3205f2c048616efacchkuang
1539b35249446b07f40ac5fcc3205f2c048616efacchkuang      LOAD_IF_USED 0, 1         ; load eax, ecx back
1549b35249446b07f40ac5fcc3205f2c048616efacchkuang    %else
1559b35249446b07f40ac5fcc3205f2c048616efacchkuang      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
1569b35249446b07f40ac5fcc3205f2c048616efacchkuang                                y_offset, dst, dst_stride, height, sse, \
1579b35249446b07f40ac5fcc3205f2c048616efacchkuang                                g_bilin_filter, g_pw_8
1587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      %define block_height heightd
1599b35249446b07f40ac5fcc3205f2c048616efacchkuang
1609b35249446b07f40ac5fcc3205f2c048616efacchkuang      ;Store bilin_filter and pw_8 location in stack
16168e1c830ade592be74773e249bf94e2bbfb50de7Johann      %if GET_GOT_DEFINED == 1
16268e1c830ade592be74773e249bf94e2bbfb50de7Johann        GET_GOT eax
16368e1c830ade592be74773e249bf94e2bbfb50de7Johann        add esp, 4                ; restore esp
16468e1c830ade592be74773e249bf94e2bbfb50de7Johann      %endif
1659b35249446b07f40ac5fcc3205f2c048616efacchkuang
1669b35249446b07f40ac5fcc3205f2c048616efacchkuang      lea ecx, [GLOBAL(bilin_filter_m)]
1679b35249446b07f40ac5fcc3205f2c048616efacchkuang      mov g_bilin_filterm, ecx
1689b35249446b07f40ac5fcc3205f2c048616efacchkuang
1699b35249446b07f40ac5fcc3205f2c048616efacchkuang      lea ecx, [GLOBAL(pw_8)]
1709b35249446b07f40ac5fcc3205f2c048616efacchkuang      mov g_pw_8m, ecx
1719b35249446b07f40ac5fcc3205f2c048616efacchkuang
1729b35249446b07f40ac5fcc3205f2c048616efacchkuang      LOAD_IF_USED 0, 1         ; load eax, ecx back
1739b35249446b07f40ac5fcc3205f2c048616efacchkuang    %endif
1749b35249446b07f40ac5fcc3205f2c048616efacchkuang  %else
1759b35249446b07f40ac5fcc3205f2c048616efacchkuang    %if %2 == 1 ; avg
1769b35249446b07f40ac5fcc3205f2c048616efacchkuang      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
1779b35249446b07f40ac5fcc3205f2c048616efacchkuang                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
1789b35249446b07f40ac5fcc3205f2c048616efacchkuang                                             x_offset, y_offset, \
1799b35249446b07f40ac5fcc3205f2c048616efacchkuang                                             dst, dst_stride, \
1809b35249446b07f40ac5fcc3205f2c048616efacchkuang                                             sec, sec_stride, \
1819b35249446b07f40ac5fcc3205f2c048616efacchkuang                                             height, sse
1829b35249446b07f40ac5fcc3205f2c048616efacchkuang      %if ARCH_X86_64
1837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      %define block_height heightd
1849b35249446b07f40ac5fcc3205f2c048616efacchkuang      %define sec_str sec_strideq
1859b35249446b07f40ac5fcc3205f2c048616efacchkuang      %else
1867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      %define block_height dword heightm
1879b35249446b07f40ac5fcc3205f2c048616efacchkuang      %define sec_str sec_stridemp
1889b35249446b07f40ac5fcc3205f2c048616efacchkuang      %endif
1899b35249446b07f40ac5fcc3205f2c048616efacchkuang    %else
1909b35249446b07f40ac5fcc3205f2c048616efacchkuang      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
1919b35249446b07f40ac5fcc3205f2c048616efacchkuang                              y_offset, dst, dst_stride, height, sse
1927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      %define block_height heightd
1939b35249446b07f40ac5fcc3205f2c048616efacchkuang    %endif
1949b35249446b07f40ac5fcc3205f2c048616efacchkuang
1959b35249446b07f40ac5fcc3205f2c048616efacchkuang    %define bilin_filter bilin_filter_m
1969b35249446b07f40ac5fcc3205f2c048616efacchkuang  %endif
19791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
1989b35249446b07f40ac5fcc3205f2c048616efacchkuang
19968e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 == 4
20068e1c830ade592be74773e249bf94e2bbfb50de7Johann  %define movx movd
20168e1c830ade592be74773e249bf94e2bbfb50de7Johann%else
20268e1c830ade592be74773e249bf94e2bbfb50de7Johann  %define movx movh
20368e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
20468e1c830ade592be74773e249bf94e2bbfb50de7Johann
20591037db265ecdd914a26e056cf69207b4f50924ehkuang  ASSERT               %1 <= 16         ; m6 overflows if w > 16
20691037db265ecdd914a26e056cf69207b4f50924ehkuang  pxor                 m6, m6           ; sum
20791037db265ecdd914a26e056cf69207b4f50924ehkuang  pxor                 m7, m7           ; sse
20891037db265ecdd914a26e056cf69207b4f50924ehkuang  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
20991037db265ecdd914a26e056cf69207b4f50924ehkuang  ; could perhaps use it for something more productive then
21091037db265ecdd914a26e056cf69207b4f50924ehkuang  pxor                 m5, m5           ; dedicated zero register
21191037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 < 16
2127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  sar                   block_height, 1
21391037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
21491037db265ecdd914a26e056cf69207b4f50924ehkuang  shl             sec_str, 1
21591037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
21691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
21791037db265ecdd914a26e056cf69207b4f50924ehkuang
21891037db265ecdd914a26e056cf69207b4f50924ehkuang  ; FIXME(rbultje) replace by jumptable?
21991037db265ecdd914a26e056cf69207b4f50924ehkuang  test          x_offsetd, x_offsetd
22091037db265ecdd914a26e056cf69207b4f50924ehkuang  jnz .x_nonzero
22191037db265ecdd914a26e056cf69207b4f50924ehkuang  ; x_offset == 0
22291037db265ecdd914a26e056cf69207b4f50924ehkuang  test          y_offsetd, y_offsetd
22391037db265ecdd914a26e056cf69207b4f50924ehkuang  jnz .x_zero_y_nonzero
22491037db265ecdd914a26e056cf69207b4f50924ehkuang
22591037db265ecdd914a26e056cf69207b4f50924ehkuang  ; x_offset == 0 && y_offset == 0
22691037db265ecdd914a26e056cf69207b4f50924ehkuang.x_zero_y_zero_loop:
22791037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16
22891037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m0, [srcq]
22991037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m1, [dstq]
23091037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
23191037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
23291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m1, m5
23391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
23491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
23591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
23691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
23768e1c830ade592be74773e249bf94e2bbfb50de7Johann
23891037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 0 ; !avg
23991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m1, m5
24091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
24191037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
24291037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
24391037db265ecdd914a26e056cf69207b4f50924ehkuang
24491037db265ecdd914a26e056cf69207b4f50924ehkuang  add                srcq, src_strideq
24591037db265ecdd914a26e056cf69207b4f50924ehkuang  add                dstq, dst_strideq
24691037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16
24768e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m0, [srcq]
24891037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
24968e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
25091037db265ecdd914a26e056cf69207b4f50924ehkuang  movhps               m0, [srcq+src_strideq]
25168e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh
25268e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [srcq+src_strideq]
25368e1c830ade592be74773e249bf94e2bbfb50de7Johann  punpckldq            m0, m1
25491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
25591037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; !avg
25668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m2, [srcq+src_strideq]
25791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
25868e1c830ade592be74773e249bf94e2bbfb50de7Johann
25968e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
26068e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [dstq+dst_strideq]
26168e1c830ade592be74773e249bf94e2bbfb50de7Johann
26291037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
26368e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
26491037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
26568e1c830ade592be74773e249bf94e2bbfb50de7Johann%else
26668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movh                 m2, [secq]
26768e1c830ade592be74773e249bf94e2bbfb50de7Johann  pavgb                m0, m2
26868e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
26991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
27091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
27168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
27291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
27391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
27468e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh
27568e1c830ade592be74773e249bf94e2bbfb50de7Johann  punpcklbw            m0, m5
27668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movhlps              m2, m0
27768e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
27891037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; !avg
27991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
28091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m2, m5
28191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
28291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
28391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
28491037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
28591037db265ecdd914a26e056cf69207b4f50924ehkuang
28691037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                srcq, [srcq+src_strideq*2]
28791037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                dstq, [dstq+dst_strideq*2]
28891037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
28991037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
29091037db265ecdd914a26e056cf69207b4f50924ehkuang  add                secq, sec_str
29191037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
2927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  dec                   block_height
29391037db265ecdd914a26e056cf69207b4f50924ehkuang  jg .x_zero_y_zero_loop
29468e1c830ade592be74773e249bf94e2bbfb50de7Johann  STORE_AND_RET %1
29591037db265ecdd914a26e056cf69207b4f50924ehkuang
29691037db265ecdd914a26e056cf69207b4f50924ehkuang.x_zero_y_nonzero:
29768e1c830ade592be74773e249bf94e2bbfb50de7Johann  cmp           y_offsetd, 4
29891037db265ecdd914a26e056cf69207b4f50924ehkuang  jne .x_zero_y_nonhalf
29991037db265ecdd914a26e056cf69207b4f50924ehkuang
30091037db265ecdd914a26e056cf69207b4f50924ehkuang  ; x_offset == 0 && y_offset == 0.5
30191037db265ecdd914a26e056cf69207b4f50924ehkuang.x_zero_y_half_loop:
30291037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16
30391037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m0, [srcq]
30491037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m4, [srcq+src_strideq]
30591037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m1, [dstq]
30691037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, m4
30791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m1, m5
30891037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
30991037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
31091037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
31191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
31291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
31391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
31491037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
31591037db265ecdd914a26e056cf69207b4f50924ehkuang
31691037db265ecdd914a26e056cf69207b4f50924ehkuang  add                srcq, src_strideq
31791037db265ecdd914a26e056cf69207b4f50924ehkuang  add                dstq, dst_strideq
31891037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16
31968e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m0, [srcq]
32068e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m2, [srcq+src_strideq]
32191037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
32268e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
32391037db265ecdd914a26e056cf69207b4f50924ehkuang  movhps               m2, [srcq+src_strideq*2]
32468e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh
32568e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [srcq+src_strideq*2]
3261184aebb761cbeac9124c37189a80a1a58f04b6bhkuang  punpckldq            m2, m1
3271184aebb761cbeac9124c37189a80a1a58f04b6bhkuang%endif
32868e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
32968e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
33091037db265ecdd914a26e056cf69207b4f50924ehkuang  movlhps              m0, m2
33168e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh
33291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckldq            m0, m2
33391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
33468e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [dstq+dst_strideq]
33591037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, m2
33691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
33768e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
33891037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
33991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
34091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
34191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
34268e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh
34368e1c830ade592be74773e249bf94e2bbfb50de7Johann  movh                 m4, [secq]
34468e1c830ade592be74773e249bf94e2bbfb50de7Johann  pavgb                m0, m4
34568e1c830ade592be74773e249bf94e2bbfb50de7Johann  punpcklbw            m3, m5
34668e1c830ade592be74773e249bf94e2bbfb50de7Johann  punpcklbw            m0, m5
34768e1c830ade592be74773e249bf94e2bbfb50de7Johann  movhlps              m2, m0
34868e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
34991037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; !avg
35068e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m4, [srcq+src_strideq*2]
35168e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
35291037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, m2
35368e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [dstq+dst_strideq]
35491037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m2, m4
35591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
35691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m2, m5
35791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
35891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
35991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
36091037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
36191037db265ecdd914a26e056cf69207b4f50924ehkuang
36291037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                srcq, [srcq+src_strideq*2]
36391037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                dstq, [dstq+dst_strideq*2]
36491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
36591037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
36691037db265ecdd914a26e056cf69207b4f50924ehkuang  add                secq, sec_str
36791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
3687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  dec                   block_height
36991037db265ecdd914a26e056cf69207b4f50924ehkuang  jg .x_zero_y_half_loop
37068e1c830ade592be74773e249bf94e2bbfb50de7Johann  STORE_AND_RET %1
37191037db265ecdd914a26e056cf69207b4f50924ehkuang
37291037db265ecdd914a26e056cf69207b4f50924ehkuang.x_zero_y_nonhalf:
37391037db265ecdd914a26e056cf69207b4f50924ehkuang  ; x_offset == 0 && y_offset == bilin interpolation
37491037db265ecdd914a26e056cf69207b4f50924ehkuang%ifdef PIC
37591037db265ecdd914a26e056cf69207b4f50924ehkuang  lea        bilin_filter, [bilin_filter_m]
37691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
37791037db265ecdd914a26e056cf69207b4f50924ehkuang  shl           y_offsetd, filter_idx_shift
37868e1c830ade592be74773e249bf94e2bbfb50de7Johann%if ARCH_X86_64 && %1 > 4
37991037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m8, [bilin_filter+y_offsetq]
38091037db265ecdd914a26e056cf69207b4f50924ehkuang%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
38191037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m9, [bilin_filter+y_offsetq+16]
38291037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
38391037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                m10, [pw_8]
38491037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_a m8
38591037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_b m9
38691037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd m10
38791037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; x86-32 or mmx
3889b35249446b07f40ac5fcc3205f2c048616efacchkuang%if ARCH_X86=1 && CONFIG_PIC=1
3899b35249446b07f40ac5fcc3205f2c048616efacchkuang; x_offset == 0, reuse x_offset reg
3909b35249446b07f40ac5fcc3205f2c048616efacchkuang%define tempq x_offsetq
3919b35249446b07f40ac5fcc3205f2c048616efacchkuang  add y_offsetq, g_bilin_filterm
3929b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_y_a [y_offsetq]
3939b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_y_b [y_offsetq+16]
3949b35249446b07f40ac5fcc3205f2c048616efacchkuang  mov tempq, g_pw_8m
3959b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_rnd [tempq]
3969b35249446b07f40ac5fcc3205f2c048616efacchkuang%else
39791037db265ecdd914a26e056cf69207b4f50924ehkuang  add           y_offsetq, bilin_filter
39891037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_a [y_offsetq]
39991037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_b [y_offsetq+16]
40091037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd [pw_8]
40191037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
4029b35249446b07f40ac5fcc3205f2c048616efacchkuang%endif
4039b35249446b07f40ac5fcc3205f2c048616efacchkuang
; Row loop for the x_offset == 0 / bilinear-y case. With taps filter_y_a and
; filter_y_b and rounding filter_rnd (aliases defined by the setup just above),
; each output pixel is (row*a + next_row*b + rnd) >> 4. On ssse3 the two rows
; are byte-interleaved and pmaddubsw is applied with filter_y_a alone, so that
; operand presumably carries both taps.
; NOTE(review): m5 is the second operand of every byte->word unpack here and is
; presumably all-zero; m6/m7 are handed to SUM_SSE as accumulators. Both are set
; up outside this excerpt -- confirm there.
40491037db265ecdd914a26e056cf69207b4f50924ehkuang.x_zero_y_other_loop:
40591037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16
; 16-wide: one row per iteration. m0 = source row, m4 = the row below it,
; m1 = dst row (loaded with the aligned mova).
40691037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m0, [srcq]
40791037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m4, [srcq+src_strideq]
40891037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m1, [dstq]
40991037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
; Interleave the two rows byte-wise (high half in m2, low half in m0) so one
; pmaddubsw per half produces the filtered words; then add the rounding term.
41091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m4
41191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m4
41291037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m2, filter_y_a
41391037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m0, filter_y_a
41491037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
41591037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
41691037db265ecdd914a26e056cf69207b4f50924ehkuang%else
; Without ssse3: unpack both rows to words against m5 (high halves in m2/m3,
; low halves in m0/m4) and apply the two taps with separate pmullw.
41791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
41891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m4, m5
41991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
42091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m4, m5
42191037db265ecdd914a26e056cf69207b4f50924ehkuang  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
42291037db265ecdd914a26e056cf69207b4f50924ehkuang  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
42391037db265ecdd914a26e056cf69207b4f50924ehkuang  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
42491037db265ecdd914a26e056cf69207b4f50924ehkuang  ; slightly faster because of pmullw latency. It would also cut our rodata
42591037db265ecdd914a26e056cf69207b4f50924ehkuang  ; tables in half for this function, and save 1-2 registers on x86-64.
42691037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m2, filter_y_a
42791037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m3, filter_y_b
42891037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
42991037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m0, filter_y_a
43091037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m4, filter_y_b
43191037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
43291037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, m3
43391037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, m4
43491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
; Scale the rounded sums back down (shift by 4; filter_rnd is pw_8).
43591037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
43691037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m0, 4
43791037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
; avg variant: repack the filtered row to bytes, average it with the 16 bytes
; at [secq], then unpack to words again (high half in m2, low half in m0).
43891037db265ecdd914a26e056cf69207b4f50924ehkuang  ; FIXME(rbultje) pipeline
43991037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m0, m2
44091037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
44191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
44291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
44391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
; Unpack the dst row (high half in m3, low half in m1) and pass the filtered
; source words (m0/m2) and dst words (m1/m3) to SUM_SSE.
44491037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m1, m5
44591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
44691037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
44791037db265ecdd914a26e056cf69207b4f50924ehkuang
; One row consumed per iteration.
44891037db265ecdd914a26e056cf69207b4f50924ehkuang  add                srcq, src_strideq
44991037db265ecdd914a26e056cf69207b4f50924ehkuang  add                dstq, dst_strideq
45091037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16
; Narrower blocks: two rows per iteration. m0/m2/m4 = three consecutive source
; rows, m1/m3 = the two dst rows. Output row 0 is filtered from m0/m2 and
; output row 1 from m2/m4. movx is a load macro defined outside this excerpt
; (presumably width-dependent).
45168e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m0, [srcq]
45268e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m2, [srcq+src_strideq]
45368e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m4, [srcq+src_strideq*2]
45468e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [dstq+dst_strideq]
45591037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
; ssse3: interleave each row with the one below it and filter with pmaddubsw.
45668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
45791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m2
45891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m2, m4
45991037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m0, filter_y_a
46091037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m2, filter_y_a
46191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
46291037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
46391037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
46491037db265ecdd914a26e056cf69207b4f50924ehkuang%else
; Without ssse3: unpack the three rows to words; m1 is used as scratch for the
; middle row's filter_y_b product, so the first dst row is only loaded into m1
; after that product has been added into m0.
46591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
46691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m2, m5
46791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m4, m5
46891037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m0, filter_y_a
46991037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m1, m2, filter_y_b
47091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
47191037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
47291037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m2, filter_y_a
47391037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m4, filter_y_b
47491037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, m1
47591037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
47668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
47791037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, m4
47891037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
47991037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m0, 4
48091037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
48191037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
48291037db265ecdd914a26e056cf69207b4f50924ehkuang  ; FIXME(rbultje) pipeline
48368e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 == 4
; 4-wide: copy the second row's four words into the upper half of m0 so that,
; after packing, both rows' bytes sit together in the low 8 bytes.
48468e1c830ade592be74773e249bf94e2bbfb50de7Johann  movlhps              m0, m2
48568e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
48691037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m0, m2
48768e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
; Both packed rows are averaged with 16 bytes at [secq], then split back into
; words: first row in m0, second row in m2.
48891037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
48991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
49091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
49168e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh
; 4-wide: a single 8-byte load from secq is averaged with both rows; after the
; unpack, movhlps moves the second row's words from the top of m0 into m2.
49268e1c830ade592be74773e249bf94e2bbfb50de7Johann  movh                 m2, [secq]
49368e1c830ade592be74773e249bf94e2bbfb50de7Johann  pavgb                m0, m2
49468e1c830ade592be74773e249bf94e2bbfb50de7Johann  punpcklbw            m0, m5
49568e1c830ade592be74773e249bf94e2bbfb50de7Johann  movhlps              m2, m0
49668e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
49791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
49891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
49991037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
50091037db265ecdd914a26e056cf69207b4f50924ehkuang
; Two rows consumed per iteration.
50191037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                srcq, [srcq+src_strideq*2]
50291037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                dstq, [dstq+dst_strideq*2]
50391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
50491037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
; Step the secq pointer by sec_str for the next iteration.
50591037db265ecdd914a26e056cf69207b4f50924ehkuang  add                secq, sec_str
50691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
; Count down block_height and repeat while the decremented value is > 0.
5077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  dec                   block_height
50891037db265ecdd914a26e056cf69207b4f50924ehkuang  jg .x_zero_y_other_loop
; Drop the filter aliases so later sections can define their own, then finish
; through STORE_AND_RET (macro defined outside this excerpt).
50991037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_y_a
51091037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_y_b
51191037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_rnd
51268e1c830ade592be74773e249bf94e2bbfb50de7Johann  STORE_AND_RET %1
51391037db265ecdd914a26e056cf69207b4f50924ehkuang
; Dispatch for the x_offset != 0 cases (per the label name; the jump to this
; label is outside this excerpt). An x_offset of 4 is the half-pel position:
; anything else branches to .x_nonhalf. Within the half-pel case a non-zero
; y_offset branches to .x_half_y_nonzero, while y_offset == 0 falls through to
; the code that follows.
51491037db265ecdd914a26e056cf69207b4f50924ehkuang.x_nonzero:
51568e1c830ade592be74773e249bf94e2bbfb50de7Johann  cmp           x_offsetd, 4
51691037db265ecdd914a26e056cf69207b4f50924ehkuang  jne .x_nonhalf
51791037db265ecdd914a26e056cf69207b4f50924ehkuang  ; x_offset == 0.5
51891037db265ecdd914a26e056cf69207b4f50924ehkuang  test          y_offsetd, y_offsetd
51991037db265ecdd914a26e056cf69207b4f50924ehkuang  jnz .x_half_y_nonzero
52091037db265ecdd914a26e056cf69207b4f50924ehkuang
52191037db265ecdd914a26e056cf69207b4f50924ehkuang  ; x_offset == 0.5 && y_offset == 0
; Row loop for this case: the horizontal half-pel sample is the byte average
; (pavgb) of each source pixel and its right-hand neighbour ([srcq] and
; [srcq+1]); no vertical filtering is applied.
; NOTE(review): m5 is the second operand of every byte->word unpack and is
; presumably all-zero; m6/m7 are the accumulators given to SUM_SSE. Both are
; set up outside this excerpt -- confirm there.
52291037db265ecdd914a26e056cf69207b4f50924ehkuang.x_half_y_zero_loop:
52391037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16
; 16-wide: one row per iteration. m0 = averaged source row, m1 = dst row.
52491037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m0, [srcq]
52591037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m4, [srcq+1]
52691037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m1, [dstq]
52791037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, m4
52891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m1, m5
52991037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
; avg variant: additionally average with the 16 bytes at [secq].
53091037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
53191037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
; Unpack to words: source in m0 (low) / m2 (high), dst in m1 (low) / m3 (high).
53291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
53391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
53491037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
53591037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
53691037db265ecdd914a26e056cf69207b4f50924ehkuang
53791037db265ecdd914a26e056cf69207b4f50924ehkuang  add                srcq, src_strideq
53891037db265ecdd914a26e056cf69207b4f50924ehkuang  add                dstq, dst_strideq
53991037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16
; Narrower blocks: two rows per iteration. movx is a load macro defined
; outside this excerpt (presumably width-dependent).
54068e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m0, [srcq]
54168e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m4, [srcq+1]
54291037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
; avg variant: both rows are gathered into one register so a single pavgb
; against the data at secq covers them.
54368e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
; Second row (and its +1 neighbour) goes into the upper 8 bytes.
54491037db265ecdd914a26e056cf69207b4f50924ehkuang  movhps               m0, [srcq+src_strideq]
54591037db265ecdd914a26e056cf69207b4f50924ehkuang  movhps               m4, [srcq+src_strideq+1]
54668e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh
; 4-wide: place the second row's dword directly after the first row's dword.
54768e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [srcq+src_strideq]
54868e1c830ade592be74773e249bf94e2bbfb50de7Johann  punpckldq            m0, m1
54968e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m2, [srcq+src_strideq+1]
55068e1c830ade592be74773e249bf94e2bbfb50de7Johann  punpckldq            m4, m2
55168e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
55268e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
55368e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [dstq+dst_strideq]
55491037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, m4
55591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
55668e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
; Average with 16 bytes at [secq], then split: first row's words in m0,
; second row's words in m2.
55791037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
55891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
55991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
56091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
56168e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh
; 4-wide: average with an 8-byte load from secq; after the unpack, movhlps
; moves the second row's words from the top of m0 into m2.
56268e1c830ade592be74773e249bf94e2bbfb50de7Johann  movh                 m2, [secq]
56368e1c830ade592be74773e249bf94e2bbfb50de7Johann  pavgb                m0, m2
56468e1c830ade592be74773e249bf94e2bbfb50de7Johann  punpcklbw            m1, m5
56568e1c830ade592be74773e249bf94e2bbfb50de7Johann  punpcklbw            m0, m5
56668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movhlps              m2, m0
56768e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
56891037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; !avg
; Non-avg variant: the two rows stay in separate registers (m0 and m2); m4 is
; reused for each row's +1 neighbour.
56968e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m2, [srcq+src_strideq]
57068e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
57191037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, m4
57268e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m4, [srcq+src_strideq+1]
57368e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [dstq+dst_strideq]
57491037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m2, m4
57591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
57691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m2, m5
57791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
57891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
57991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
58091037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
58191037db265ecdd914a26e056cf69207b4f50924ehkuang
; Two rows consumed per iteration.
58291037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                srcq, [srcq+src_strideq*2]
58391037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                dstq, [dstq+dst_strideq*2]
58491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
58591037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
58691037db265ecdd914a26e056cf69207b4f50924ehkuang  add                secq, sec_str
58791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
; Count down block_height and repeat while the decremented value is > 0, then
; finish through STORE_AND_RET (macro defined outside this excerpt).
5887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  dec                   block_height
58991037db265ecdd914a26e056cf69207b4f50924ehkuang  jg .x_half_y_zero_loop
59068e1c830ade592be74773e249bf94e2bbfb50de7Johann  STORE_AND_RET %1
59191037db265ecdd914a26e056cf69207b4f50924ehkuang
; x_offset is half-pel and y_offset is non-zero: a y_offset of 4 (half-pel)
; falls through to the code that follows; any other value branches to
; .x_half_y_nonhalf.
59291037db265ecdd914a26e056cf69207b4f50924ehkuang.x_half_y_nonzero:
59368e1c830ade592be74773e249bf94e2bbfb50de7Johann  cmp           y_offsetd, 4
59491037db265ecdd914a26e056cf69207b4f50924ehkuang  jne .x_half_y_nonhalf
59591037db265ecdd914a26e056cf69207b4f50924ehkuang
59691037db265ecdd914a26e056cf69207b4f50924ehkuang  ; x_offset == 0.5 && y_offset == 0.5
; Both directions are half-pel, so the filter is two chained byte averages:
; each row is first averaged with its right-hand neighbour ([srcq]/[srcq+1]),
; and the result is then averaged with the same quantity from the row below.
; The horizontally averaged "previous" row is kept in m0 across iterations
; (computed once before the loop, refreshed from m4 at the end of each pass).
; NOTE(review): m5 is the second operand of every byte->word unpack and is
; presumably all-zero; m6/m7 are the accumulators given to SUM_SSE. Both are
; set up outside this excerpt -- confirm there.
59791037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16
; 16-wide prologue: m0 = horizontal average of the first row; step to row 1.
59891037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m0, [srcq]
59991037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m3, [srcq+1]
60091037db265ecdd914a26e056cf69207b4f50924ehkuang  add                srcq, src_strideq
60191037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, m3
60291037db265ecdd914a26e056cf69207b4f50924ehkuang.x_half_y_half_loop:
; One row per iteration: m4 = horizontal average of the current row, then
; m0 = average of the previous row's value (m0) and m4.
60391037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m4, [srcq]
60491037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m3, [srcq+1]
60591037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m1, [dstq]
60691037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m4, m3
60791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m1, m5
60891037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, m4
60991037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
; avg variant: additionally average with the 16 bytes at [secq].
61091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
61191037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
61291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
61391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
61491037db265ecdd914a26e056cf69207b4f50924ehkuang%else
61591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
61691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
61791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
61891037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
61991037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
; Carry the current row's horizontal average into the next iteration.
62091037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m0, m4
62191037db265ecdd914a26e056cf69207b4f50924ehkuang
62291037db265ecdd914a26e056cf69207b4f50924ehkuang  add                srcq, src_strideq
62391037db265ecdd914a26e056cf69207b4f50924ehkuang  add                dstq, dst_strideq
62491037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16
; Narrower blocks: same prologue, but the loop handles two rows per iteration.
; movx is a load macro defined outside this excerpt (presumably
; width-dependent).
62568e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m0, [srcq]
62668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [srcq+1]
62791037db265ecdd914a26e056cf69207b4f50924ehkuang  add                srcq, src_strideq
62891037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, m3
62991037db265ecdd914a26e056cf69207b4f50924ehkuang.x_half_y_half_loop:
63068e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m2, [srcq]
63168e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [srcq+1]
63291037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
; avg variant: pack the current row and the row below into one register
; (m2, with the +1 neighbours in m3) so both are averaged at once.
63368e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
63491037db265ecdd914a26e056cf69207b4f50924ehkuang  movhps               m2, [srcq+src_strideq]
63591037db265ecdd914a26e056cf69207b4f50924ehkuang  movhps               m3, [srcq+src_strideq+1]
63691037db265ecdd914a26e056cf69207b4f50924ehkuang%else
63768e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [srcq+src_strideq]
6381184aebb761cbeac9124c37189a80a1a58f04b6bhkuang  punpckldq            m2, m1
63968e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [srcq+src_strideq+1]
6401184aebb761cbeac9124c37189a80a1a58f04b6bhkuang  punpckldq            m3, m1
6411184aebb761cbeac9124c37189a80a1a58f04b6bhkuang%endif
64291037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m2, m3
; Build m0 = {previous row, current row} to pair against m2 = {current row,
; row below}, and save the row below's horizontal average in m4 as the next
; iteration's "previous" row.
64368e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
64491037db265ecdd914a26e056cf69207b4f50924ehkuang  movlhps              m0, m2
64591037db265ecdd914a26e056cf69207b4f50924ehkuang  movhlps              m4, m2
64668e1c830ade592be74773e249bf94e2bbfb50de7Johann%else ; 4xh
; 4-wide: same arrangement at dword granularity; pshuflw with 0xe brings the
; second dword of m2 down to the lowest dword of m4.
64791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckldq            m0, m2
64868e1c830ade592be74773e249bf94e2bbfb50de7Johann  pshuflw              m4, m2, 0xe
64991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
65068e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
; Vertical average for both output rows in one instruction.
65191037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, m2
65268e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [dstq+dst_strideq]
; Average with the data at secq (16 bytes, or an 8-byte load for 4-wide).
65368e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
65491037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
65568e1c830ade592be74773e249bf94e2bbfb50de7Johann%else
65668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movh                 m2, [secq]
65768e1c830ade592be74773e249bf94e2bbfb50de7Johann  pavgb                m0, m2
65868e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
65991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
66091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
; Split the result into words: first output row in m0, second in m2.
66168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
66291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
66391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
66468e1c830ade592be74773e249bf94e2bbfb50de7Johann%else
66568e1c830ade592be74773e249bf94e2bbfb50de7Johann  punpcklbw            m0, m5
66668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movhlps              m2, m0
66768e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
66891037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; !avg
; Non-avg variant: rows stay in separate registers. m2 = horizontal average of
; the current row, m4 = horizontal average of the row below; then
; m0 = avg(previous, current) and m2 = avg(current, below).
66968e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m4, [srcq+src_strideq]
67068e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [srcq+src_strideq+1]
67191037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m2, m3
67291037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m4, m1
67391037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, m2
67491037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m2, m4
67568e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
67668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [dstq+dst_strideq]
67791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
67891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m2, m5
67991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
68091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
68191037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
68291037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
; Carry the last row's horizontal average into the next iteration.
68391037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m0, m4
68491037db265ecdd914a26e056cf69207b4f50924ehkuang
; Two rows consumed per iteration.
68591037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                srcq, [srcq+src_strideq*2]
68691037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                dstq, [dstq+dst_strideq*2]
68791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
68891037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
68991037db265ecdd914a26e056cf69207b4f50924ehkuang  add                secq, sec_str
69091037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
; Count down block_height and repeat while the decremented value is > 0, then
; finish through STORE_AND_RET (macro defined outside this excerpt).
6917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  dec                   block_height
69291037db265ecdd914a26e056cf69207b4f50924ehkuang  jg .x_half_y_half_loop
69368e1c830ade592be74773e249bf94e2bbfb50de7Johann  STORE_AND_RET %1
69491037db265ecdd914a26e056cf69207b4f50924ehkuang
; Filter setup for the half-pel-x / bilinear-y path. It defines three aliases
; used by the code that follows: filter_y_a and filter_y_b (the vertical filter
; operands selected by y_offset) and filter_rnd (the pw_8 rounding constant).
69591037db265ecdd914a26e056cf69207b4f50924ehkuang.x_half_y_nonhalf:
69691037db265ecdd914a26e056cf69207b4f50924ehkuang  ; x_offset == 0.5 && y_offset == bilin interpolation
69791037db265ecdd914a26e056cf69207b4f50924ehkuang%ifdef PIC
69891037db265ecdd914a26e056cf69207b4f50924ehkuang  lea        bilin_filter, [bilin_filter_m]
69991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
; Scale y_offset into an offset within the filter table (filter_idx_shift is
; defined outside this excerpt).
70091037db265ecdd914a26e056cf69207b4f50924ehkuang  shl           y_offsetd, filter_idx_shift
70168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if ARCH_X86_64 && %1 > 4
; x86-64 with blocks wider than 4: keep the filter and rounding constants in
; m8-m10. m9 is loaded only without ssse3; in the code visible in this excerpt
; only the non-ssse3 paths reference filter_y_b.
70291037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m8, [bilin_filter+y_offsetq]
70391037db265ecdd914a26e056cf69207b4f50924ehkuang%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
70491037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m9, [bilin_filter+y_offsetq+16]
70591037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
70691037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                m10, [pw_8]
70791037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_a m8
70891037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_b m9
70991037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd m10
7109b35249446b07f40ac5fcc3205f2c048616efacchkuang%else  ;x86_32
; Otherwise the aliases are memory operands: y_offsetq is turned into a pointer
; to the selected filter entry.
7119b35249446b07f40ac5fcc3205f2c048616efacchkuang%if ARCH_X86=1 && CONFIG_PIC=1
; 32-bit PIC build: the table and rounding-constant addresses are taken from
; g_bilin_filterm / g_pw_8m (defined outside this excerpt), with x_offsetq
; reused as the pointer register for the rounding constant.
7129b35249446b07f40ac5fcc3205f2c048616efacchkuang; x_offset == 0.5. We can reuse x_offset reg
7139b35249446b07f40ac5fcc3205f2c048616efacchkuang%define tempq x_offsetq
7149b35249446b07f40ac5fcc3205f2c048616efacchkuang  add y_offsetq, g_bilin_filterm
7159b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_y_a [y_offsetq]
7169b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_y_b [y_offsetq+16]
7179b35249446b07f40ac5fcc3205f2c048616efacchkuang  mov tempq, g_pw_8m
7189b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_rnd [tempq]
71991037db265ecdd914a26e056cf69207b4f50924ehkuang%else
72091037db265ecdd914a26e056cf69207b4f50924ehkuang  add           y_offsetq, bilin_filter
72191037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_a [y_offsetq]
72291037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_b [y_offsetq+16]
72391037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd [pw_8]
72491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
7259b35249446b07f40ac5fcc3205f2c048616efacchkuang%endif
7269b35249446b07f40ac5fcc3205f2c048616efacchkuang
72791037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16
72891037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m0, [srcq]
72991037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m3, [srcq+1]
73091037db265ecdd914a26e056cf69207b4f50924ehkuang  add                srcq, src_strideq
73191037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, m3
73291037db265ecdd914a26e056cf69207b4f50924ehkuang.x_half_y_other_loop:
73391037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m4, [srcq]
73491037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m2, [srcq+1]
73591037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m1, [dstq]
73691037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m4, m2
73791037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
73891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m4
73991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m4
74091037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m2, filter_y_a
74191037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m0, filter_y_a
74291037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
74391037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
74491037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
74591037db265ecdd914a26e056cf69207b4f50924ehkuang%else
74691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
74791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m4, m5
74891037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m2, filter_y_a
74991037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m3, filter_y_b
75091037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
75191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
75291037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, m3
75391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m4, m5
75491037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m0, filter_y_a
75591037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m3, filter_y_b
75691037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
75791037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
75891037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, m3
75991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
76091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m1, m5
76191037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m0, 4
76291037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
76391037db265ecdd914a26e056cf69207b4f50924ehkuang  ; FIXME(rbultje) pipeline
76491037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m0, m2
76591037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
76691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
76791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
76891037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
76991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
77091037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
77191037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m0, m4
77291037db265ecdd914a26e056cf69207b4f50924ehkuang
77391037db265ecdd914a26e056cf69207b4f50924ehkuang  add                srcq, src_strideq
77491037db265ecdd914a26e056cf69207b4f50924ehkuang  add                dstq, dst_strideq
77591037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16
77668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m0, [srcq]
77768e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [srcq+1]
77891037db265ecdd914a26e056cf69207b4f50924ehkuang  add                srcq, src_strideq
77991037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, m3
78091037db265ecdd914a26e056cf69207b4f50924ehkuang%if notcpuflag(ssse3)
78191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
78291037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
78391037db265ecdd914a26e056cf69207b4f50924ehkuang.x_half_y_other_loop:
78468e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m2, [srcq]
78568e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [srcq+1]
78668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m4, [srcq+src_strideq]
78768e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [srcq+src_strideq+1]
78891037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m2, m1
78991037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m4, m3
79068e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [dstq+dst_strideq]
79191037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
79268e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
79391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m2
79491037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m2, m4
79591037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m0, filter_y_a
79691037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m2, filter_y_a
79791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
79891037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
79991037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
80091037db265ecdd914a26e056cf69207b4f50924ehkuang%else
80191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m2, m5
80291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m4, m5
80391037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m0, filter_y_a
80491037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m1, m2, filter_y_b
80591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
80691037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
80791037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m2, filter_y_a
80891037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, m1
80991037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m1, m4, filter_y_b
81091037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
81191037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, m1
81268e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
81391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
81491037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m0, 4
81591037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
81691037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
81791037db265ecdd914a26e056cf69207b4f50924ehkuang  ; FIXME(rbultje) pipeline
81868e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 == 4
81968e1c830ade592be74773e249bf94e2bbfb50de7Johann  movlhps              m0, m2
82068e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
82191037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m0, m2
82268e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
82391037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
82491037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
82591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
82668e1c830ade592be74773e249bf94e2bbfb50de7Johann%else
82768e1c830ade592be74773e249bf94e2bbfb50de7Johann  movh                 m2, [secq]
82868e1c830ade592be74773e249bf94e2bbfb50de7Johann  pavgb                m0, m2
82968e1c830ade592be74773e249bf94e2bbfb50de7Johann  punpcklbw            m0, m5
83068e1c830ade592be74773e249bf94e2bbfb50de7Johann  movhlps              m2, m0
83168e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
83291037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
83391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
83491037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
83591037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m0, m4
83691037db265ecdd914a26e056cf69207b4f50924ehkuang
83791037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                srcq, [srcq+src_strideq*2]
83891037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                dstq, [dstq+dst_strideq*2]
83991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
84091037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
84191037db265ecdd914a26e056cf69207b4f50924ehkuang  add                secq, sec_str
84291037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
8437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  dec                   block_height
84491037db265ecdd914a26e056cf69207b4f50924ehkuang  jg .x_half_y_other_loop
84591037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_y_a
84691037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_y_b
84791037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_rnd
84868e1c830ade592be74773e249bf94e2bbfb50de7Johann  STORE_AND_RET %1
84991037db265ecdd914a26e056cf69207b4f50924ehkuang
85091037db265ecdd914a26e056cf69207b4f50924ehkuang.x_nonhalf:
; Bilinear-x entry point. A nonzero y_offset is handed to
; .x_nonhalf_y_nonzero; y_offset == 0 falls through to the horizontal-only
; filter below.
85191037db265ecdd914a26e056cf69207b4f50924ehkuang  test          y_offsetd, y_offsetd
85291037db265ecdd914a26e056cf69207b4f50924ehkuang  jnz .x_nonhalf_y_nonzero
85391037db265ecdd914a26e056cf69207b4f50924ehkuang
85491037db265ecdd914a26e056cf69207b4f50924ehkuang  ; x_offset == bilin interpolation && y_offset == 0
; Filter setup: x_offset is scaled by filter_idx_shift so it can index the
; bilinear filter table, then filter_x_a / filter_x_b (the tap pair) and
; filter_rnd (pw_8, added before the arithmetic shift right by 4) are bound
; to either registers or memory operands.
85591037db265ecdd914a26e056cf69207b4f50924ehkuang%ifdef PIC
85691037db265ecdd914a26e056cf69207b4f50924ehkuang  lea        bilin_filter, [bilin_filter_m]
85791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
85891037db265ecdd914a26e056cf69207b4f50924ehkuang  shl           x_offsetd, filter_idx_shift
; x86-64 with a block wider than 4: keep taps and rounding in m8-m10.
; m9 (filter_x_b) is only loaded for non-SSSE3 builds; the SSSE3 code below
; uses filter_x_a alone with pmaddubsw.
85968e1c830ade592be74773e249bf94e2bbfb50de7Johann%if ARCH_X86_64 && %1 > 4
86091037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m8, [bilin_filter+x_offsetq]
86191037db265ecdd914a26e056cf69207b4f50924ehkuang%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
86291037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m9, [bilin_filter+x_offsetq+16]
86391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
86491037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                m10, [pw_8]
86591037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_a m8
86691037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_b m9
86791037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd m10
; NOTE: this branch is also assembled on x86-64 when the block width is 4,
; because the condition above requires a width greater than 4.
8689b35249446b07f40ac5fcc3205f2c048616efacchkuang%else    ; x86-32
8699b35249446b07f40ac5fcc3205f2c048616efacchkuang%if ARCH_X86=1 && CONFIG_PIC=1
8709b35249446b07f40ac5fcc3205f2c048616efacchkuang; y_offset == 0 here, so the y_offset register is free to reuse as a temporary.
8719b35249446b07f40ac5fcc3205f2c048616efacchkuang%define tempq y_offsetq
8729b35249446b07f40ac5fcc3205f2c048616efacchkuang  add x_offsetq, g_bilin_filterm
8739b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_x_a [x_offsetq]
8749b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_x_b [x_offsetq+16]
; tempq receives the value of g_pw_8m and is then dereferenced as the rounding
; constant (g_bilin_filterm / g_pw_8m are defined outside this view).
8759b35249446b07f40ac5fcc3205f2c048616efacchkuang  mov tempq, g_pw_8m
8769b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_rnd [tempq]
87791037db265ecdd914a26e056cf69207b4f50924ehkuang%else
; Non-PIC (or non-x86-32) fallback: x_offsetq becomes a direct pointer to the
; selected tap pair and pw_8 is addressed directly.
87891037db265ecdd914a26e056cf69207b4f50924ehkuang  add           x_offsetq, bilin_filter
87991037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_a [x_offsetq]
88091037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_b [x_offsetq+16]
88191037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd [pw_8]
88291037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
8839b35249446b07f40ac5fcc3205f2c048616efacchkuang%endif
8849b35249446b07f40ac5fcc3205f2c048616efacchkuang
88591037db265ecdd914a26e056cf69207b4f50924ehkuang.x_other_y_zero_loop:
; 16-wide blocks: one row per iteration.
; m0 = source row, m4 = source row starting one byte later, m1 = dst row.
88691037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16
88791037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m0, [srcq]
88891037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m4, [srcq+1]
88991037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m1, [dstq]
89091037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
; SSSE3: interleave neighbouring source bytes and let pmaddubsw multiply-add
; each pair against filter_x_a (presumably interleaved byte taps in the SSSE3
; table, which is defined outside this view), then add the rounding term.
89191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m4
89291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m4
89391037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m2, filter_x_a
89491037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m0, filter_x_a
89591037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
89691037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
89791037db265ecdd914a26e056cf69207b4f50924ehkuang%else
; Non-SSSE3: widen both inputs to words by unpacking against m5 (assumed to
; hold zero; it is set up outside this view), multiply each by its tap with
; pmullw, then sum the two products plus the rounding term.
; m2/m3 carry the high halves, m0/m4 the low halves.
89891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
89991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m4, m5
90091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
90191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m4, m5
90291037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m2, filter_x_a
90391037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m3, filter_x_b
90491037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
90591037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m0, filter_x_a
90691037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m4, filter_x_b
90791037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
90891037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, m3
90991037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, m4
91091037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
; Scale the rounded sums back down (shift right by 4).
91191037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
91291037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m0, 4
91391037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
91491037db265ecdd914a26e056cf69207b4f50924ehkuang  ; FIXME(rbultje) pipeline
; avg variant: repack the filtered row to bytes, average it with the row at
; [secq], then widen back to words for the difference computation.
91591037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m0, m2
91691037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
91791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
91891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
91991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
; Widen dst (high half into m3, low half in m1) and hand both operand pairs to
; SUM_SSE with m6/m7 as its last two arguments (macro defined outside this
; view; presumably the running sum and SSE accumulators - confirm).
92091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m1, m5
92191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
92291037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
92391037db265ecdd914a26e056cf69207b4f50924ehkuang
92491037db265ecdd914a26e056cf69207b4f50924ehkuang  add                srcq, src_strideq
92591037db265ecdd914a26e056cf69207b4f50924ehkuang  add                dstq, dst_strideq
92691037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16
; Narrower blocks: two rows per iteration.
; m0/m1 = first source row and its one-byte-shifted copy, m2/m4 = the same for
; the next row, m3 = second dst row (the first dst row is loaded into m1 later,
; once m1 is no longer needed as a source operand).
92768e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m0, [srcq]
92868e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [srcq+1]
92968e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m2, [srcq+src_strideq]
93068e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m4, [srcq+src_strideq+1]
93168e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [dstq+dst_strideq]
93291037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
93391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m1
93468e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
93591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m2, m4
93691037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m0, filter_x_a
93791037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m2, filter_x_a
93891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
93991037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
94091037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
94191037db265ecdd914a26e056cf69207b4f50924ehkuang%else
94291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
94391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
94491037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m2, m5
94591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m4, m5
94691037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m0, filter_x_a
94791037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m1, filter_x_b
94891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
94991037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
95091037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m2, filter_x_a
95191037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m4, filter_x_b
95291037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, m1
95391037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
95468e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
95591037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, m4
95691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
95791037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m0, 4
95891037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
95991037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
96091037db265ecdd914a26e056cf69207b4f50924ehkuang  ; FIXME(rbultje) pipeline
; Width 4 only: copy the low half of m2 (second row) into the high half of m0
; so that the pack below places both filtered rows in the low 8 bytes of m0.
96168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 == 4
96268e1c830ade592be74773e249bf94e2bbfb50de7Johann  movlhps              m0, m2
96368e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
96491037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m0, m2
96568e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
; Wider than 4: both rows are averaged against a full register load from
; [secq], then split back into low (m0) and high (m2) word halves.
96691037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
96791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
96891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
96968e1c830ade592be74773e249bf94e2bbfb50de7Johann%else
; Width 4: load the second predictor with movh, average, widen the low bytes
; to words, then move the high half of m0 (second row) back into m2.
97068e1c830ade592be74773e249bf94e2bbfb50de7Johann  movh                 m2, [secq]
97168e1c830ade592be74773e249bf94e2bbfb50de7Johann  pavgb                m0, m2
97268e1c830ade592be74773e249bf94e2bbfb50de7Johann  punpcklbw            m0, m5
97368e1c830ade592be74773e249bf94e2bbfb50de7Johann  movhlps              m2, m0
97468e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
97591037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
; Widen the first dst row (the second, m3, was widened above) and accumulate.
97691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
97791037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
97891037db265ecdd914a26e056cf69207b4f50924ehkuang
; Step both pointers past the two rows just processed.
97991037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                srcq, [srcq+src_strideq*2]
98091037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                dstq, [dstq+dst_strideq*2]
98191037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
98291037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
98391037db265ecdd914a26e056cf69207b4f50924ehkuang  add                secq, sec_str
98491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
; Loop while the decremented counter stays positive. block_height is set up
; outside this view; presumably it counts loop iterations rather than rows for
; the two-rows-per-iteration path - confirm against the prologue.
9857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  dec                   block_height
98691037db265ecdd914a26e056cf69207b4f50924ehkuang  jg .x_other_y_zero_loop
; Drop this section's filter aliases so later sections can rebind them.
98791037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_x_a
98891037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_x_b
98991037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_rnd
99068e1c830ade592be74773e249bf94e2bbfb50de7Johann  STORE_AND_RET %1
99191037db265ecdd914a26e056cf69207b4f50924ehkuang
99291037db265ecdd914a26e056cf69207b4f50924ehkuang.x_nonhalf_y_nonzero:
; Bilinear x with a nonzero y_offset. Only y_offset == 4 (the half position)
; is handled here; any other value goes to .x_nonhalf_y_nonhalf.
99368e1c830ade592be74773e249bf94e2bbfb50de7Johann  cmp           y_offsetd, 4
99491037db265ecdd914a26e056cf69207b4f50924ehkuang  jne .x_nonhalf_y_nonhalf
99591037db265ecdd914a26e056cf69207b4f50924ehkuang
99691037db265ecdd914a26e056cf69207b4f50924ehkuang  ; x_offset == bilin interpolation && y_offset == 0.5
; Filter setup: x_offset is scaled by filter_idx_shift so it can index the
; bilinear filter table, then filter_x_a / filter_x_b (the tap pair) and
; filter_rnd (pw_8, added before the arithmetic shift right by 4) are bound
; to either registers or memory operands. No vertical taps are needed: the
; vertical half-pel step is done with pavgb / pavgw in the loops below.
99791037db265ecdd914a26e056cf69207b4f50924ehkuang%ifdef PIC
99891037db265ecdd914a26e056cf69207b4f50924ehkuang  lea        bilin_filter, [bilin_filter_m]
99991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
100091037db265ecdd914a26e056cf69207b4f50924ehkuang  shl           x_offsetd, filter_idx_shift
; x86-64 with a block wider than 4: keep taps and rounding in m8-m10.
; m9 (filter_x_b) is only loaded for non-SSSE3 builds; the SSSE3 code below
; uses filter_x_a alone with pmaddubsw.
100168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if ARCH_X86_64 && %1 > 4
100291037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m8, [bilin_filter+x_offsetq]
100391037db265ecdd914a26e056cf69207b4f50924ehkuang%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
100491037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m9, [bilin_filter+x_offsetq+16]
100591037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
100691037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                m10, [pw_8]
100791037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_a m8
100891037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_b m9
100991037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd m10
; NOTE: this branch is also assembled on x86-64 when the block width is 4,
; because the condition above requires a width greater than 4.
10109b35249446b07f40ac5fcc3205f2c048616efacchkuang%else    ; x86-32
10119b35249446b07f40ac5fcc3205f2c048616efacchkuang%if ARCH_X86=1 && CONFIG_PIC=1
10129b35249446b07f40ac5fcc3205f2c048616efacchkuang; y_offset == 0.5 was already tested above, so the y_offset register can be reused.
10139b35249446b07f40ac5fcc3205f2c048616efacchkuang%define tempq y_offsetq
10149b35249446b07f40ac5fcc3205f2c048616efacchkuang  add x_offsetq, g_bilin_filterm
10159b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_x_a [x_offsetq]
10169b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_x_b [x_offsetq+16]
; tempq receives the value of g_pw_8m and is then dereferenced as the rounding
; constant (g_bilin_filterm / g_pw_8m are defined outside this view).
10179b35249446b07f40ac5fcc3205f2c048616efacchkuang  mov tempq, g_pw_8m
10189b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_rnd [tempq]
101991037db265ecdd914a26e056cf69207b4f50924ehkuang%else
102091037db265ecdd914a26e056cf69207b4f50924ehkuang  add           x_offsetq, bilin_filter
102191037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_a [x_offsetq]
102291037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_b [x_offsetq+16]
102391037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd [pw_8]
102491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
10259b35249446b07f40ac5fcc3205f2c048616efacchkuang%endif
10269b35249446b07f40ac5fcc3205f2c048616efacchkuang
; 16-wide blocks. Prologue: horizontally filter the first source row and leave
; it packed as bytes in m0. The loop then filters one new row per iteration,
; averages it with the previous row, and carries the new row forward.
102791037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16
102891037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m0, [srcq]
102991037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m1, [srcq+1]
103091037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
; SSSE3: interleave neighbouring source bytes and multiply-add each pair
; against filter_x_a (presumably interleaved byte taps in the SSSE3 table,
; which is defined outside this view).
103191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m1
103291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m1
103391037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m2, filter_x_a
103491037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m0, filter_x_a
103591037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
103691037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
103791037db265ecdd914a26e056cf69207b4f50924ehkuang%else
; Non-SSSE3: widen to words against m5 (assumed to hold zero; it is set up
; outside this view) and apply each tap with pmullw.
103891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
103991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m1, m5
104091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
104191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
104291037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m0, filter_x_a
104391037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m1, filter_x_b
104491037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
104591037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m2, filter_x_a
104691037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m3, filter_x_b
104791037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
104891037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, m1
104991037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, m3
105091037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
105191037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m0, 4
105291037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
105391037db265ecdd914a26e056cf69207b4f50924ehkuang  add                srcq, src_strideq
; m0 now holds the filtered first row as 16 bytes.
105491037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m0, m2
105591037db265ecdd914a26e056cf69207b4f50924ehkuang.x_other_y_half_loop:
; Loop invariant on entry: m0 = previous horizontally filtered row (bytes).
105691037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m4, [srcq]
105791037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m3, [srcq+1]
105891037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
105991037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m1, [dstq]
106091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m4, m3
106191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m4, m3
106291037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m2, filter_x_a
106391037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m4, filter_x_a
106491037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
106591037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m4, filter_rnd
106691037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
106791037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m4, 4
; m4 = current filtered row as bytes; averaging it into m0 gives the vertical
; half-pel result. m4 itself is kept intact for the next iteration.
106891037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m4, m2
106991037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, m4
107091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m1, m5
107191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
107291037db265ecdd914a26e056cf69207b4f50924ehkuang%else
107391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m4, m5
107491037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m1, m3, m5
107591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m4, m5
107691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
107791037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m4, filter_x_a
107891037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m3, filter_x_b
107991037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m4, filter_rnd
108091037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m2, filter_x_a
108191037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m1, filter_x_b
108291037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
108391037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m4, m3
108491037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, m1
; m1 is free again once its product has been folded into m2, so the dst row is
; loaded only at this point.
108591037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m1, [dstq]
108691037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m4, 4
108791037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
108891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m1, m5
108991037db265ecdd914a26e056cf69207b4f50924ehkuang  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
109091037db265ecdd914a26e056cf69207b4f50924ehkuang  ; have a 1-register shortage to be able to store the backup of the bilin
109191037db265ecdd914a26e056cf69207b4f50924ehkuang  ; filtered second line as words as cache for the next line. Packing into
109291037db265ecdd914a26e056cf69207b4f50924ehkuang  ; a byte costs 1 pack and 2 unpacks, but saves a register.
109391037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m4, m2
109491037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
109591037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, m4
109691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
109791037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
109891037db265ecdd914a26e056cf69207b4f50924ehkuang  ; FIXME(rbultje) pipeline
; avg variant: additionally average the prediction with the row at [secq].
109991037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
110091037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
; Widen the prediction to words (high half in m2, low half in m0) and hand it
; with the widened dst row (m1/m3) to SUM_SSE; m6/m7 are passed as its last two
; arguments (macro defined outside this view; presumably the accumulators).
110191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
110291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
110391037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
; Carry the current filtered row into the next iteration as the previous row.
110491037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m0, m4
110591037db265ecdd914a26e056cf69207b4f50924ehkuang
110691037db265ecdd914a26e056cf69207b4f50924ehkuang  add                srcq, src_strideq
110791037db265ecdd914a26e056cf69207b4f50924ehkuang  add                dstq, dst_strideq
110891037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16
; Narrower blocks. Prologue: horizontally filter the first source row and keep
; it as words in m0 (no pack here, unlike the 16-wide path).
110968e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m0, [srcq]
111068e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [srcq+1]
111191037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
111291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m1
111391037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m0, filter_x_a
111491037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
111591037db265ecdd914a26e056cf69207b4f50924ehkuang%else
111691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
111791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
111891037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m0, filter_x_a
111991037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m1, filter_x_b
112091037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
112191037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, m1
112291037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
112391037db265ecdd914a26e056cf69207b4f50924ehkuang  add                srcq, src_strideq
112491037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m0, 4
112591037db265ecdd914a26e056cf69207b4f50924ehkuang.x_other_y_half_loop:
; Two output rows per iteration. On entry m0 = previous filtered row (words).
; m2/m1 = next source row and its one-byte-shifted copy, m4/m3 = the same for
; the row after that.
112668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m2, [srcq]
112768e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [srcq+1]
112868e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m4, [srcq+src_strideq]
112968e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [srcq+src_strideq+1]
113091037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
113191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m2, m1
113291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m4, m3
113391037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m2, filter_x_a
113491037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m4, filter_x_a
; m1/m3 are no longer needed as source operands; reuse them for the dst rows.
113568e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
113668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [dstq+dst_strideq]
113791037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
113891037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m4, filter_rnd
113991037db265ecdd914a26e056cf69207b4f50924ehkuang%else
114091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m2, m5
114191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
114291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m4, m5
114391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
114491037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m2, filter_x_a
114591037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m1, filter_x_b
114691037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
114791037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m4, filter_x_a
114891037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m3, filter_x_b
114991037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m4, filter_rnd
115091037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, m1
115168e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
115291037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m4, m3
115368e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [dstq+dst_strideq]
115491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
115591037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
115691037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m4, 4
; Vertical half-pel on words: output row 0 = avg(previous row, m2) lands in m0,
; output row 1 = avg(m2, m4) lands in m2. m4 is left untouched for the carry.
115791037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgw                m0, m2
115891037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgw                m2, m4
115991037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
116091037db265ecdd914a26e056cf69207b4f50924ehkuang  ; FIXME(rbultje) pipeline - also consider going to bytes here
; Width 4 only: copy the low half of m2 (second row) into the high half of m0
; so that the pack below places both output rows in the low 8 bytes of m0.
116168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 == 4
116268e1c830ade592be74773e249bf94e2bbfb50de7Johann  movlhps              m0, m2
116368e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
116491037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m0, m2
116568e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
116691037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
116791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
116891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
116968e1c830ade592be74773e249bf94e2bbfb50de7Johann%else
; Width 4: load the second predictor with movh, average, widen the low bytes
; to words, then move the high half of m0 (second row) back into m2.
117068e1c830ade592be74773e249bf94e2bbfb50de7Johann  movh                 m2, [secq]
117168e1c830ade592be74773e249bf94e2bbfb50de7Johann  pavgb                m0, m2
117268e1c830ade592be74773e249bf94e2bbfb50de7Johann  punpcklbw            m0, m5
117368e1c830ade592be74773e249bf94e2bbfb50de7Johann  movhlps              m2, m0
117468e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
117591037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
; Widen both dst rows and accumulate via SUM_SSE.
117691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
117791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
117891037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
; Carry the last filtered row (words) into the next iteration.
117991037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m0, m4
118091037db265ecdd914a26e056cf69207b4f50924ehkuang
118191037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                srcq, [srcq+src_strideq*2]
118291037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                dstq, [dstq+dst_strideq*2]
118391037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
118491037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
118591037db265ecdd914a26e056cf69207b4f50924ehkuang  add                secq, sec_str
118691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
; Loop while the decremented counter stays positive. block_height is set up
; outside this view; presumably it counts loop iterations rather than rows for
; the two-rows-per-iteration path - confirm against the prologue.
11877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  dec                   block_height
118891037db265ecdd914a26e056cf69207b4f50924ehkuang  jg .x_other_y_half_loop
; Drop this section's filter aliases so later sections can rebind them.
118991037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_x_a
119091037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_x_b
119191037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_rnd
119268e1c830ade592be74773e249bf94e2bbfb50de7Johann  STORE_AND_RET %1
119391037db265ecdd914a26e056cf69207b4f50924ehkuang
119491037db265ecdd914a26e056cf69207b4f50924ehkuang.x_nonhalf_y_nonhalf:
119591037db265ecdd914a26e056cf69207b4f50924ehkuang%ifdef PIC
119691037db265ecdd914a26e056cf69207b4f50924ehkuang  lea        bilin_filter, [bilin_filter_m]
119791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
119891037db265ecdd914a26e056cf69207b4f50924ehkuang  shl           x_offsetd, filter_idx_shift
119991037db265ecdd914a26e056cf69207b4f50924ehkuang  shl           y_offsetd, filter_idx_shift
120068e1c830ade592be74773e249bf94e2bbfb50de7Johann%if ARCH_X86_64 && %1 > 4
120191037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m8, [bilin_filter+x_offsetq]
120291037db265ecdd914a26e056cf69207b4f50924ehkuang%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
120391037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m9, [bilin_filter+x_offsetq+16]
120491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
120591037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                m10, [bilin_filter+y_offsetq]
120691037db265ecdd914a26e056cf69207b4f50924ehkuang%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
120791037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                m11, [bilin_filter+y_offsetq+16]
120891037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
120991037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                m12, [pw_8]
121091037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_a m8
121191037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_b m9
121291037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_a m10
121391037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_b m11
121491037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd m12
12159b35249446b07f40ac5fcc3205f2c048616efacchkuang%else   ; x86-32
12169b35249446b07f40ac5fcc3205f2c048616efacchkuang%if ARCH_X86=1 && CONFIG_PIC=1
12179b35249446b07f40ac5fcc3205f2c048616efacchkuang; In this case, there is NO unused register. Used src_stride register. Later,
12189b35249446b07f40ac5fcc3205f2c048616efacchkuang; src_stride has to be loaded from stack when it is needed.
12199b35249446b07f40ac5fcc3205f2c048616efacchkuang%define tempq src_strideq
12209b35249446b07f40ac5fcc3205f2c048616efacchkuang  mov tempq, g_bilin_filterm
12219b35249446b07f40ac5fcc3205f2c048616efacchkuang  add           x_offsetq, tempq
12229b35249446b07f40ac5fcc3205f2c048616efacchkuang  add           y_offsetq, tempq
12239b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_x_a [x_offsetq]
12249b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_x_b [x_offsetq+16]
12259b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_y_a [y_offsetq]
12269b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_y_b [y_offsetq+16]
12279b35249446b07f40ac5fcc3205f2c048616efacchkuang
12289b35249446b07f40ac5fcc3205f2c048616efacchkuang  mov tempq, g_pw_8m
12299b35249446b07f40ac5fcc3205f2c048616efacchkuang%define filter_rnd [tempq]
123091037db265ecdd914a26e056cf69207b4f50924ehkuang%else
123191037db265ecdd914a26e056cf69207b4f50924ehkuang  add           x_offsetq, bilin_filter
123291037db265ecdd914a26e056cf69207b4f50924ehkuang  add           y_offsetq, bilin_filter
123391037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_a [x_offsetq]
123491037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_x_b [x_offsetq+16]
123591037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_a [y_offsetq]
123691037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_y_b [y_offsetq+16]
123791037db265ecdd914a26e056cf69207b4f50924ehkuang%define filter_rnd [pw_8]
123891037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
12399b35249446b07f40ac5fcc3205f2c048616efacchkuang%endif
12409b35249446b07f40ac5fcc3205f2c048616efacchkuang
124191037db265ecdd914a26e056cf69207b4f50924ehkuang  ; x_offset == bilin interpolation && y_offset == bilin interpolation
124291037db265ecdd914a26e056cf69207b4f50924ehkuang%if %1 == 16
124391037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m0, [srcq]
124491037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m1, [srcq+1]
124591037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
124691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m1
124791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m1
124891037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m2, filter_x_a
124991037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m0, filter_x_a
125091037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
125191037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
125291037db265ecdd914a26e056cf69207b4f50924ehkuang%else
125391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
125491037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m1, m5
125591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
125691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
125791037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m0, filter_x_a
125891037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m1, filter_x_b
125991037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
126091037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m2, filter_x_a
126191037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m3, filter_x_b
126291037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
126391037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, m1
126491037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, m3
126591037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
126691037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m0, 4
126791037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
12689b35249446b07f40ac5fcc3205f2c048616efacchkuang
12699b35249446b07f40ac5fcc3205f2c048616efacchkuang  INC_SRC_BY_SRC_STRIDE
12709b35249446b07f40ac5fcc3205f2c048616efacchkuang
127191037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m0, m2
127291037db265ecdd914a26e056cf69207b4f50924ehkuang.x_other_y_other_loop:
127391037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
127491037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m4, [srcq]
127591037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m3, [srcq+1]
127691037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m1, [dstq]
127791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m4, m3
127891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m4, m3
127991037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m2, filter_x_a
128091037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m4, filter_x_a
128191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m1, m5
128291037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
128391037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m4, filter_rnd
128491037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
128591037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m4, 4
128691037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m4, m2
128791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m4
128891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m4
128991037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m2, filter_y_a
129091037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m0, filter_y_a
129191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
129291037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
129391037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
129491037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
129591037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m0, 4
129691037db265ecdd914a26e056cf69207b4f50924ehkuang%else
129791037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m3, [srcq]
129891037db265ecdd914a26e056cf69207b4f50924ehkuang  movu                 m4, [srcq+1]
129991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m1, m3, m5
130091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m4, m5
130191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
130291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m4, m5
130391037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m3, filter_x_a
130491037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m4, filter_x_b
130591037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m3, filter_rnd
130691037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m1, filter_x_a
130791037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m2, filter_x_b
130891037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m1, filter_rnd
130991037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m3, m4
131091037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m1, m2
131191037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m3, 4
131291037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m1, 4
131391037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m4, m3, m1
131491037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
131591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
131691037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m2, filter_y_a
131791037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m1, filter_y_b
131891037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
131991037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m0, filter_y_a
132091037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m3, filter_y_b
132191037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, m1
132291037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m1, [dstq]
132391037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
132491037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
132591037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, m3
132691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m3, m1, m5
132791037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m0, 4
132891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
132991037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
133091037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
133191037db265ecdd914a26e056cf69207b4f50924ehkuang  ; FIXME(rbultje) pipeline
133291037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m0, m2
133391037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
133491037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
133591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
133691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
133791037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
133891037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m0, m4
133991037db265ecdd914a26e056cf69207b4f50924ehkuang
13409b35249446b07f40ac5fcc3205f2c048616efacchkuang  INC_SRC_BY_SRC_STRIDE
134191037db265ecdd914a26e056cf69207b4f50924ehkuang  add                dstq, dst_strideq
134291037db265ecdd914a26e056cf69207b4f50924ehkuang%else ; %1 < 16
134368e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m0, [srcq]
134468e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [srcq+1]
134591037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
134691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m1
134791037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m0, filter_x_a
134891037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
134991037db265ecdd914a26e056cf69207b4f50924ehkuang%else
135091037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
135191037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
135291037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m0, filter_x_a
135391037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m1, filter_x_b
135491037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
135591037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, m1
135691037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
135791037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m0, 4
135891037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
135991037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m0, m0
136091037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
13619b35249446b07f40ac5fcc3205f2c048616efacchkuang
13629b35249446b07f40ac5fcc3205f2c048616efacchkuang  INC_SRC_BY_SRC_STRIDE
13639b35249446b07f40ac5fcc3205f2c048616efacchkuang
136491037db265ecdd914a26e056cf69207b4f50924ehkuang.x_other_y_other_loop:
136568e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m2, [srcq]
136668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [srcq+1]
13679b35249446b07f40ac5fcc3205f2c048616efacchkuang
13689b35249446b07f40ac5fcc3205f2c048616efacchkuang  INC_SRC_BY_SRC_STRIDE
136968e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m4, [srcq]
137068e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [srcq+1]
13719b35249446b07f40ac5fcc3205f2c048616efacchkuang
137291037db265ecdd914a26e056cf69207b4f50924ehkuang%if cpuflag(ssse3)
137391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m2, m1
137491037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m4, m3
137591037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m2, filter_x_a
137691037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m4, filter_x_a
137768e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [dstq+dst_strideq]
137868e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
137991037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
138091037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m4, filter_rnd
138191037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
138291037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m4, 4
138391037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m2, m2
138491037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m4, m4
138591037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m2
138691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m2, m4
138791037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m0, filter_y_a
138891037db265ecdd914a26e056cf69207b4f50924ehkuang  pmaddubsw            m2, filter_y_a
138991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
139091037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
139191037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
139291037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m0, 4
139391037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
139491037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
139591037db265ecdd914a26e056cf69207b4f50924ehkuang%else
139691037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m2, m5
139791037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
139891037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m4, m5
139991037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
140091037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m2, filter_x_a
140191037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m1, filter_x_b
140291037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
140391037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m4, filter_x_a
140491037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m3, filter_x_b
140591037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m4, filter_rnd
140691037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, m1
140791037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m4, m3
140891037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
140991037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m4, 4
141091037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m0, filter_y_a
141191037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m3, m2, filter_y_b
141291037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, filter_rnd
141391037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m2, filter_y_a
141491037db265ecdd914a26e056cf69207b4f50924ehkuang  pmullw               m1, m4, filter_y_b
141591037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, filter_rnd
141691037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m0, m3
141768e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m3, [dstq+dst_strideq]
141891037db265ecdd914a26e056cf69207b4f50924ehkuang  paddw                m2, m1
141968e1c830ade592be74773e249bf94e2bbfb50de7Johann  movx                 m1, [dstq]
142091037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m0, 4
142191037db265ecdd914a26e056cf69207b4f50924ehkuang  psraw                m2, 4
142291037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m3, m5
142391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m1, m5
142491037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
142591037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
142691037db265ecdd914a26e056cf69207b4f50924ehkuang  ; FIXME(rbultje) pipeline
142768e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 == 4
142868e1c830ade592be74773e249bf94e2bbfb50de7Johann  movlhps              m0, m2
142968e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
143091037db265ecdd914a26e056cf69207b4f50924ehkuang  packuswb             m0, m2
143168e1c830ade592be74773e249bf94e2bbfb50de7Johann%if %1 > 4
143291037db265ecdd914a26e056cf69207b4f50924ehkuang  pavgb                m0, [secq]
143391037db265ecdd914a26e056cf69207b4f50924ehkuang  punpckhbw            m2, m0, m5
143491037db265ecdd914a26e056cf69207b4f50924ehkuang  punpcklbw            m0, m5
143568e1c830ade592be74773e249bf94e2bbfb50de7Johann%else
143668e1c830ade592be74773e249bf94e2bbfb50de7Johann  movh                 m2, [secq]
143768e1c830ade592be74773e249bf94e2bbfb50de7Johann  pavgb                m0, m2
143868e1c830ade592be74773e249bf94e2bbfb50de7Johann  punpcklbw            m0, m5
143968e1c830ade592be74773e249bf94e2bbfb50de7Johann  movhlps              m2, m0
144068e1c830ade592be74773e249bf94e2bbfb50de7Johann%endif
144191037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
144291037db265ecdd914a26e056cf69207b4f50924ehkuang  SUM_SSE              m0, m1, m2, m3, m6, m7
144391037db265ecdd914a26e056cf69207b4f50924ehkuang  mova                 m0, m4
144491037db265ecdd914a26e056cf69207b4f50924ehkuang
14459b35249446b07f40ac5fcc3205f2c048616efacchkuang  INC_SRC_BY_SRC_STRIDE
144691037db265ecdd914a26e056cf69207b4f50924ehkuang  lea                dstq, [dstq+dst_strideq*2]
144791037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
144891037db265ecdd914a26e056cf69207b4f50924ehkuang%if %2 == 1 ; avg
144991037db265ecdd914a26e056cf69207b4f50924ehkuang  add                secq, sec_str
145091037db265ecdd914a26e056cf69207b4f50924ehkuang%endif
14517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  dec                   block_height
145291037db265ecdd914a26e056cf69207b4f50924ehkuang  jg .x_other_y_other_loop
145391037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_x_a
145491037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_x_b
145591037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_y_a
145691037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_y_b
145791037db265ecdd914a26e056cf69207b4f50924ehkuang%undef filter_rnd
145868e1c830ade592be74773e249bf94e2bbfb50de7Johann%undef movx
145968e1c830ade592be74773e249bf94e2bbfb50de7Johann  STORE_AND_RET %1
146091037db265ecdd914a26e056cf69207b4f50924ehkuang%endmacro
146191037db265ecdd914a26e056cf69207b4f50924ehkuang
146291037db265ecdd914a26e056cf69207b4f50924ehkuang; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
146391037db265ecdd914a26e056cf69207b4f50924ehkuang; between the ssse3 and non-ssse3 versions. It may make sense to merge their
146491037db265ecdd914a26e056cf69207b4f50924ehkuang; code in the sense that the ssse3 version would jump to the appropriate
146591037db265ecdd914a26e056cf69207b4f50924ehkuang; location in the sse/2 version, rather than duplicating that code in the
146691037db265ecdd914a26e056cf69207b4f50924ehkuang; binary.
146791037db265ecdd914a26e056cf69207b4f50924ehkuang
; Instantiations of the SUBPEL_VARIANCE macro.
;
; The first argument (%1) is the block width; the macro body branches on
; %1 == 16, %1 < 16 and %1 == 4. A second argument of 1 (%2 == 1) enables the
; "avg" code paths, which pavgb the filtered result with the data at secq
; before it is handed to SUM_SSE.
; NOTE(review): with no second argument %2 presumably defaults to the non-avg
; variant; the %macro header is outside this view -- confirm there.
;
; INIT_XMM (from x86inc.asm) selects the instruction set for the expansions
; that follow. The macro body tests cpuflag(ssse3) to choose between the
; pmaddubsw-based filter and the pmullw-based one.

; Non-avg variants, sse2.
146891037db265ecdd914a26e056cf69207b4f50924ehkuangINIT_XMM sse2
146968e1c830ade592be74773e249bf94e2bbfb50de7JohannSUBPEL_VARIANCE  4
147091037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE  8
147191037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE 16
147291037db265ecdd914a26e056cf69207b4f50924ehkuang
; Non-avg variants, ssse3.
147391037db265ecdd914a26e056cf69207b4f50924ehkuangINIT_XMM ssse3
147468e1c830ade592be74773e249bf94e2bbfb50de7JohannSUBPEL_VARIANCE  4
147591037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE  8
147691037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE 16
147791037db265ecdd914a26e056cf69207b4f50924ehkuang
; Avg variants (%2 == 1), sse2.
147891037db265ecdd914a26e056cf69207b4f50924ehkuangINIT_XMM sse2
147968e1c830ade592be74773e249bf94e2bbfb50de7JohannSUBPEL_VARIANCE  4, 1
148091037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE  8, 1
148191037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE 16, 1
148291037db265ecdd914a26e056cf69207b4f50924ehkuang
; Avg variants (%2 == 1), ssse3.
148391037db265ecdd914a26e056cf69207b4f50924ehkuangINIT_XMM ssse3
148468e1c830ade592be74773e249bf94e2bbfb50de7JohannSUBPEL_VARIANCE  4, 1
148591037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE  8, 1
148691037db265ecdd914a26e056cf69207b4f50924ehkuangSUBPEL_VARIANCE 16, 1
1487