/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include <assert.h>
12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h"
13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/mips/vpx_convolve_msa.h"
14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Mask table shared by the convolve kernels in this file: three rows of
 * 16 byte values.  A kernel loads one row with LD_UB/LD_SB and hands it to
 * the horizontal filter macros.
 * NOTE(review): the values look like byte-shuffle indices, with entries
 * >= 16 selecting from a second source vector - confirm against the macro
 * definitions in vpx_convolve_msa.h. */
const uint8_t mc_filt_mask_arr[16 * 3] = {
  /* offset 0: 8 width cases */
  0, 1, 1, 2, 2, 3, 3, 4,
  4, 5, 5, 6, 6, 7, 7, 8,
  /* offset 16: 4 width cases */
  0, 1, 1, 2, 2, 3, 3, 4,
  16, 17, 17, 18, 18, 19, 19, 20,
  /* offset 32: 4 width cases */
  8, 9, 9, 10, 10, 11, 11, 12,
  24, 25, 25, 26, 26, 27, 27, 28
};
23da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
24da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
25da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     uint8_t *dst, int32_t dst_stride,
26da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     int8_t *filter_horiz, int8_t *filter_vert,
27da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     int32_t height) {
28da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
29da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
30da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
31da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 mask0, mask1, mask2, mask3, out;
32da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
33da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
34da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
35da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
36da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask0 = LD_UB(&mc_filt_mask_arr[16]);
37da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src -= (3 + 3 * src_stride);
38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_horiz);
41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
42da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask1 = mask0 + 2;
44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask2 = mask0 + 4;
45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask3 = mask0 + 6;
46da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
47da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
48da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
49da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += (7 * src_stride);
50da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
51da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
56da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
57da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_vert);
62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
63da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
64da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
65da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
66da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src7, src8, src9, src10);
69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    XORI_B4_128_SB(src7, src8, src9, src10);
70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
71da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
727bc9febe8749e98a3812a0dc4380ceae75c29450Johann    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
737bc9febe8749e98a3812a0dc4380ceae75c29450Johann                              filt_hz1, filt_hz2, filt_hz3);
74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
77da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               filt_vt2, filt_vt3);
78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
797bc9febe8749e98a3812a0dc4380ceae75c29450Johann    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
807bc9febe8749e98a3812a0dc4380ceae75c29450Johann                              filt_hz1, filt_hz2, filt_hz3);
81da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
82da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
83da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               filt_vt2, filt_vt3);
85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SAT_SH2_SH(tmp0, tmp1, 7);
87da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out = PCKEV_XORI128_UB(tmp0, tmp1);
88da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out5 = hz_out9;
92da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out0 = out2;
93da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out1 = out3;
94da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out2 = out4;
95da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
96da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
97da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
98da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     uint8_t *dst, int32_t dst_stride,
100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     int8_t *filter_horiz, int8_t *filter_vert,
101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     int32_t height) {
102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask0 = LD_UB(&mc_filt_mask_arr[0]);
112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src -= (3 + 3 * src_stride);
113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_horiz);
116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask1 = mask0 + 2;
119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask2 = mask0 + 4;
120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask3 = mask0 + 6;
121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += (7 * src_stride);
124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_vert);
142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src7, src8, src9, src10);
150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    XORI_B4_128_SB(src7, src8, src9, src10);
153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1547bc9febe8749e98a3812a0dc4380ceae75c29450Johann    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
1557bc9febe8749e98a3812a0dc4380ceae75c29450Johann                              filt_hz1, filt_hz2, filt_hz3);
156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               filt_vt2, filt_vt3);
159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1607bc9febe8749e98a3812a0dc4380ceae75c29450Johann    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
1617bc9febe8749e98a3812a0dc4380ceae75c29450Johann                              filt_hz1, filt_hz2, filt_hz3);
162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               filt_vt2, filt_vt3);
165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1667bc9febe8749e98a3812a0dc4380ceae75c29450Johann    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
1677bc9febe8749e98a3812a0dc4380ceae75c29450Johann                              filt_hz1, filt_hz2, filt_hz3);
168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               filt_vt2, filt_vt3);
171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               filt_vt2, filt_vt3);
177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST8x4_UB(vec0, vec1, dst, dst_stride);
182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out6 = hz_out10;
185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out0 = out2;
186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out1 = out3;
187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out2 = out8;
188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out4 = out6;
189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out5 = out7;
190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out6 = out9;
191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* 16-column 8-tap horizontal+vertical convolution, done as two adjacent
 * 8-column strips.  All arguments are forwarded unchanged to
 * common_hv_8ht_8vt_8w_msa, apart from src and dst, which are offset by the
 * strip's starting column. */
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* 32-column 8-tap horizontal+vertical convolution, done as four adjacent
 * 8-column strips.  All arguments are forwarded unchanged to
 * common_hv_8ht_8vt_8w_msa, apart from src and dst, which are offset by the
 * strip's starting column. */
static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 8) {
    common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* 64-column 8-tap horizontal+vertical convolution, done as eight adjacent
 * 8-column strips.  All arguments are forwarded unchanged to
 * common_hv_8ht_8vt_8w_msa, apart from src and dst, which are offset by the
 * strip's starting column. */
static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 8) {
    common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      uint8_t *dst, int32_t dst_stride,
235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      int8_t *filter_horiz,
236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      int8_t *filter_vert) {
237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, mask;
238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[16]);
242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_UH(filter_horiz);
245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_UH(filter_vert);
248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      uint8_t *dst, int32_t dst_stride,
266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      int8_t *filter_horiz,
267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      int8_t *filter_vert) {
268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 res0, res1, res2, res3;
270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[16]);
275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_UH(filter_horiz);
278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_UH(filter_vert);
281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += (8 * src_stride);
285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src8 = LD_SB(src);
286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian             hz_out3, hz_out5, 8);
294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2987bc9febe8749e98a3812a0dc4380ceae75c29450Johann  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
2997bc9febe8749e98a3812a0dc4380ceae75c29450Johann              vec5, vec6, vec7);
300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
3017bc9febe8749e98a3812a0dc4380ceae75c29450Johann  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
3027bc9febe8749e98a3812a0dc4380ceae75c29450Johann              res3);
303da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  dst += (4 * dst_stride);
305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Height dispatcher for the 4-pixel-wide `2ht_2vt` kernels.
 *
 * Forwards to the 4x4 kernel when height is 4 and to the 4x8 kernel when
 * height is 8. Any other height falls through without writing to dst. */
static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  switch (height) {
    case 4:
      common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                                filter_vert);
      break;
    case 8:
      common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
                                filter_vert);
      break;
    default:
      /* No kernel for this height; intentionally a no-op. */
      break;
  }
}
320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      uint8_t *dst, int32_t dst_stride,
323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      int8_t *filter_horiz,
324da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      int8_t *filter_vert) {
325da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
326da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
328da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt;
329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[0]);
331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_horiz);
334da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_splati_h(filt, 0);
335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_vert);
337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_splati_h(filt, 0);
338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
345da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
346da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
353da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ST8x4_UB(out0, out1, dst, dst_stride);
361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
3647bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                          int32_t src_stride, uint8_t *dst,
365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                          int32_t dst_stride,
366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                          int8_t *filter_horiz,
3677bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                          int8_t *filter_vert, int32_t height) {
368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt_hz, filt_vt, vec0;
371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt;
373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[0]);
375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_horiz);
378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_splati_h(filt, 0);
379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_vert);
381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_splati_h(filt, 0);
382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src0 = LD_SB(src);
384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 3); loop_cnt--;) {
389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src1, src2, src3, src4);
390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src1, src2, src3, src4);
408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
410da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp4 = __msa_dotp_u_h(vec0, filt_vt);
411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST8x4_UB(out0, out1, dst, dst_stride);
415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp5 = __msa_dotp_u_h(vec0, filt_vt);
420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
423da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp6 = __msa_dotp_u_h(vec0, filt_vt);
424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp7 = __msa_dotp_u_h(vec0, filt_vt);
428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp8 = __msa_dotp_u_h(vec0, filt_vt);
432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST8x4_UB(out0, out1, dst, dst_stride);
436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Height dispatcher for the 8-pixel-wide `2ht_2vt` kernels.
 *
 * A height of exactly 4 uses the dedicated 8x4 kernel; every other height is
 * handed to the 8x8mult kernel, which iterates (height >> 3) times. */
static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  if (height != 4) {
    common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
                                  filter_horiz, filter_vert, height);
    return;
  }

  common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                            filter_vert);
}
452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      uint8_t *dst, int32_t dst_stride,
455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      int8_t *filter_horiz, int8_t *filter_vert,
456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      int32_t height) {
457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt_hz, filt_vt, vec0, vec1;
460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt;
462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[0]);
464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_horiz);
467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_splati_h(filt, 0);
468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_vert);
470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_splati_h(filt, 0);
471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB2(src, 8, src0, src1);
473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
477da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src2, src4, src6);
480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_ST_SB(tmp1, tmp2, dst);
489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_ST_SB(tmp1, tmp2, dst);
497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_ST_SB(tmp1, tmp2, dst);
505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_ST_SB(tmp1, tmp2, dst);
513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* 32-wide `2ht_2vt` filter: runs the 16-wide kernel on two side-by-side
 * 16-column strips, at column offsets 0 and 16 of both src and dst. */
static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 16) {
    common_hv_2ht_2vt_16w_msa(src + col, src_stride, dst + col, dst_stride,
                              filter_horiz, filter_vert, height);
  }
}
529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* 64-wide `2ht_2vt` filter: runs the 16-wide kernel on four side-by-side
 * 16-column strips, at column offsets 0, 16, 32 and 48 of both src and dst. */
static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 16) {
    common_hv_2ht_2vt_16w_msa(src + col, src_stride, dst + col, dst_stride,
                              filter_horiz, filter_vert, height);
  }
}
542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
5437bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
544df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                       ptrdiff_t dst_stride, const InterpKernel *filter,
545df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                       int x0_q4, int32_t x_step_q4, int y0_q4,
5467bc9febe8749e98a3812a0dc4380ceae75c29450Johann                       int32_t y_step_q4, int32_t w, int32_t h) {
547df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  const int16_t *const filter_x = filter[x0_q4];
548df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  const int16_t *const filter_y = filter[y0_q4];
549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int8_t cnt, filt_hor[8], filt_ver[8];
550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  assert(x_step_q4 == 16);
552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  assert(y_step_q4 == 16);
553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  assert(((const int32_t *)filter_x)[1] != 0x800000);
554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  assert(((const int32_t *)filter_y)[1] != 0x800000);
555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (cnt = 0; cnt < 8; ++cnt) {
557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt_hor[cnt] = filter_x[cnt];
558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt_ver[cnt] = filter_y[cnt];
559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (((const int32_t *)filter_x)[0] == 0 &&
562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      ((const int32_t *)filter_y)[0] == 0) {
563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    switch (w) {
564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 4:
5657bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
5667bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                 (int32_t)dst_stride, &filt_hor[3],
5677bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                 &filt_ver[3], (int32_t)h);
568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 8:
5707bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst,
5717bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                 (int32_t)dst_stride, &filt_hor[3],
5727bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                 &filt_ver[3], (int32_t)h);
573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 16:
5757bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst,
5767bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  (int32_t)dst_stride, &filt_hor[3],
5777bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  &filt_ver[3], (int32_t)h);
578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 32:
5807bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst,
5817bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  (int32_t)dst_stride, &filt_hor[3],
5827bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  &filt_ver[3], (int32_t)h);
583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 64:
5857bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst,
5867bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  (int32_t)dst_stride, &filt_hor[3],
5877bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  &filt_ver[3], (int32_t)h);
588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      default:
590df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
591df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                        x_step_q4, y0_q4, y_step_q4, w, h);
592da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else if (((const int32_t *)filter_x)[0] == 0 ||
595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian             ((const int32_t *)filter_y)[0] == 0) {
596df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
597df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                    y0_q4, y_step_q4, w, h);
598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    switch (w) {
600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 4:
6017bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst,
6027bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                 (int32_t)dst_stride, filt_hor, filt_ver,
6037bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                 (int32_t)h);
604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 8:
6067bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst,
6077bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                 (int32_t)dst_stride, filt_hor, filt_ver,
6087bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                 (int32_t)h);
609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 16:
6117bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst,
6127bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  (int32_t)dst_stride, filt_hor, filt_ver,
6137bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  (int32_t)h);
614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 32:
6167bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst,
6177bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  (int32_t)dst_stride, filt_hor, filt_ver,
6187bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  (int32_t)h);
619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 64:
6217bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst,
6227bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  (int32_t)dst_stride, filt_hor, filt_ver,
6237bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  (int32_t)h);
624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      default:
626df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
627df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                        x_step_q4, y0_q4, y_step_q4, w, h);
628da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
629da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
632df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
633df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void filter_horiz_w4_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
634df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                uint8_t *dst, const int16_t *x_filter) {
635df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint64_t srcd0, srcd1, srcd2, srcd3;
636df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint32_t res;
637df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 src0 = { 0 }, src1 = { 0 }, dst0;
638df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 out0, out1;
639df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 shf1 = { 0, 8, 16, 24, 4, 12, 20, 28, 1, 9, 17, 25, 5, 13, 21, 29 };
640df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 shf2 = shf1 + 2;
641df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 };
642df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 filt_shf1 = filt_shf0 + 2;
643df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 filt_shf2 = filt_shf0 + 4;
644df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 filt_shf3 = filt_shf0 + 6;
645df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v8i16 filt, src0_h, src1_h, src2_h, src3_h, filt0, filt1, filt2, filt3;
646df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
647df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
648df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(srcd0, srcd1, src0);
649df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(srcd2, srcd3, src1);
650df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1);
651df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  XORI_B2_128_SB(out0, out1);
652df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out0, src0_h, src1_h);
653df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out1, src2_h, src3_h);
654df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
655df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  filt = LD_SH(x_filter);
656df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1);
657df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3);
658df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
659df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h *= filt0;
660df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h += src1_h * filt1;
661df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h += src2_h * filt2;
662df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h += src3_h * filt3;
663df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
664df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8);
665df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
666df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h = __msa_adds_s_h(src0_h, src1_h);
667df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
668df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h = __msa_sat_s_h(src0_h, 7);
669df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
670df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  res = __msa_copy_u_w((v4i32)dst0, 0);
671df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SW(res, dst);
672df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
673df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
674df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void filter_horiz_w8_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
675df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                uint8_t *dst, const int16_t *x_filter) {
676df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint64_t srcd0, srcd1, srcd2, srcd3;
677df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
678df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 tmp0, tmp1, tmp2, tmp3, dst0;
679df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 out0, out1, out2, out3;
680df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
681df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 shf2 = shf1 + 4;
682df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
683df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
684df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
685df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
686df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(srcd0, srcd1, src0);
687df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(srcd2, srcd3, src1);
688df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
689df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(srcd0, srcd1, src2);
690df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(srcd2, srcd3, src3);
691df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
692df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  filt = LD_SH(x_filter);
693df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
694df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
695df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
696df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // transpose
697df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1);
698df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3);
699df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ILVRL_W2_SB(tmp2, tmp0, out0, out1);
700df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ILVRL_W2_SB(tmp3, tmp1, out2, out3);
701df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
702df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  XORI_B4_128_SB(out0, out1, out2, out3);
703df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out0, src0_h, src1_h);
704df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out1, src2_h, src3_h);
705df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out2, src4_h, src5_h);
706df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out3, src6_h, src7_h);
707df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
708df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h *= filt0;
709df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src4_h *= filt4;
710df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h += src1_h * filt1;
711df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src4_h += src5_h * filt5;
712df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h += src2_h * filt2;
713df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src4_h += src6_h * filt6;
714df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h += src3_h * filt3;
715df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src4_h += src7_h * filt7;
716df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
717df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h = __msa_adds_s_h(src0_h, src4_h);
718df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
719df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h = __msa_sat_s_h(src0_h, 7);
720df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
721df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ST8x1_UB(dst0, dst);
722df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
723df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
724df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void filter_horiz_w16_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
725df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                 uint8_t *dst, const int16_t *x_filter) {
726df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint64_t srcd0, srcd1, srcd2, srcd3;
727df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
728df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 src4 = { 0 }, src5 = { 0 }, src6 = { 0 }, src7 = { 0 };
729df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 tmp0, tmp1, tmp2, tmp3, dst0;
730df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 out0, out1, out2, out3, out4, out5, out6, out7;
731df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
732df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 shf2 = shf1 + 4;
733df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
734df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
735df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v8i16 dst0_h, dst1_h, dst2_h, dst3_h;
736df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
737df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
738df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(srcd0, srcd1, src0);
739df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(srcd2, srcd3, src1);
740df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
741df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(srcd0, srcd1, src2);
742df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(srcd2, srcd3, src3);
743df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LD4(src_x + 8 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
744df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(srcd0, srcd1, src4);
745df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(srcd2, srcd3, src5);
746df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LD4(src_x + 12 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
747df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(srcd0, srcd1, src6);
748df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(srcd2, srcd3, src7);
749df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
750df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  filt = LD_SH(x_filter);
751df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
752df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
753df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
754df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // transpose
755df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1);
756df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3);
757df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ILVRL_W2_SB(tmp2, tmp0, out0, out1);
758df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ILVRL_W2_SB(tmp3, tmp1, out2, out3);
759df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  XORI_B4_128_SB(out0, out1, out2, out3);
760df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
761df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out0, src0_h, src1_h);
762df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out1, src2_h, src3_h);
763df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out2, src4_h, src5_h);
764df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out3, src6_h, src7_h);
765df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
766df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  VSHF_B2_UB(src4, src5, src4, src5, shf1, shf2, tmp0, tmp1);
767df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  VSHF_B2_UB(src6, src7, src6, src7, shf1, shf2, tmp2, tmp3);
768df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ILVRL_W2_SB(tmp2, tmp0, out4, out5);
769df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ILVRL_W2_SB(tmp3, tmp1, out6, out7);
770df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  XORI_B4_128_SB(out4, out5, out6, out7);
771df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
772df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst0_h = src0_h * filt0;
773df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst1_h = src4_h * filt4;
774df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst0_h += src1_h * filt1;
775df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst1_h += src5_h * filt5;
776df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst0_h += src2_h * filt2;
777df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst1_h += src6_h * filt6;
778df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst0_h += src3_h * filt3;
779df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst1_h += src7_h * filt7;
780df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
781df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out4, src0_h, src1_h);
782df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out5, src2_h, src3_h);
783df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out6, src4_h, src5_h);
784df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out7, src6_h, src7_h);
785df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
786df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst2_h = src0_h * filt0;
787df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst3_h = src4_h * filt4;
788df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst2_h += src1_h * filt1;
789df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst3_h += src5_h * filt5;
790df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst2_h += src2_h * filt2;
791df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst3_h += src6_h * filt6;
792df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst2_h += src3_h * filt3;
793df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst3_h += src7_h * filt7;
794df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
795df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ADDS_SH2_SH(dst0_h, dst1_h, dst2_h, dst3_h, dst0_h, dst2_h);
796df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SRARI_H2_SH(dst0_h, dst2_h, FILTER_BITS);
797df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SAT_SH2_SH(dst0_h, dst2_h, 7);
798df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst0 = PCKEV_XORI128_UB(dst0_h, dst2_h);
799df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ST_UB(dst0, dst);
800df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
801df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
802df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void transpose4x4_to_dst(const uint8_t *src, uint8_t *dst,
803df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                ptrdiff_t dst_stride) {
804df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 in0;
805df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 out0 = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
806df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
807df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0 = LD_UB(src);
808df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out0 = __msa_vshf_b(out0, (v16i8)in0, (v16i8)in0);
809df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
810df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
811df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
812df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void transpose8x8_to_dst(const uint8_t *src, uint8_t *dst,
813df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                ptrdiff_t dst_stride) {
814df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
815df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
816df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 shf2 = shf1 + 4;
817df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
818df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LD_UB4(src, 16, in0, in1, in2, in3);
819df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  VSHF_B2_UB(in0, in1, in0, in1, shf1, shf2, tmp0, tmp1);
820df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  VSHF_B2_UB(in2, in3, in2, in3, shf1, shf2, tmp2, tmp3);
821df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ILVRL_W2_UB(tmp2, tmp0, out0, out1);
822df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ILVRL_W2_UB(tmp3, tmp1, out2, out3);
823df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ST8x4_UB(out0, out1, dst, dst_stride);
824df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ST8x4_UB(out2, out3, dst + 4 * dst_stride, dst_stride);
825df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
826df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
827df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void transpose16x16_to_dst(const uint8_t *src, uint8_t *dst,
828df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                  ptrdiff_t dst_stride) {
829df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12;
830df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7, out8;
831df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 out9, out10, out11, out12, out13, out14, out15;
832df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
833df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LD_UB8(src, 16, in0, in1, in2, in3, in4, in5, in6, in7);
834df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LD_UB8(src + 16 * 8, 16, in8, in9, in10, in11, in12, in13, in14, in15);
835df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
836df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
837df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                      in11, in12, in13, in14, in15, out0, out1, out2, out3,
838df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                      out4, out5, out6, out7);
839df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ST_UB8(out0, out1, out2, out3, out4, out5, out6, out7, dst, dst_stride);
840df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst += 8 * dst_stride;
841df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
842df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SLDI_B4_0_UB(in0, in1, in2, in3, in0, in1, in2, in3, 8);
843df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SLDI_B4_0_UB(in4, in5, in6, in7, in4, in5, in6, in7, 8);
844df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SLDI_B4_0_UB(in8, in9, in10, in11, in8, in9, in10, in11, 8);
845df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SLDI_B4_0_UB(in12, in13, in14, in15, in12, in13, in14, in15, 8);
846df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
847df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
848df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                      in11, in12, in13, in14, in15, out8, out9, out10, out11,
849df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                      out12, out13, out14, out15);
850df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ST_UB8(out8, out9, out10, out11, out12, out13, out14, out15, dst, dst_stride);
851df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
852df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
853df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
854df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                    uint8_t *dst, ptrdiff_t dst_stride,
855df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                    const InterpKernel *x_filters, int x0_q4,
856df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                    int x_step_q4, int h) {
857df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
858df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int y, z, i;
859df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src -= SUBPEL_TAPS / 2 - 1;
860df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
861df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  for (y = 0; y < h; y += 4) {
862df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    int x_q4 = x0_q4;
863df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    for (z = 0; z < 4; ++z) {
864df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
865df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
866df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
867df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      if (x_q4 & SUBPEL_MASK) {
868df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        filter_horiz_w4_msa(src_x, src_stride, temp + (z * 4), x_filter);
869df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      } else {
870df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        for (i = 0; i < 4; ++i) {
871df37111358d02836cb29bbcb9c6e4c95dff90a16Johann          temp[z * 4 + i] = src_x[i * src_stride + 3];
872df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        }
873df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      }
874df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
875df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      x_q4 += x_step_q4;
876df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    }
877df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
878df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    transpose4x4_to_dst(temp, dst, dst_stride);
879df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
880df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src += src_stride * 4;
881df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    dst += dst_stride * 4;
882df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  }
883df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
884df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
885df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
886df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                    uint8_t *dst, ptrdiff_t dst_stride,
887df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                    const InterpKernel *x_filters, int x0_q4,
888df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                    int x_step_q4, int h) {
889df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
890df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int y, z, i;
891df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src -= SUBPEL_TAPS / 2 - 1;
892df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
893df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // This function processes 8x8 areas. The intermediate height is not always
894df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // a multiple of 8, so force it to be a multiple of 8 here.
895df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  y = h + (8 - (h & 0x7));
896df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
897df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  do {
898df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    int x_q4 = x0_q4;
899df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    for (z = 0; z < 8; ++z) {
900df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
901df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
902df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
903df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      if (x_q4 & SUBPEL_MASK) {
904df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        filter_horiz_w8_msa(src_x, src_stride, temp + (z * 8), x_filter);
905df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      } else {
906df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        for (i = 0; i < 8; ++i) {
907df37111358d02836cb29bbcb9c6e4c95dff90a16Johann          temp[z * 8 + i] = src_x[3 + i * src_stride];
908df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        }
909df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      }
910df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
911df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      x_q4 += x_step_q4;
912df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    }
913df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
914df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    transpose8x8_to_dst(temp, dst, dst_stride);
915df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
916df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src += src_stride * 8;
917df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    dst += dst_stride * 8;
918df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  } while (y -= 8);
919df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
920df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
921df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void scaledconvolve_horiz_mul16(const uint8_t *src, ptrdiff_t src_stride,
922df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                       uint8_t *dst, ptrdiff_t dst_stride,
923df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                       const InterpKernel *x_filters, int x0_q4,
924df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                       int x_step_q4, int w, int h) {
925df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(16, uint8_t, temp[16 * 16]);
926df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int x, y, z, i;
927df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
928df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src -= SUBPEL_TAPS / 2 - 1;
929df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
930df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // This function processes 16x16 areas.  The intermediate height is not always
931df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // a multiple of 16, so force it to be a multiple of 8 here.
932df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  y = h + (16 - (h & 0xF));
933df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
934df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  do {
935df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    int x_q4 = x0_q4;
936df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    for (x = 0; x < w; x += 16) {
937df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      for (z = 0; z < 16; ++z) {
938df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
939df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
940df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
941df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        if (x_q4 & SUBPEL_MASK) {
942df37111358d02836cb29bbcb9c6e4c95dff90a16Johann          filter_horiz_w16_msa(src_x, src_stride, temp + (z * 16), x_filter);
943df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        } else {
944df37111358d02836cb29bbcb9c6e4c95dff90a16Johann          for (i = 0; i < 16; ++i) {
945df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            temp[z * 16 + i] = src_x[3 + i * src_stride];
946df37111358d02836cb29bbcb9c6e4c95dff90a16Johann          }
947df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        }
948df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
949df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        x_q4 += x_step_q4;
950df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      }
951df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
952df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      transpose16x16_to_dst(temp, dst + x, dst_stride);
953df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    }
954df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
955df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src += src_stride * 16;
956df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    dst += dst_stride * 16;
957df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  } while (y -= 16);
958df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
959df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
960df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void filter_vert_w4_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
961df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               uint8_t *dst, const int16_t *y_filter) {
962df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint32_t srcw0, srcw1, srcw2, srcw3, srcw4, srcw5, srcw6, srcw7;
963df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint32_t res;
964df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 src0 = { 0 }, src1 = { 0 }, dst0;
965df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 out0, out1;
966df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 shf1 = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 };
967df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 shf2 = shf1 + 8;
968df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 };
969df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 filt_shf1 = filt_shf0 + 2;
970df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 filt_shf2 = filt_shf0 + 4;
971df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 filt_shf3 = filt_shf0 + 6;
972df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v8i16 filt, src0_h, src1_h, src2_h, src3_h;
973df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v8i16 filt0, filt1, filt2, filt3;
974df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
975df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LW4(src_y, src_pitch, srcw0, srcw1, srcw2, srcw3);
976df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LW4(src_y + 4 * src_pitch, src_pitch, srcw4, srcw5, srcw6, srcw7);
977df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_W4_UB(srcw0, srcw1, srcw2, srcw3, src0);
978df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_W4_UB(srcw4, srcw5, srcw6, srcw7, src1);
979df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1);
980df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  XORI_B2_128_SB(out0, out1);
981df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out0, src0_h, src1_h);
982df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(out1, src2_h, src3_h);
983df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
984df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  filt = LD_SH(y_filter);
985df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1);
986df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3);
987df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
988df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h *= filt0;
989df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h += src1_h * filt1;
990df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h += src2_h * filt2;
991df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h += src3_h * filt3;
992df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
993df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8);
994df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
995df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h = __msa_adds_s_h(src0_h, src1_h);
996df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
997df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h = __msa_sat_s_h(src0_h, 7);
998df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
999df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  res = __msa_copy_u_w((v4i32)dst0, 0);
1000df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SW(res, dst);
1001df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
1002df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1003df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void filter_vert_w8_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
1004df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               uint8_t *dst, const int16_t *y_filter) {
1005df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint64_t srcd0, srcd1, srcd2, srcd3;
1006df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 dst0;
1007df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
1008df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
1009df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
1010df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1011df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LD4(src_y, src_pitch, srcd0, srcd1, srcd2, srcd3);
1012df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_SB(srcd0, srcd1, src0);
1013df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_SB(srcd2, srcd3, src1);
1014df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LD4(src_y + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
1015df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_SB(srcd0, srcd1, src2);
1016df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_SB(srcd2, srcd3, src3);
1017df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1018df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  filt = LD_SH(y_filter);
1019df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1020df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
1021df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1022df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  XORI_B4_128_SB(src0, src1, src2, src3);
1023df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(src0, src0_h, src1_h);
1024df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(src1, src2_h, src3_h);
1025df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(src2, src4_h, src5_h);
1026df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  UNPCK_SB_SH(src3, src6_h, src7_h);
1027df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1028df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h *= filt0;
1029df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src4_h *= filt4;
1030df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h += src1_h * filt1;
1031df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src4_h += src5_h * filt5;
1032df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h += src2_h * filt2;
1033df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src4_h += src6_h * filt6;
1034df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h += src3_h * filt3;
1035df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src4_h += src7_h * filt7;
1036df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1037df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h = __msa_adds_s_h(src0_h, src4_h);
1038df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
1039df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src0_h = __msa_sat_s_h(src0_h, 7);
1040df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
1041df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ST8x1_UB(dst0, dst);
1042df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
1043df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1044df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void filter_vert_mul_w16_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
1045df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                    uint8_t *dst, const int16_t *y_filter,
1046df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                    int w) {
1047df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int x;
1048df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 dst0;
1049df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1050df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
1051df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v8i16 src8_h, src9_h, src10_h, src11_h, src12_h, src13_h, src14_h, src15_h;
1052df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
1053df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1054df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  filt = LD_SH(y_filter);
1055df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1056df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
1057df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1058df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  for (x = 0; x < w; x += 16) {
1059df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    LD_SB8(src_y, src_pitch, src0, src1, src2, src3, src4, src5, src6, src7);
1060df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src_y += 16;
1061df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1062df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    XORI_B4_128_SB(src0, src1, src2, src3);
1063df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    XORI_B4_128_SB(src4, src5, src6, src7);
1064df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    UNPCK_SB_SH(src0, src0_h, src1_h);
1065df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    UNPCK_SB_SH(src1, src2_h, src3_h);
1066df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    UNPCK_SB_SH(src2, src4_h, src5_h);
1067df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    UNPCK_SB_SH(src3, src6_h, src7_h);
1068df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    UNPCK_SB_SH(src4, src8_h, src9_h);
1069df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    UNPCK_SB_SH(src5, src10_h, src11_h);
1070df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    UNPCK_SB_SH(src6, src12_h, src13_h);
1071df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    UNPCK_SB_SH(src7, src14_h, src15_h);
1072df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1073df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src0_h *= filt0;
1074df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src1_h *= filt0;
1075df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src8_h *= filt4;
1076df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src9_h *= filt4;
1077df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src0_h += src2_h * filt1;
1078df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src1_h += src3_h * filt1;
1079df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src8_h += src10_h * filt5;
1080df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src9_h += src11_h * filt5;
1081df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src0_h += src4_h * filt2;
1082df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src1_h += src5_h * filt2;
1083df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src8_h += src12_h * filt6;
1084df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src9_h += src13_h * filt6;
1085df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src0_h += src6_h * filt3;
1086df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src1_h += src7_h * filt3;
1087df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src8_h += src14_h * filt7;
1088df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    src9_h += src15_h * filt7;
1089df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1090df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    ADDS_SH2_SH(src0_h, src8_h, src1_h, src9_h, src0_h, src1_h);
1091df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    SRARI_H2_SH(src0_h, src1_h, FILTER_BITS);
1092df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    SAT_SH2_SH(src0_h, src1_h, 7);
1093df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    dst0 = PCKEV_XORI128_UB(src0_h, src1_h);
1094df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    ST_UB(dst0, dst);
1095df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    dst += 16;
1096df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  }
1097df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
1098df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1099df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
1100df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                   uint8_t *dst, ptrdiff_t dst_stride,
1101df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                   const InterpKernel *y_filters, int y0_q4,
1102df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                   int y_step_q4, int h) {
1103df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int y;
1104df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int y_q4 = y0_q4;
1105df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1106df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1107df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1108df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  for (y = 0; y < h; ++y) {
1109df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1110df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1111df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1112df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    if (y_q4 & SUBPEL_MASK) {
1113df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      filter_vert_w4_msa(src_y, src_stride, &dst[y * dst_stride], y_filter);
1114df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    } else {
1115df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      uint32_t srcd = LW(src_y + 3 * src_stride);
1116df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      SW(srcd, dst + y * dst_stride);
1117df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    }
1118df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1119df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    y_q4 += y_step_q4;
1120df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  }
1121df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
1122df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1123df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
1124df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                   uint8_t *dst, ptrdiff_t dst_stride,
1125df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                   const InterpKernel *y_filters, int y0_q4,
1126df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                   int y_step_q4, int h) {
1127df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int y;
1128df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int y_q4 = y0_q4;
1129df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1130df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1131df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1132df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  for (y = 0; y < h; ++y) {
1133df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1134df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1135df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1136df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    if (y_q4 & SUBPEL_MASK) {
1137df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      filter_vert_w8_msa(src_y, src_stride, &dst[y * dst_stride], y_filter);
1138df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    } else {
1139df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      uint64_t srcd = LD(src_y + 3 * src_stride);
1140df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      SD(srcd, dst + y * dst_stride);
1141df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    }
1142df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1143df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    y_q4 += y_step_q4;
1144df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  }
1145df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
1146df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1147df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void scaledconvolve_vert_mul16(const uint8_t *src, ptrdiff_t src_stride,
1148df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                      uint8_t *dst, ptrdiff_t dst_stride,
1149df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                      const InterpKernel *y_filters, int y0_q4,
1150df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                      int y_step_q4, int w, int h) {
1151df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int x, y;
1152df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int y_q4 = y0_q4;
1153df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1154df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1155df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  for (y = 0; y < h; ++y) {
1156df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1157df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1158df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1159df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    if (y_q4 & SUBPEL_MASK) {
1160df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      filter_vert_mul_w16_msa(src_y, src_stride, &dst[y * dst_stride], y_filter,
1161df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              w);
1162df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    } else {
1163df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      for (x = 0; x < w; ++x) {
1164df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        dst[x + y * dst_stride] = src_y[x + 3 * src_stride];
1165df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      }
1166df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    }
1167df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1168df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    y_q4 += y_step_q4;
1169df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  }
1170df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
1171df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1172df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1173df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                       ptrdiff_t dst_stride, const InterpKernel *filter,
1174df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
1175df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                       int w, int h) {
1176df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
1177df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // 2d filtering proceeds in 2 steps:
1178df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  //   (1) Interpolate horizontally into an intermediate buffer, temp.
1179df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  //   (2) Interpolate temp vertically to derive the sub-pixel result.
1180df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // Deriving the maximum number of rows in the temp buffer (135):
1181df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1182df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // --Largest block size is 64x64 pixels.
1183df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
1184df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  //   original frame (in 1/16th pixel units).
1185df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // --Must round-up because block may be located at sub-pixel position.
1186df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1187df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
1188df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // --Require an additional 8 rows for the horiz_w8 transpose tail.
1189df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
1190df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  const int intermediate_height =
1191df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1192df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1193df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  assert(w <= 64);
1194df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  assert(h <= 64);
1195df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
1196df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  assert(x_step_q4 <= 64);
1197df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1198df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  if ((0 == x0_q4) && (16 == x_step_q4) && (0 == y0_q4) && (16 == y_step_q4)) {
1199df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    vpx_convolve_copy_msa(src, src_stride, dst, dst_stride, filter, x0_q4,
1200df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                          x_step_q4, y0_q4, y_step_q4, w, h);
1201df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  } else {
1202df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    if (w >= 16) {
1203df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      scaledconvolve_horiz_mul16(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1204df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                 src_stride, temp, 64, filter, x0_q4, x_step_q4,
1205df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                 w, intermediate_height);
1206df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    } else if (w == 8) {
1207df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1208df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              src_stride, temp, 64, filter, x0_q4, x_step_q4,
1209df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              intermediate_height);
1210df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    } else {
1211df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1212df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              src_stride, temp, 64, filter, x0_q4, x_step_q4,
1213df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              intermediate_height);
1214df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    }
1215df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1216df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    if (w >= 16) {
1217df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      scaledconvolve_vert_mul16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1218df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                dst_stride, filter, y0_q4, y_step_q4, w, h);
1219df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    } else if (w == 8) {
1220df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1221df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                             dst_stride, filter, y0_q4, y_step_q4, h);
1222df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    } else {
1223df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1224df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                             dst_stride, filter, y0_q4, y_step_q4, h);
1225df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    }
1226df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  }
1227df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
1228