1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include "./vpx_dsp_rtcd.h"
13#include "vpx_dsp/mips/vpx_convolve_msa.h"
14
15static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
16    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
17    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
18  uint32_t loop_cnt;
19  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
20  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
21  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
22  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
23  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
24  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
25
26  mask0 = LD_UB(&mc_filt_mask_arr[16]);
27  src -= (3 + 3 * src_stride);
28
29  /* rearranging filter */
30  filt = LD_SH(filter_horiz);
31  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
32
33  mask1 = mask0 + 2;
34  mask2 = mask0 + 4;
35  mask3 = mask0 + 6;
36
37  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
38  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
39  src += (7 * src_stride);
40
41  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
42                            filt_hz1, filt_hz2, filt_hz3);
43  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
44                            filt_hz1, filt_hz2, filt_hz3);
45  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
46                            filt_hz1, filt_hz2, filt_hz3);
47  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
48                            filt_hz1, filt_hz2, filt_hz3);
49  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
50
51  filt = LD_SH(filter_vert);
52  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
53
54  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
55  vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
56
57  for (loop_cnt = (height >> 2); loop_cnt--;) {
58    LD_SB4(src, src_stride, src7, src8, src9, src10);
59    XORI_B4_128_SB(src7, src8, src9, src10);
60    src += (4 * src_stride);
61
62    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
63    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
64                              filt_hz1, filt_hz2, filt_hz3);
65    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
66    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
67    res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
68                               filt_vt2, filt_vt3);
69
70    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
71                              filt_hz1, filt_hz2, filt_hz3);
72    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
73    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
74    res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
75                               filt_vt2, filt_vt3);
76    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
77
78    SRARI_H2_SH(res0, res1, FILTER_BITS);
79    SAT_SH2_SH(res0, res1, 7);
80    PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
81    XORI_B2_128_UB(tmp0, tmp1);
82    AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
83    ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
84    dst += (4 * dst_stride);
85
86    hz_out5 = hz_out9;
87    vec0 = vec2;
88    vec1 = vec3;
89    vec2 = vec4;
90  }
91}
92
93static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
94    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
95    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
96  uint32_t loop_cnt;
97  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
98  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
99  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
100  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
101  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
102  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
103  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
104
105  mask0 = LD_UB(&mc_filt_mask_arr[0]);
106  src -= (3 + 3 * src_stride);
107
108  /* rearranging filter */
109  filt = LD_SH(filter_horiz);
110  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
111
112  mask1 = mask0 + 2;
113  mask2 = mask0 + 4;
114  mask3 = mask0 + 6;
115
116  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
117  src += (7 * src_stride);
118
119  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
120  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
121                            filt_hz1, filt_hz2, filt_hz3);
122  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
123                            filt_hz1, filt_hz2, filt_hz3);
124  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
125                            filt_hz1, filt_hz2, filt_hz3);
126  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
127                            filt_hz1, filt_hz2, filt_hz3);
128  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
129                            filt_hz1, filt_hz2, filt_hz3);
130  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
131                            filt_hz1, filt_hz2, filt_hz3);
132  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
133                            filt_hz1, filt_hz2, filt_hz3);
134
135  filt = LD_SH(filter_vert);
136  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
137
138  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
139  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
140  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
141
142  for (loop_cnt = (height >> 2); loop_cnt--;) {
143    LD_SB4(src, src_stride, src7, src8, src9, src10);
144    XORI_B4_128_SB(src7, src8, src9, src10);
145    src += (4 * src_stride);
146
147    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
148
149    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
150                              filt_hz1, filt_hz2, filt_hz3);
151    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
152    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
153                               filt_vt2, filt_vt3);
154
155    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
156                              filt_hz1, filt_hz2, filt_hz3);
157    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
158    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
159                               filt_vt2, filt_vt3);
160
161    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
162                              filt_hz1, filt_hz2, filt_hz3);
163    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
164    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
165                               filt_vt2, filt_vt3);
166
167    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
168                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
169    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
170    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
171                               filt_vt2, filt_vt3);
172
173    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
174    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
175    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst,
176                            dst_stride);
177    dst += (4 * dst_stride);
178
179    hz_out6 = hz_out10;
180    out0 = out2;
181    out1 = out3;
182    out2 = out8;
183    out4 = out6;
184    out5 = out7;
185    out6 = out9;
186  }
187}
188
/*
 * 16-pixel-wide 8-tap/8-tap filter-and-average: runs the 8-wide kernel on
 * two adjacent 8-column strips (column offsets 0 and 8).
 */
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
200
/*
 * 32-pixel-wide 8-tap/8-tap filter-and-average: runs the 8-wide kernel on
 * four adjacent 8-column strips (column offsets 0, 8, 16 and 24).
 */
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
212
/*
 * 64-pixel-wide 8-tap/8-tap filter-and-average: runs the 8-wide kernel on
 * eight adjacent 8-column strips (column offsets 0, 8, ..., 56).
 */
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
224
225static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
226    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
227    int8_t *filter_horiz, int8_t *filter_vert) {
228  v16i8 src0, src1, src2, src3, src4, mask;
229  v16u8 filt_hz, filt_vt, vec0, vec1;
230  v16u8 dst0, dst1, dst2, dst3, res0, res1;
231  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
232
233  mask = LD_SB(&mc_filt_mask_arr[16]);
234
235  /* rearranging filter */
236  filt = LD_UH(filter_horiz);
237  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
238
239  filt = LD_UH(filter_vert);
240  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
241
242  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
243
244  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
245  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
246  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
247  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
248  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
249  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
250
251  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
252  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
253  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
254  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
255  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
256  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
257  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
258}
259
260static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
261    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
262    int8_t *filter_horiz, int8_t *filter_vert) {
263  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
264  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
265  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
266  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
267  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
268  v8i16 filt;
269
270  mask = LD_SB(&mc_filt_mask_arr[16]);
271
272  /* rearranging filter */
273  filt = LD_SH(filter_horiz);
274  filt_hz = (v16u8)__msa_splati_h(filt, 0);
275
276  filt = LD_SH(filter_vert);
277  filt_vt = (v16u8)__msa_splati_h(filt, 0);
278
279  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
280  src += (8 * src_stride);
281  src8 = LD_SB(src);
282
283  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
284  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
285  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
286  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
287  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
288  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
289             hz_out3, hz_out5, 8);
290  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
291
292  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
293  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
294             dst6);
295  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
296  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
297  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
298              tmp1, tmp2, tmp3);
299  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
300  PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2,
301              res3);
302  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
303              res3);
304  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
305  dst += (4 * dst_stride);
306  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
307}
308
/*
 * Height dispatcher for the 4-pixel-wide 2-tap/2-tap filter-and-average.
 * Only heights 4 and 8 have a kernel; any other height does nothing.
 */
static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  switch (height) {
    case 4:
      common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert);
      break;
    case 8:
      common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert);
      break;
    default:
      /* No kernel for this height: dst is left untouched. */
      break;
  }
}
320
321static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
322    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
323    int8_t *filter_horiz, int8_t *filter_vert) {
324  v16i8 src0, src1, src2, src3, src4, mask;
325  v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
326  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
327  v8i16 filt;
328
329  mask = LD_SB(&mc_filt_mask_arr[0]);
330
331  /* rearranging filter */
332  filt = LD_SH(filter_horiz);
333  filt_hz = (v16u8)__msa_splati_h(filt, 0);
334
335  filt = LD_SH(filter_vert);
336  filt_vt = (v16u8)__msa_splati_h(filt, 0);
337
338  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
339  src += (5 * src_stride);
340
341  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
342  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
343  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
344  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
345  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
346
347  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
348  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
349  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
350
351  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
352  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
353  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
354
355  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
356  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
357  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
358
359  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
360  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
361                     dst_stride);
362}
363
364static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
365    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
366    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
367  uint32_t loop_cnt;
368  v16i8 src0, src1, src2, src3, src4, mask;
369  v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
370  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
371  v8i16 filt;
372
373  mask = LD_SB(&mc_filt_mask_arr[0]);
374
375  /* rearranging filter */
376  filt = LD_SH(filter_horiz);
377  filt_hz = (v16u8)__msa_splati_h(filt, 0);
378
379  filt = LD_SH(filter_vert);
380  filt_vt = (v16u8)__msa_splati_h(filt, 0);
381
382  src0 = LD_SB(src);
383  src += src_stride;
384
385  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
386
387  for (loop_cnt = (height >> 2); loop_cnt--;) {
388    LD_SB4(src, src_stride, src1, src2, src3, src4);
389    src += (4 * src_stride);
390
391    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
392    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
393    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
394
395    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
396    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
397    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
398
399    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
400
401    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
402    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
403    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
404
405    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
406    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
407    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
408
409    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
410    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
411    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
412                       dst_stride);
413    dst += (4 * dst_stride);
414  }
415}
416
/*
 * Height dispatcher for the 8-pixel-wide 2-tap/2-tap filter-and-average:
 * height 4 uses the dedicated 8x4 kernel, every other height goes to the
 * looping kernel.
 */
static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  if (height != 4) {
    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
        src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
    return;
  }

  common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
                                         filter_horiz, filter_vert);
}
428
429static void common_hv_2ht_2vt_and_aver_dst_16w_msa(
430    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
431    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
432  uint32_t loop_cnt;
433  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
434  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
435  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
436  v8i16 filt;
437
438  mask = LD_SB(&mc_filt_mask_arr[0]);
439
440  /* rearranging filter */
441  filt = LD_SH(filter_horiz);
442  filt_hz = (v16u8)__msa_splati_h(filt, 0);
443
444  filt = LD_SH(filter_vert);
445  filt_vt = (v16u8)__msa_splati_h(filt, 0);
446
447  LD_SB2(src, 8, src0, src1);
448  src += src_stride;
449
450  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
451  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
452
453  for (loop_cnt = (height >> 2); loop_cnt--;) {
454    LD_SB4(src, src_stride, src0, src2, src4, src6);
455    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
456    src += (4 * src_stride);
457    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
458
459    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
460    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
461    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
462    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
463    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
464    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
465    dst += dst_stride;
466
467    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
468    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
469    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
470    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
471    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
472    PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
473    dst += dst_stride;
474
475    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
476    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
477    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
478    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
479    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
480    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
481    dst += dst_stride;
482
483    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
484    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
485    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
486    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
487    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
488    PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
489    dst += dst_stride;
490  }
491}
492
/*
 * 32-pixel-wide 2-tap/2-tap filter-and-average: runs the 16-wide kernel on
 * two adjacent 16-column strips (column offsets 0 and 16).
 */
static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride, dst + col,
                                           dst_stride, filter_horiz,
                                           filter_vert, height);
  }
}
504
/*
 * 64-pixel-wide 2-tap/2-tap filter-and-average: runs the 16-wide kernel on
 * four adjacent 16-column strips (column offsets 0, 16, 32 and 48).
 */
static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride, dst + col,
                                           dst_stride, filter_horiz,
                                           filter_vert, height);
  }
}
516
/*
 * MSA entry point for the 2-D "convolve and average into dst" operation.
 *
 *   src, src_stride   source pixels and row pitch.
 *   dst, dst_stride   destination pixels and row pitch (read and written).
 *   filter_x          eight int16 horizontal taps.
 *   x_step_q4         must be 16 (asserted); forwarded to the C fallback.
 *   filter_y          eight int16 vertical taps.
 *   y_step_q4         must be 16 (asserted); forwarded to the C fallback.
 *   w, h              block width and height.
 *
 * Dispatch:
 *   - both filters have taps 0 and 1 equal to zero: the "2ht_2vt" kernels
 *     are used, fed with the taps starting at index 3;
 *   - exactly one filter has taps 0 and 1 equal to zero: C fallback;
 *   - otherwise: the "8ht_8vt" kernels are used with all eight taps.
 * Widths other than 4, 8, 16, 32 and 64 always use the C fallback.
 */
void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  /* The kernels load a whole v8i16 vector (16 bytes for a 128-bit MSA
   * register) from the pointer they are given, which is either the start of
   * the buffer or element 3. The buffers are therefore sized 3 + 16 and
   * zero-filled so that those loads stay inside the arrays; only the first
   * eight entries carry taps. */
  int8_t filt_hor[3 + 16] = { 0 };
  int8_t filt_ver[3 + 16] = { 0 };
  /* Same test as comparing the first 32 bits of each filter with zero, but
   * done on the int16 elements so no int16 object is read through an int32
   * pointer. */
  const int x_lead_zero = (filter_x[0] == 0 && filter_x[1] == 0);
  const int y_lead_zero = (filter_y[0] == 0 && filter_y[1] == 0);
  int cnt;

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  /* Reject filters with taps[2] == 0 and taps[3] == 128 (128 does not fit in
   * the int8 tap buffers). Written per element so the check does not depend
   * on byte order. */
  assert(!(filter_x[2] == 0 && filter_x[3] == 128));
  assert(!(filter_y[2] == 0 && filter_y[3] == 128));

  /* Narrow the taps to int8 for the vector kernels. */
  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = (int8_t)filter_x[cnt];
    filt_ver[cnt] = (int8_t)filter_y[cnt];
  }

  if (x_lead_zero && y_lead_zero) {
    switch (w) {
      case 4:
        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, &filt_hor[3],
                                              &filt_ver[3], h);
        break;
      case 8:
        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, &filt_hor[3],
                                              &filt_ver[3], h);
        break;
      case 16:
        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 32:
        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 64:
        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      default:
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
                            x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  } else if (x_lead_zero || y_lead_zero) {
    /* Mixed case: there is no MSA kernel for it in this file. */
    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
  } else {
    switch (w) {
      case 4:
        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, filt_hor,
                                              filt_ver, h);
        break;
      case 8:
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, filt_hor,
                                              filt_ver, h);
        break;
      case 16:
        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride, filt_hor,
                                               filt_ver, h);
        break;
      case 32:
        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride, filt_hor,
                                               filt_ver, h);
        break;
      case 64:
        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride, filt_hor,
                                               filt_ver, h);
        break;
      default:
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
                            x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }
}
605