vpx_convolve8_avg_msa.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include "./vpx_dsp_rtcd.h"
13#include "vpx_dsp/mips/vpx_convolve_msa.h"
14
15static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
16                                                  int32_t src_stride,
17                                                  uint8_t *dst,
18                                                  int32_t dst_stride,
19                                                  int8_t *filter_horiz,
20                                                  int8_t *filter_vert,
21                                                  int32_t height) {
22  uint32_t loop_cnt;
23  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
24  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
25  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
26  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
27  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
28  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
29
30  mask0 = LD_UB(&mc_filt_mask_arr[16]);
31  src -= (3 + 3 * src_stride);
32
33  /* rearranging filter */
34  filt = LD_SH(filter_horiz);
35  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
36
37  mask1 = mask0 + 2;
38  mask2 = mask0 + 4;
39  mask3 = mask0 + 6;
40
41  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
42  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
43  src += (7 * src_stride);
44
45  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
46                            filt_hz1, filt_hz2, filt_hz3);
47  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
48                            filt_hz1, filt_hz2, filt_hz3);
49  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
50                            filt_hz1, filt_hz2, filt_hz3);
51  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
52                            filt_hz1, filt_hz2, filt_hz3);
53  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
54
55  filt = LD_SH(filter_vert);
56  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
57
58  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
59  vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
60
61  for (loop_cnt = (height >> 2); loop_cnt--;) {
62    LD_SB4(src, src_stride, src7, src8, src9, src10);
63    XORI_B4_128_SB(src7, src8, src9, src10);
64    src += (4 * src_stride);
65
66    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
67    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
68                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
69    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
70    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
71    res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
72                               filt_vt2, filt_vt3);
73
74    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
75                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
76    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
77    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
78    res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
79                               filt_vt2, filt_vt3);
80    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
81
82    SRARI_H2_SH(res0, res1, FILTER_BITS);
83    SAT_SH2_SH(res0, res1, 7);
84    PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
85    XORI_B2_128_UB(tmp0, tmp1);
86    AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
87    ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
88    dst += (4 * dst_stride);
89
90    hz_out5 = hz_out9;
91    vec0 = vec2;
92    vec1 = vec3;
93    vec2 = vec4;
94  }
95}
96
97static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
98                                                  int32_t src_stride,
99                                                  uint8_t *dst,
100                                                  int32_t dst_stride,
101                                                  int8_t *filter_horiz,
102                                                  int8_t *filter_vert,
103                                                  int32_t height) {
104  uint32_t loop_cnt;
105  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
106  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
107  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
108  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
109  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
110  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
111  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
112
113  mask0 = LD_UB(&mc_filt_mask_arr[0]);
114  src -= (3 + 3 * src_stride);
115
116  /* rearranging filter */
117  filt = LD_SH(filter_horiz);
118  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
119
120  mask1 = mask0 + 2;
121  mask2 = mask0 + 4;
122  mask3 = mask0 + 6;
123
124  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
125  src += (7 * src_stride);
126
127  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
128  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
129                            filt_hz1, filt_hz2, filt_hz3);
130  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
131                            filt_hz1, filt_hz2, filt_hz3);
132  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
133                            filt_hz1, filt_hz2, filt_hz3);
134  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
135                            filt_hz1, filt_hz2, filt_hz3);
136  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
137                            filt_hz1, filt_hz2, filt_hz3);
138  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
139                            filt_hz1, filt_hz2, filt_hz3);
140  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
141                            filt_hz1, filt_hz2, filt_hz3);
142
143  filt = LD_SH(filter_vert);
144  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
145
146  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
147  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
148  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
149
150  for (loop_cnt = (height >> 2); loop_cnt--;) {
151    LD_SB4(src, src_stride, src7, src8, src9, src10);
152    XORI_B4_128_SB(src7, src8, src9, src10);
153    src += (4 * src_stride);
154
155    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
156
157    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
158                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
159    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
160    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
161                               filt_vt2, filt_vt3);
162
163    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
164                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
165    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
166    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
167                               filt_vt2, filt_vt3);
168
169    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
170                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
171    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
172    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
173                               filt_vt2, filt_vt3);
174
175    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
176                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
177    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
178    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
179                               filt_vt2, filt_vt3);
180
181    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
182    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
183    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3,
184                            dst, dst_stride);
185    dst += (4 * dst_stride);
186
187    hz_out6 = hz_out10;
188    out0 = out2;
189    out1 = out3;
190    out2 = out8;
191    out4 = out6;
192    out5 = out7;
193    out6 = out9;
194  }
195}
196
/* 16-wide variant of the 8-tap/8-tap averaging filter: runs the 8-wide
 * routine on two side-by-side 8-pixel columns (offsets 0 and 8).
 * All arguments are forwarded unchanged apart from the column offset.
 */
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
212
/* 32-wide variant of the 8-tap/8-tap averaging filter: runs the 8-wide
 * routine on four side-by-side 8-pixel columns (offsets 0, 8, 16, 24).
 * All arguments are forwarded unchanged apart from the column offset.
 */
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
228
/* 64-wide variant of the 8-tap/8-tap averaging filter: runs the 8-wide
 * routine on eight side-by-side 8-pixel columns (offsets 0, 8, ..., 56).
 * All arguments are forwarded unchanged apart from the column offset.
 */
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
244
245static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
246                                                   int32_t src_stride,
247                                                   uint8_t *dst,
248                                                   int32_t dst_stride,
249                                                   int8_t *filter_horiz,
250                                                   int8_t *filter_vert) {
251  v16i8 src0, src1, src2, src3, src4, mask;
252  v16u8 filt_hz, filt_vt, vec0, vec1;
253  v16u8 dst0, dst1, dst2, dst3, res0, res1;
254  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
255
256  mask = LD_SB(&mc_filt_mask_arr[16]);
257
258  /* rearranging filter */
259  filt = LD_UH(filter_horiz);
260  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
261
262  filt = LD_UH(filter_vert);
263  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
264
265  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
266
267  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
268  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
269  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
270  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
271  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
272  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
273
274  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
275  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
276  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
277  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
278  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
279  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
280  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
281}
282
283static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
284                                                   int32_t src_stride,
285                                                   uint8_t *dst,
286                                                   int32_t dst_stride,
287                                                   int8_t *filter_horiz,
288                                                   int8_t *filter_vert) {
289  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
290  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
291  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
292  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
293  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
294  v8i16 filt;
295
296  mask = LD_SB(&mc_filt_mask_arr[16]);
297
298  /* rearranging filter */
299  filt = LD_SH(filter_horiz);
300  filt_hz = (v16u8)__msa_splati_h(filt, 0);
301
302  filt = LD_SH(filter_vert);
303  filt_vt = (v16u8)__msa_splati_h(filt, 0);
304
305  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
306  src += (8 * src_stride);
307  src8 = LD_SB(src);
308
309  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
310  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
311  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
312  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
313  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
314  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
315             hz_out3, hz_out5, 8);
316  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
317
318  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
319  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
320             dst4, dst6);
321  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
322  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
323  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
324              tmp0, tmp1, tmp2, tmp3);
325  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
326  PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
327              res2, res3);
328  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
329              res2, res3);
330  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
331  dst += (4 * dst_stride);
332  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
333}
334
/* 4-wide 2-tap/2-tap averaging filter: picks the fixed-size kernel matching
 * height. Only heights 4 and 8 are handled; any other height leaves dst
 * untouched.
 */
static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
  switch (height) {
    case 4:
      common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert);
      break;
    case 8:
      common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert);
      break;
    default:
      /* No kernel for this height; nothing is written. */
      break;
  }
}
350
351static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
352                                                   int32_t src_stride,
353                                                   uint8_t *dst,
354                                                   int32_t dst_stride,
355                                                   int8_t *filter_horiz,
356                                                   int8_t *filter_vert) {
357  v16i8 src0, src1, src2, src3, src4, mask;
358  v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
359  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
360  v8i16 filt;
361
362  mask = LD_SB(&mc_filt_mask_arr[0]);
363
364  /* rearranging filter */
365  filt = LD_SH(filter_horiz);
366  filt_hz = (v16u8)__msa_splati_h(filt, 0);
367
368  filt = LD_SH(filter_vert);
369  filt_vt = (v16u8)__msa_splati_h(filt, 0);
370
371  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
372  src += (5 * src_stride);
373
374  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
375  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
376  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
377  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
378  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
379
380  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
381  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
382  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
383
384  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
385  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
386  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
387
388  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
389  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
390  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
391
392  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
393  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
394                     dst, dst_stride);
395}
396
397static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
398                                                       int32_t src_stride,
399                                                       uint8_t *dst,
400                                                       int32_t dst_stride,
401                                                       int8_t *filter_horiz,
402                                                       int8_t *filter_vert,
403                                                       int32_t height) {
404  uint32_t loop_cnt;
405  v16i8 src0, src1, src2, src3, src4, mask;
406  v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
407  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
408  v8i16 filt;
409
410  mask = LD_SB(&mc_filt_mask_arr[0]);
411
412  /* rearranging filter */
413  filt = LD_SH(filter_horiz);
414  filt_hz = (v16u8)__msa_splati_h(filt, 0);
415
416  filt = LD_SH(filter_vert);
417  filt_vt = (v16u8)__msa_splati_h(filt, 0);
418
419  src0 = LD_SB(src);
420  src += src_stride;
421
422  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
423
424  for (loop_cnt = (height >> 2); loop_cnt--;) {
425    LD_SB4(src, src_stride, src1, src2, src3, src4);
426    src += (4 * src_stride);
427
428    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
429    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
430    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
431
432    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
433    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
434    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
435
436    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
437
438    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
439    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
440    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
441
442    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
443    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
444    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
445
446    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
447    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
448    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
449                       dst, dst_stride);
450    dst += (4 * dst_stride);
451  }
452}
453
/* 8-wide 2-tap/2-tap averaging filter: height 4 goes to the dedicated 8x4
 * kernel, every other height goes to the looping kernel.
 */
static void common_hv_2ht_2vt_and_aver_dst_8w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
  if (height != 4) {
    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
                                               filter_horiz, filter_vert,
                                               height);
    return;
  }

  common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
                                         filter_horiz, filter_vert);
}
470
471static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
472                                                   int32_t src_stride,
473                                                   uint8_t *dst,
474                                                   int32_t dst_stride,
475                                                   int8_t *filter_horiz,
476                                                   int8_t *filter_vert,
477                                                   int32_t height) {
478  uint32_t loop_cnt;
479  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
480  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
481  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
482  v8i16 filt;
483
484  mask = LD_SB(&mc_filt_mask_arr[0]);
485
486  /* rearranging filter */
487  filt = LD_SH(filter_horiz);
488  filt_hz = (v16u8)__msa_splati_h(filt, 0);
489
490  filt = LD_SH(filter_vert);
491  filt_vt = (v16u8)__msa_splati_h(filt, 0);
492
493  LD_SB2(src, 8, src0, src1);
494  src += src_stride;
495
496  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
497  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
498
499  for (loop_cnt = (height >> 2); loop_cnt--;) {
500    LD_SB4(src, src_stride, src0, src2, src4, src6);
501    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
502    src += (4 * src_stride);
503    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
504
505    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
506    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
507    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
508    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
509    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
510    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
511    dst += dst_stride;
512
513    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
514    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
515    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
516    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
517    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
518    PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
519    dst += dst_stride;
520
521    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
522    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
523    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
524    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
525    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
526    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
527    dst += dst_stride;
528
529    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
530    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
531    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
532    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
533    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
534    PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
535    dst += dst_stride;
536  }
537}
538
/* 32-wide variant of the 2-tap/2-tap averaging filter: runs the 16-wide
 * routine on two side-by-side 16-pixel columns (offsets 0 and 16).
 * All arguments are forwarded unchanged apart from the column offset.
 */
static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride, dst + col,
                                           dst_stride, filter_horiz,
                                           filter_vert, height);
  }
}
554
/* 64-wide variant of the 2-tap/2-tap averaging filter: runs the 16-wide
 * routine on four side-by-side 16-pixel columns (offsets 0, 16, 32, 48).
 * All arguments are forwarded unchanged apart from the column offset.
 */
static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride, dst + col,
                                           dst_stride, filter_horiz,
                                           filter_vert, height);
  }
}
570
/* MSA entry point for the 2-D convolve-and-average operation.
 *
 * src, src_stride     source pixels and row pitch.
 * dst, dst_stride     destination pixels and row pitch (read and written).
 * filter_x, filter_y  8-entry 16-bit tap tables for the two directions.
 * x_step_q4/y_step_q4 must both be 16 (asserted); scaling is not handled.
 * w, h                block size; widths 4, 8, 16, 32 and 64 have MSA paths.
 *
 * Dispatch:
 *  - taps 0 and 1 zero in BOTH filters -> 2-tap MSA kernels, fed the narrowed
 *    taps starting at index 3;
 *  - taps 0 and 1 zero in exactly one filter -> vpx_convolve8_avg_c();
 *  - otherwise -> 8-tap MSA kernels.
 * Unsupported widths also fall back to vpx_convolve8_avg_c().
 *
 * The taps are inspected element by element instead of through an int32_t
 * view of the int16_t tables, so the checks do not depend on byte order and
 * do not alias the tables through an incompatible type.
 */
void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h) {
  /* The kernels load a whole v8i16 from the pointer they are given, which is
   * either &filt[0] or &filt[3]. Assuming that vector is 16 bytes wide (TODO
   * confirm against the MSA type definitions), 3 + 16 bytes keep the load
   * inside the array; the padding stays zero and its lanes are never used. */
  int8_t filt_hor[3 + 16] = { 0 };
  int8_t filt_ver[3 + 16] = { 0 };
  const int x_is_2tap = (filter_x[0] == 0 && filter_x[1] == 0);
  const int y_is_2tap = (filter_y[0] == 0 && filter_y[1] == 0);
  int tap;

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  /* The pass-through kernel (tap 2 == 0, tap 3 == 128) must not reach this
   * function; 128 would not survive the narrowing to int8_t below. */
  assert(!(filter_x[2] == 0 && filter_x[3] == 128));
  assert(!(filter_y[2] == 0 && filter_y[3] == 128));

  /* Narrow the taps to the 8-bit form the kernels take. */
  for (tap = 0; tap < 8; ++tap) {
    filt_hor[tap] = (int8_t)filter_x[tap];
    filt_ver[tap] = (int8_t)filter_y[tap];
  }

  if (x_is_2tap != y_is_2tap) {
    /* Mixed 2-tap/8-tap combination: no MSA kernel, use the C version. */
    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4, filter_y, y_step_q4,
                        w, h);
    return;
  }

  if (x_is_2tap) {
    switch (w) {
      case 4:
        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              &filt_hor[3], &filt_ver[3], h);
        break;
      case 8:
        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              &filt_hor[3], &filt_ver[3], h);
        break;
      case 16:
        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 32:
        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 64:
        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      default:
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4, filter_y, y_step_q4,
                            w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              filt_hor, filt_ver, h);
        break;
      case 8:
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              filt_hor, filt_ver, h);
        break;
      case 16:
        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               filt_hor, filt_ver, h);
        break;
      case 32:
        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               filt_hor, filt_ver, h);
        break;
      case 64:
        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               filt_hor, filt_ver, h);
        break;
      default:
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4, filter_y, y_step_q4,
                            w, h);
        break;
    }
  }
}
662