1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include "./vpx_dsp_rtcd.h"
13#include "vpx_dsp/mips/vpx_convolve_msa.h"
14
/*
 * 8-tap horizontal filter for one 4x4 block, averaged with the pixels that
 * are already stored in dst.
 *
 * src, src_stride: source rows; four rows are loaded, starting 3 bytes to the
 *                  left of the incoming pointer.
 * dst, dst_stride: destination rows; they are read, averaged with the
 *                  filtered result and written back.
 * filter:          filter taps as int8_t; halfword lanes 0..3 of the vector
 *                  loaded from this pointer are used.
 */
static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  uint32_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst0 = { 0 }, res;
  v16u8 mask0, mask1, mask2, mask3;
  v8i16 filt, res0, res1;

  /* Base mask comes from offset 16 of mc_filt_mask_arr (table is not defined
   * in this part of the file). */
  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  /* The other three masks are the base mask with 2, 4 and 6 added. */
  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  /* NOTE(review): XORI_*_128 here and PCKEV_XORI128 below look like a bias to
   * signed bytes for the filter math and back again -- confirm in
   * macros_msa.h. */
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, res0, res1);
  /* Current contents of the four destination rows, gathered into dst0. */
  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
  /* NOTE(review): presumably a rounding shift by FILTER_BITS followed by
   * saturation before narrowing to bytes -- confirm macro semantics. */
  SRARI_H2_SH(res0, res1, FILTER_BITS);
  SAT_SH2_SH(res0, res1, 7);
  res = PCKEV_XORI128_UB(res0, res1);
  /* Unsigned byte average of the filtered pixels and the existing dst. */
  res = (v16u8)__msa_aver_u_b(res, dst0);
  ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
}
48
/*
 * 8-tap horizontal filter for one 4x8 block, averaged with the pixels that
 * are already stored in dst.
 *
 * src, src_stride: source rows; eight rows are loaded in two groups of four,
 *                  starting 3 bytes to the left of the incoming pointer.
 * dst, dst_stride: destination rows; they are read, averaged with the
 *                  filtered result and written back.
 * filter:          filter taps as int8_t; halfword lanes 0..3 of the vector
 *                  loaded from this pointer are used.
 */
static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  uint32_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
  v16u8 dst0 = { 0 }, dst1 = { 0 };
  v8i16 filt, vec0, vec1, vec2, vec3;

  /* Base mask comes from offset 16 of mc_filt_mask_arr (table is not defined
   * in this part of the file). */
  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  /* The other three masks are the base mask with 2, 4 and 6 added. */
  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* Source rows 0..3. */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  src += (4 * src_stride);
  /* Existing destination rows 0..3 go to dst0, rows 4..7 to dst1. */
  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, vec0, vec1);
  /* Source rows 4..7. */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, vec2, vec3);
  /* NOTE(review): presumably a rounding shift by FILTER_BITS followed by
   * saturation before narrowing to bytes -- confirm macro semantics. */
  SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
  SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
  PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2,
              res3);
  /* res0 and res2 end up holding the packed results for the two row groups. */
  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
  XORI_B2_128_UB(res0, res2);
  /* Average with the existing destination pixels, then store all 8 rows. */
  AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
  ST4x8_UB(res0, res2, dst, dst_stride);
}
92
/*
 * 8-tap horizontal filter + average-into-dst for 4-pixel-wide blocks.
 *
 * Walks the block top to bottom, handing eight rows at a time to the 4x8
 * kernel and a final group of four rows to the 4x4 kernel, so heights of
 * 4, 8, 12, 16, ... are all covered.  height is expected to be a multiple of
 * four; a trailing remainder of one to three rows is not processed.
 *
 * src, src_stride: source pixels and row pitch.
 * dst, dst_stride: destination pixels (read-modify-write) and row pitch.
 * filter:          int8_t filter taps, forwarded unchanged to the kernels.
 * height:          number of rows to process.
 */
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  /* Rows are filtered independently, so the block can be split vertically. */
  for (; height >= 8; height -= 8) {
    common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
    src += (8 * src_stride);
    dst += (8 * dst_stride);
  }

  if (height >= 4) {
    common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
  }
}
103
/*
 * 8-tap horizontal filter for 8-pixel-wide blocks, averaged with the pixels
 * that are already stored in dst.
 *
 * src, src_stride: source rows; loading starts 3 bytes to the left of the
 *                  incoming pointer.
 * dst, dst_stride: destination rows; read, averaged with the filtered result
 *                  and written back.
 * filter:          filter taps as int8_t; halfword lanes 0..3 of the vector
 *                  loaded from this pointer are used.
 * height:          rows to process; the loop runs height >> 2 times and
 *                  handles four rows per iteration.
 */
static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  int32_t loop_cnt;
  int64_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, dst0 = { 0 }, dst1 = { 0 };
  v8i16 filt, out0, out1, out2, out3;

  /* Base mask comes from offset 0 of mc_filt_mask_arr (table is not defined
   * in this part of the file). */
  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  /* The other three masks are the base mask with 2, 4 and 6 added. */
  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    /* Existing destination: four 64-bit rows packed two per vector. */
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    /* NOTE(review): presumably a rounding shift by FILTER_BITS followed by
     * saturation before narrowing to bytes -- confirm macro semantics. */
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    /* Going by its name and operands, this converts to bytes, averages with
     * dst0/dst1 and stores an 8x4 tile -- confirm in vpx_convolve_msa.h. */
    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst,
                            dst_stride);
    dst += (4 * dst_stride);
  }
}
142
/*
 * 8-tap horizontal filter for 16-pixel-wide blocks, averaged with the pixels
 * that are already stored in dst.
 *
 * src, src_stride: source rows; loading starts 3 bytes to the left of the
 *                  incoming pointer.
 * dst, dst_stride: destination rows; read, averaged with the filtered result
 *                  and written back.
 * filter:          filter taps as int8_t; halfword lanes 0..3 of the vector
 *                  loaded from this pointer are used.
 * height:          rows to process; the loop runs height >> 1 times and
 *                  handles two rows per iteration.
 */
static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  /* Base mask comes from offset 0 of mc_filt_mask_arr (table is not defined
   * in this part of the file). */
  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  /* The other three masks are the base mask with 2, 4 and 6 added. */
  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height >> 1; loop_cnt--;) {
    /* Each of the two rows is covered by a vector loaded at src and another
     * loaded at src + 8: src0/src1 are row 0, src2/src3 are row 1. */
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 8, src_stride, src1, src3);
    src += (2 * src_stride);

    XORI_B4_128_SB(src0, src1, src2, src3);
    /* One shuffle per mask for every source vector. */
    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
               vec14);
    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
               vec15);
    /* Two partial sums per vector, judging by the operand lists: the
     * filt0/filt1 products accumulate in vec0..3 and the filt2/filt3 products
     * in vec8..11; ADDS then combines the pairs into out0..3. */
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                vec9, vec10, vec11);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
                 vec2, vec3);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
                 vec9, vec10, vec11);
    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                out2, out3);
    /* Existing destination rows for the average. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    /* NOTE(review): presumably a rounding shift by FILTER_BITS followed by
     * saturation before narrowing to bytes -- confirm macro semantics. */
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    /* Going by the macro name: pack, undo the 128 bias, average with the dst
     * vector and store one 16-byte row -- confirm in macros_msa.h. */
    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
    dst += dst_stride;
    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
    dst += dst_stride;
  }
}
196
/*
 * 8-tap horizontal filter for 32-pixel-wide blocks, averaged with the pixels
 * that are already stored in dst.
 *
 * src, src_stride: source rows; loading starts 3 bytes to the left of the
 *                  incoming pointer.
 * dst, dst_stride: destination rows; read, averaged with the filtered result
 *                  and written back.
 * filter:          filter taps as int8_t; halfword lanes 0..3 of the vector
 *                  loaded from this pointer are used.
 * height:          rows to process; one row per loop iteration.
 */
static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  /* Base mask comes from offset 0 of mc_filt_mask_arr (table is not defined
   * in this part of the file). */
  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  /* The other three masks are the base mask with 2, 4 and 6 added. */
  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    /* Three loads at src, src + 16 and src + 24.  src1 is derived from src0
     * and src2 with a byte slide of 8 instead of a fourth load
     * (NOTE(review): this should correspond to the data at src + 8 --
     * confirm the operand order of __msa_sldi_b). */
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    /* One shuffle per mask for every source vector. */
    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
               vec14);
    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
               vec15);
    /* Two partial sums per vector, judging by the operand lists: the
     * filt0/filt1 products accumulate in vec0..3 and the filt2/filt3 products
     * in vec8..11; ADDS then combines the pairs into out0..3. */
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                vec9, vec10, vec11);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
                 vec2, vec3);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
                 vec9, vec10, vec11);
    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                out2, out3);
    /* NOTE(review): presumably a rounding shift by FILTER_BITS followed by
     * saturation before narrowing to bytes -- confirm macro semantics. */
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    /* Existing destination row as two 16-byte halves (dst and dst + 16). */
    LD_UB2(dst, 16, dst1, dst2);
    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
    dst += dst_stride;
  }
}
251
/*
 * 8-tap horizontal filter for 64-pixel-wide blocks, averaged with the pixels
 * that are already stored in dst.
 *
 * src, src_stride: source rows; loading starts 3 bytes to the left of the
 *                  incoming pointer.
 * dst, dst_stride: destination rows; read, averaged with the filtered result
 *                  and written back.
 * filter:          filter taps as int8_t; halfword lanes 0..3 of the vector
 *                  loaded from this pointer are used.
 * height:          rows to process; one row per outer-loop iteration, each
 *                  row handled as two 32-pixel halves by the inner loop.
 */
static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt, cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  /* Base mask comes from offset 0 of mc_filt_mask_arr (table is not defined
   * in this part of the file). */
  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  /* The other three masks are the base mask with 2, 4 and 6 added. */
  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    /* cnt << 5 is the byte offset of the current half: 0, then 32. */
    for (cnt = 0; cnt < 2; ++cnt) {
      /* Three loads at offsets 0, 16 and 24 within the half; src1 is derived
       * from src0 and src2 with a byte slide of 8 (NOTE(review): this should
       * correspond to the data at offset 8 -- confirm the operand order of
       * __msa_sldi_b). */
      src0 = LD_SB(&src[cnt << 5]);
      src2 = LD_SB(&src[16 + (cnt << 5)]);
      src3 = LD_SB(&src[24 + (cnt << 5)]);
      src1 = __msa_sldi_b(src2, src0, 8);

      XORI_B4_128_SB(src0, src1, src2, src3);
      /* One shuffle per mask for every source vector. */
      VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
                 vec12);
      VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
                 vec13);
      VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
                 vec14);
      VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
                 vec15);
      /* Two partial sums per vector, judging by the operand lists: the
       * filt0/filt1 products accumulate in vec0..3 and the filt2/filt3
       * products in vec8..11; ADDS then combines the pairs into out0..3. */
      DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                  vec1, vec2, vec3);
      DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                  vec9, vec10, vec11);
      DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
                   vec1, vec2, vec3);
      DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
                   vec9, vec10, vec11);
      ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                  out2, out3);
      /* NOTE(review): presumably a rounding shift by FILTER_BITS followed by
       * saturation before narrowing to bytes -- confirm macro semantics. */
      SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
      SAT_SH4_SH(out0, out1, out2, out3, 7);
      /* Existing destination pixels of this half, as two 16-byte vectors. */
      LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
      PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
      PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
    }

    src += src_stride;
    dst += dst_stride;
  }
}
311
/*
 * 2-tap horizontal filter for one 4x4 block, averaged with the pixels that
 * are already stored in dst.
 *
 * src, src_stride: source rows; four rows are loaded from the incoming
 *                  pointer (no left offset is applied here).
 * dst, dst_stride: destination rows; read, averaged with the filtered result
 *                  and written back.
 * filter:          pointer to the taps; halfword lane 0 of the vector loaded
 *                  from it (the first two int8_t values) is replicated into
 *                  filt0.
 */
static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  uint32_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0 = { 0 }, vec0, vec1, res;
  v8u16 vec2, vec3, filt;

  /* Mask comes from offset 16 of mc_filt_mask_arr (table is not defined in
   * this part of the file). */
  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  /* Current contents of the four destination rows, gathered into dst0. */
  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
  /* Rows are shuffled in pairs (src0 with src1, src2 with src3) before the
   * unsigned dot product with the taps. */
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  /* NOTE(review): presumably a rounding shift by FILTER_BITS -- confirm
   * macro semantics. */
  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
  /* Narrow to bytes by keeping the even bytes of each halfword result. */
  res = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
  /* Unsigned byte average of the filtered pixels and the existing dst. */
  res = (v16u8)__msa_aver_u_b(res, dst0);
  ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
}
337
/*
 * 2-tap horizontal filter for one 4x8 block, averaged with the pixels that
 * are already stored in dst.
 *
 * src, src_stride: source rows; eight rows are loaded from the incoming
 *                  pointer (no left offset is applied here).
 * dst, dst_stride: destination rows; read, averaged with the filtered result
 *                  and written back.
 * filter:          pointer to the taps; halfword lane 0 of the vector loaded
 *                  from it (the first two int8_t values) is replicated into
 *                  filt0.
 */
static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  uint32_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
  v16u8 dst0 = { 0 }, dst1 = { 0 };
  v8u16 vec4, vec5, vec6, vec7, filt;

  /* Mask comes from offset 16 of mc_filt_mask_arr (table is not defined in
   * this part of the file). */
  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  /* Existing destination rows 0..3 go to dst0, rows 4..7 to dst1. */
  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
  /* Rows are shuffled in pairs before the unsigned dot product with the
   * taps. */
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  /* NOTE(review): presumably a rounding shift by FILTER_BITS -- confirm
   * macro semantics. */
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  /* res0 and res2 end up holding the packed results for the two row groups. */
  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
  /* Average with the existing destination pixels, then store all 8 rows. */
  AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
  ST4x8_UB(res0, res2, dst, dst_stride);
}
370
/*
 * 2-tap horizontal filter + average-into-dst for 4-pixel-wide blocks.
 *
 * Walks the block top to bottom, handing eight rows at a time to the 4x8
 * kernel and a final group of four rows to the 4x4 kernel, so heights of
 * 4, 8, 12, 16, ... are all covered.  height is expected to be a multiple of
 * four; a trailing remainder of one to three rows is not processed.
 *
 * src, src_stride: source pixels and row pitch.
 * dst, dst_stride: destination pixels (read-modify-write) and row pitch.
 * filter:          pointer to the two int8_t taps, forwarded unchanged.
 * height:          number of rows to process.
 */
static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  /* Rows are filtered independently, so the block can be split vertically. */
  for (; height >= 8; height -= 8) {
    common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
    src += (8 * src_stride);
    dst += (8 * dst_stride);
  }

  if (height >= 4) {
    common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
  }
}
381
/*
 * 2-tap horizontal filter for one 8x4 block, averaged with the pixels that
 * are already stored in dst.
 *
 * src, src_stride: source rows; four rows are loaded from the incoming
 *                  pointer (no left offset is applied here).
 * dst, dst_stride: destination rows; read, averaged with the filtered result
 *                  and written back.
 * filter:          pointer to the taps; halfword lane 0 of the vector loaded
 *                  from it (the first two int8_t values) is replicated into
 *                  filt0.
 */
static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  int64_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0 = { 0 }, dst1 = { 0 };
  v8u16 vec0, vec1, vec2, vec3, filt;

  /* Mask comes from offset 0 of mc_filt_mask_arr (table is not defined in
   * this part of the file). */
  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  /* Each row is shuffled against itself, one result vector per row. */
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  /* NOTE(review): presumably a rounding shift by FILTER_BITS -- confirm
   * macro semantics. */
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  /* Existing destination: four 64-bit rows packed two per vector. */
  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_D2_UB(tp0, tp1, dst0);
  INSERT_D2_UB(tp2, tp3, dst1);
  /* Going by its name and operands: pack to bytes, average with dst0/dst1
   * and store an 8x4 tile -- confirm in macros_msa.h. */
  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
}
408
409static void common_hz_2t_and_aver_dst_8x8mult_msa(
410    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
411    int8_t *filter, int32_t height) {
412  int64_t tp0, tp1, tp2, tp3;
413  v16i8 src0, src1, src2, src3, mask;
414  v16u8 filt0, dst0 = { 0 }, dst1 = { 0 };
415  v8u16 vec0, vec1, vec2, vec3, filt;
416
417  mask = LD_SB(&mc_filt_mask_arr[0]);
418
419  /* rearranging filter */
420  filt = LD_UH(filter);
421  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
422
423  LD_SB4(src, src_stride, src0, src1, src2, src3);
424  src += (4 * src_stride);
425  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
426  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
427  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
428              vec2, vec3);
429  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
430  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
431  INSERT_D2_UB(tp0, tp1, dst0);
432  INSERT_D2_UB(tp2, tp3, dst1);
433  LD_SB4(src, src_stride, src0, src1, src2, src3);
434  src += (4 * src_stride);
435  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
436  dst += (4 * dst_stride);
437
438  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
439  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
440  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
441              vec2, vec3);
442  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
443  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
444  INSERT_D2_UB(tp0, tp1, dst0);
445  INSERT_D2_UB(tp2, tp3, dst1);
446  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
447  dst += (4 * dst_stride);
448
449  if (16 == height) {
450    LD_SB4(src, src_stride, src0, src1, src2, src3);
451    src += (4 * src_stride);
452
453    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
454    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
455    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
456                vec2, vec3);
457    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
458    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
459    INSERT_D2_UB(tp0, tp1, dst0);
460    INSERT_D2_UB(tp2, tp3, dst1);
461    LD_SB4(src, src_stride, src0, src1, src2, src3);
462    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
463    dst += (4 * dst_stride);
464
465    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
466    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
467    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
468                vec2, vec3);
469    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
470    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
471    INSERT_D2_UB(tp0, tp1, dst0);
472    INSERT_D2_UB(tp2, tp3, dst1);
473    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
474  }
475}
476
/*
 * 2-tap horizontal filter + average-into-dst for 8-pixel-wide blocks.
 *
 * A height of exactly four rows goes to the dedicated 8x4 kernel; every other
 * height is handed, together with the height, to the 8x8mult kernel.
 */
static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (height != 4) {
    common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
                                          filter, height);
    return;
  }

  common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
}
488
489static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
490                                              int32_t src_stride, uint8_t *dst,
491                                              int32_t dst_stride,
492                                              int8_t *filter, int32_t height) {
493  uint32_t loop_cnt;
494  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
495  v16u8 filt0, dst0, dst1, dst2, dst3;
496  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
497  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
498
499  mask = LD_SB(&mc_filt_mask_arr[0]);
500
501  /* rearranging filter */
502  filt = LD_UH(filter);
503  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
504
505  LD_SB4(src, src_stride, src0, src2, src4, src6);
506  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
507  src += (4 * src_stride);
508
509  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
510  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
511  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
512  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
513  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
514              res2, res3);
515  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
516              res6, res7);
517  SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
518  SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
519  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
520  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
521  dst += dst_stride;
522  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
523  dst += dst_stride;
524  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
525  dst += dst_stride;
526  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
527  dst += dst_stride;
528
529  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
530    LD_SB4(src, src_stride, src0, src2, src4, src6);
531    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
532    src += (4 * src_stride);
533
534    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
535    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
536    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
537    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
538    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
539                res2, res3);
540    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
541                res6, res7);
542    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
543    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
544    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
545    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
546    dst += dst_stride;
547    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
548    dst += dst_stride;
549    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
550    dst += dst_stride;
551    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
552    dst += dst_stride;
553  }
554}
555
/*
 * 2-tap horizontal filter for 32-pixel-wide blocks, averaged with the pixels
 * that are already stored in dst.
 *
 * src, src_stride: source rows (no left offset is applied here).
 * dst, dst_stride: destination rows; read, averaged with the filtered result
 *                  and written back.
 * filter:          pointer to the taps; halfword lane 0 of the vector loaded
 *                  from it (the first two int8_t values) is replicated into
 *                  filt0.
 * height:          rows to process; the loop runs height >> 1 times and
 *                  handles two rows per iteration.
 */
static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

  /* Mask comes from offset 0 of mc_filt_mask_arr (table is not defined in
   * this part of the file). */
  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    /* First row: loads at src, src + 16 and src + 24; src1 is derived from
     * src0 and src2 with a byte slide of 8 (NOTE(review): this should
     * correspond to the data at src + 8 -- confirm the operand order of
     * __msa_sldi_b). */
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    /* Second row, same layout in src4..src7. */
    src4 = LD_SB(src);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src5 = __msa_sldi_b(src6, src4, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
                res2, res3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
                res6, res7);
    /* NOTE(review): presumably a rounding shift by FILTER_BITS -- confirm
     * macro semantics. */
    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
    /* Each destination row is read and written as two 16-byte halves. */
    LD_UB2(dst, 16, dst0, dst1);
    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
    PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
    dst += dst_stride;
    LD_UB2(dst, 16, dst2, dst3);
    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
    PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
    dst += dst_stride;
  }
}
604
/*
 * 2-tap horizontal filter for 64-pixel-wide blocks, averaged with the pixels
 * that are already stored in dst.
 *
 * src, src_stride: source rows (no left offset is applied here).
 * dst, dst_stride: destination rows; read, averaged with the filtered result
 *                  and written back.
 * filter:          pointer to the taps; halfword lane 0 of the vector loaded
 *                  from it (the first two int8_t values) is replicated into
 *                  filt0.
 * height:          rows to process; one row per loop iteration.
 */
static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  /* Mask comes from offset 0 of mc_filt_mask_arr (table is not defined in
   * this part of the file). */
  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height; loop_cnt--;) {
    /* Here the second argument of LD_SB4 is a 16-byte step along the row, so
     * src0/src2/src4/src6 come from src + 0/16/32/48; src7 is loaded at
     * src + 56. */
    LD_SB4(src, 16, src0, src2, src4, src6);
    src7 = LD_SB(src + 56);
    /* NOTE(review): src1/src3/src5 are produced by sliding neighbouring
     * vectors by 8 bytes, presumably yielding the data at src + 8/24/40 --
     * confirm SLDI_B3_SB in macros_msa.h. */
    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    /* NOTE(review): presumably a rounding shift by FILTER_BITS -- confirm
     * macro semantics. */
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    /* The destination row is read and written as four 16-byte quarters. */
    LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
    PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
    PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
    PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
    dst += dst_stride;
  }
}
645
646void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
647                                 uint8_t *dst, ptrdiff_t dst_stride,
648                                 const InterpKernel *filter, int x0_q4,
649                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
650                                 int h) {
651  const int16_t *const filter_x = filter[x0_q4];
652  int8_t cnt, filt_hor[8];
653
654  assert(x_step_q4 == 16);
655  assert(((const int32_t *)filter_x)[1] != 0x800000);
656
657  for (cnt = 0; cnt < 8; ++cnt) {
658    filt_hor[cnt] = filter_x[cnt];
659  }
660
661  if (((const int32_t *)filter_x)[0] == 0) {
662    switch (w) {
663      case 4:
664        common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
665                                         (int32_t)dst_stride, &filt_hor[3], h);
666        break;
667      case 8:
668        common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
669                                         (int32_t)dst_stride, &filt_hor[3], h);
670        break;
671      case 16:
672        common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
673                                          (int32_t)dst_stride, &filt_hor[3], h);
674        break;
675      case 32:
676        common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
677                                          (int32_t)dst_stride, &filt_hor[3], h);
678        break;
679      case 64:
680        common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
681                                          (int32_t)dst_stride, &filt_hor[3], h);
682        break;
683      default:
684        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
685                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
686        break;
687    }
688  } else {
689    switch (w) {
690      case 4:
691        common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
692                                         (int32_t)dst_stride, filt_hor, h);
693        break;
694      case 8:
695        common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
696                                         (int32_t)dst_stride, filt_hor, h);
697        break;
698      case 16:
699        common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
700                                          (int32_t)dst_stride, filt_hor, h);
701        break;
702      case 32:
703        common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
704                                          (int32_t)dst_stride, filt_hor, h);
705        break;
706      case 64:
707        common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
708                                          (int32_t)dst_stride, filt_hor, h);
709        break;
710      default:
711        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
712                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
713        break;
714    }
715  }
716}
717