1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "./vp8_rtcd.h"
12#include "vpx_ports/mem.h"
13#include "vp8/common/filter.h"
14#include "vp8/common/mips/msa/vp8_macros_msa.h"
15
/* VP8 bilinear filter tap pairs {f0, f1} for the 7 non-zero sub-pel
 * offsets; each pair sums to 128, so results are normalized by a
 * round-and-shift of VP8_FILTER_SHIFT. Indexed by (offset - 1). */
DECLARE_ALIGNED(16, static const int8_t, vp8_bilinear_filters_msa[7][2]) = {
  { 112, 16 }, { 96, 32 }, { 80, 48 }, { 64, 64 },
  { 48, 80 },  { 32, 96 }, { 16, 112 }
};
20
/* Byte-shuffle masks for the horizontal filters: each mask gathers
 * adjacent (x, x+1) pixel pairs for the 2-tap dot product. Indices
 * >= 16 select bytes from the second source vector, letting the
 * 4-wide masks pack two rows into one vector. */
static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
  /* 8 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  /* 4 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
  /* 4 width cases */
  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
29
/* Horizontal 2-tap bilinear filter for a 4x4 block.
 * filter points at a {f0, f1} pair from vp8_bilinear_filters_msa. */
static void common_hz_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  /* 4-width shuffle mask: packs pixel pairs from two rows per vector. */
  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  /* Broadcast the packed {f0, f1} byte pair to every halfword lane. */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  /* Gather adjacent-pixel pairs, two rows per output vector. */
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  /* p[x]*f0 + p[x+1]*f1 as 16-bit sums. */
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  /* Round and shift back to pixel range. */
  SRARI_H2_UH(vec2, vec3, VP8_FILTER_SHIFT);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  /* 4 bytes per row: rows 0-1 from res0, rows 2-3 from res1. */
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
49
/* Horizontal 2-tap bilinear filter for a 4x8 block (eight 4-pixel rows). */
static void common_hz_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16u8 vec0, vec1, vec2, vec3, filt0;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16i8 res0, res1, res2, res3;
  v8u16 vec4, vec5, vec6, vec7, filt;

  /* 4-width shuffle mask: packs pixel pairs from two rows per vector. */
  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  /* Broadcast the packed {f0, f1} byte pair to every halfword lane. */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  /* Gather adjacent-pixel pairs for rows 0-3 and 4-7. */
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  /* First four rows, then advance and store the remaining four. */
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}
75
76static void common_hz_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
77                                uint8_t *RESTRICT dst, int32_t dst_stride,
78                                const int8_t *filter, int32_t height) {
79  if (4 == height) {
80    common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
81  } else if (8 == height) {
82    common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
83  }
84}
85
/* Horizontal 2-tap bilinear filter for an 8x4 block. */
static void common_hz_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask;
  v8u16 vec0, vec1, vec2, vec3, filt;

  /* 8-width shuffle mask: one row of adjacent pixel pairs per vector. */
  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  /* Broadcast the packed {f0, f1} byte pair to every halfword lane. */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  /* p[x]*f0 + p[x+1]*f1 per lane, then round/shift and pack to bytes. */
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
  ST8x4_UB(src0, src1, dst, dst_stride);
}
107
/* Horizontal 2-tap bilinear filter for 8-wide blocks of height 8 or 16.
 * The loads are software-pipelined: the next 4 rows are fetched while
 * the previous results are packed and stored. */
static void common_hz_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask, out0, out1;
  v8u16 vec0, vec1, vec2, vec3, filt;

  /* 8-width shuffle mask: one row of adjacent pixel pairs per vector. */
  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  /* Broadcast the packed {f0, f1} byte pair to every halfword lane. */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* Rows 0-3. */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);

  /* Prefetch rows 4-7 before storing rows 0-3. */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  /* Rows 4-7. */
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    /* Rows 8-11. */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
    /* Prefetch rows 12-15. */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);

    /* Rows 12-15, stored past the rows just written (dst not advanced). */
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
  }
}
169
170static void common_hz_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
171                                uint8_t *RESTRICT dst, int32_t dst_stride,
172                                const int8_t *filter, int32_t height) {
173  if (4 == height) {
174    common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
175  } else {
176    common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
177  }
178}
179
/* Horizontal 2-tap bilinear filter for 16-wide blocks. Each 16-pixel
 * row is processed as two 8-pixel halves (src and src + 8). The first
 * four rows are handled before the loop, hence loop_cnt is
 * (height >> 2) - 1. */
static void common_hz_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  /* 8-width shuffle mask: one row-half of adjacent pixel pairs. */
  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  loop_cnt = (height >> 2) - 1;

  /* Broadcast the packed {f0, f1} byte pair to every halfword lane. */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* Left halves in even regs, right halves in odd regs, 4 rows. */
  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
              out2, out3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
              out6, out7);
  SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
  SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
  /* Pack each pair of 8-wide halves into one 16-byte row store. */
  PCKEV_ST_SB(out0, out1, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out2, out3, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out4, out5, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out6, out7, dst);
  dst += dst_stride;

  /* Remaining rows, four at a time. */
  for (; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(out0, out1, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out2, out3, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out6, out7, dst);
    dst += dst_stride;
  }
}
243
/* Vertical 2-tap bilinear filter for a 4x4 block: needs 5 source rows
 * to produce 4 filtered output rows. */
static void common_vt_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4;
  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
  v16u8 filt0;
  v8i16 filt;
  v8u16 tmp0, tmp1;

  /* Broadcast the packed {f0, f1} byte pair to every halfword lane. */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride); /* NOTE: src is not read again after this. */

  /* Interleave each row with the next so the 2-tap dot product mixes
   * vertically adjacent pixels; fold two row-pairs per vector. */
  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
  src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
}
267
/* Vertical 2-tap bilinear filter for a 4x8 block: needs 9 source rows
 * to produce 8 filtered output rows. */
static void common_vt_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 filt;

  /* Broadcast the packed {f0, f1} byte pair to every halfword lane. */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);

  /* Ninth row for the last output row's second tap. */
  src8 = LD_SB(src);
  src += src_stride;

  /* Interleave consecutive rows, then fold two row-pairs per vector. */
  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
             src76_r, src87_r);
  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
             src76_r, src2110, src4332, src6554, src8776);
  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
              tmp0, tmp1, tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
  /* Rows 0-3 then rows 4-7. */
  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
300
301static void common_vt_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
302                                uint8_t *RESTRICT dst, int32_t dst_stride,
303                                const int8_t *filter, int32_t height) {
304  if (4 == height) {
305    common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
306  } else if (8 == height) {
307    common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
308  }
309}
310
/* Vertical 2-tap bilinear filter for an 8x4 block (5 source rows in,
 * 4 output rows out). */
static void common_vt_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
  v16i8 out0, out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* Broadcast the packed {f0, f1} byte pair to every halfword lane. */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
  /* Interleave each row with the next for the vertical dot product. */
  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
              tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
}
331
/* Vertical 2-tap bilinear filter for 8-wide blocks whose height is a
 * multiple of 8. The last row of each iteration (src8) is carried over
 * as the first tap of the next iteration. */
static void common_vt_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v16i8 out0, out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* Broadcast the packed {f0, f1} byte pair to every halfword lane. */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  /* Prime the pipeline with the first source row. */
  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 3); loop_cnt--;) {
    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
    src += (8 * src_stride);

    /* Interleave consecutive rows for the vertical dot products. */
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
               vec7);
    /* First four output rows. */
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Second four output rows. */
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the last loaded row into the next iteration. */
    src0 = src8;
  }
}
373
374static void common_vt_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
375                                uint8_t *RESTRICT dst, int32_t dst_stride,
376                                const int8_t *filter, int32_t height) {
377  if (4 == height) {
378    common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
379  } else {
380    common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
381  }
382}
383
/* Vertical 2-tap bilinear filter for 16-wide blocks, four rows per
 * iteration. Each 16-pixel row is split into low/high byte halves
 * (ILVR/ILVL) so the 16-bit dot products fit in one vector each.
 * src4 is carried over as the first tap of the next iteration. */
static void common_vt_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* Broadcast the packed {f0, f1} byte pair to every halfword lane. */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  /* Prime the pipeline with the first source row. */
  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    /* Rows 0/1: low halves in vec0/vec2, high halves in vec1/vec3. */
    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp0, tmp1, dst);
    dst += dst_stride;

    /* Rows 2/3 interleaves, then finish row 1. */
    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp2, tmp3, dst);
    dst += dst_stride;

    /* Row 2. */
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp0, tmp1, dst);
    dst += dst_stride;

    /* Row 3. */
    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp2, tmp3, dst);
    dst += dst_stride;

    /* Carry the last loaded row into the next iteration. */
    src0 = src4;
  }
}
430
/* Combined horizontal + vertical 2-tap bilinear filter for a 4x4 block:
 * a horizontal pass over 5 rows followed by a vertical pass over the
 * intermediate results. */
static void common_hv_2ht_2vt_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;

  /* 4-width shuffle mask: packs pixel pairs from two rows per vector. */
  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  /* Broadcast each packed {f0, f1} tap pair to every halfword lane. */
  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  /* Horizontal pass, two rows per call (row 4 alone in the last). */
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
  /* Realign intermediates so hz_outN holds rows (N, N+1). */
  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);

  /* Vertical pass: even bytes of adjacent intermediates, dot product,
   * round/shift, pack and store two rows per result vector. */
  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
459
/* Combined horizontal + vertical 2-tap bilinear filter for a 4x8 block:
 * horizontal pass over 9 rows, then a vertical pass over the
 * intermediate results. */
static void common_hv_2ht_2vt_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
  v16i8 res0, res1, res2, res3;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;

  /* 4-width shuffle mask: packs pixel pairs from two rows per vector. */
  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  /* Broadcast each packed {f0, f1} tap pair to every halfword lane. */
  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);
  /* Ninth row for the last output row's second vertical tap. */
  src8 = LD_SB(src);

  /* Horizontal pass, two rows per call (row 8 alone in the last). */
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, VP8_FILTER_SHIFT);
  /* Realign intermediates so hz_outN holds rows (N, N+1). */
  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
             hz_out3, hz_out5, 8);
  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);

  /* Vertical pass over adjacent intermediates. */
  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
              vec5, vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  /* First four rows, then the remaining four. */
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}
501
502static void common_hv_2ht_2vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
503                                     uint8_t *RESTRICT dst, int32_t dst_stride,
504                                     const int8_t *filter_horiz,
505                                     const int8_t *filter_vert,
506                                     int32_t height) {
507  if (4 == height) {
508    common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
509                              filter_vert);
510  } else if (8 == height) {
511    common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
512                              filter_vert);
513  }
514}
515
/* Combined horizontal + vertical 2-tap bilinear filter for an 8x4
 * block. Horizontal intermediates are produced one row at a time and
 * immediately consumed by the vertical dot product; hz_out0/hz_out1
 * alternate as the "previous row" register. */
static void common_hv_2ht_2vt_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* 8-width shuffle mask: one row of adjacent pixel pairs per vector. */
  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  /* Broadcast each packed {f0, f1} tap pair to every halfword lane. */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);
  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

  /* Output row 0 = vt filter over horizontal rows 0 and 1. */
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp0 = __msa_dotp_u_h(vec0, filt_vt);

  /* Output row 1 = rows 1 and 2. */
  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp1 = __msa_dotp_u_h(vec1, filt_vt);

  /* Output row 2 = rows 2 and 3. */
  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp2 = __msa_dotp_u_h(vec2, filt_vt);

  /* Output row 3 = rows 3 and 4. */
  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp3 = __msa_dotp_u_h(vec3, filt_vt);

  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
}
555
/* Combined horizontal + vertical 2-tap bilinear filter for 8-wide
 * blocks whose height is a multiple of 8. The last horizontal
 * intermediate is carried across iterations; hz_out0/hz_out1 alternate
 * as the "previous row" register. */
static void common_hv_2ht_2vt_8x8mult_msa(
    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
    int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
  v8i16 filt;

  /* 8-width shuffle mask: one row of adjacent pixel pairs per vector. */
  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  /* Broadcast each packed {f0, f1} tap pair to every halfword lane. */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);
  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* Prime the pipeline with the first horizontal intermediate. */
  src0 = LD_SB(src);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);

  for (loop_cnt = (height >> 3); loop_cnt--;) {
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
    /* Prefetch the next four source rows before finishing this batch. */
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp4 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp3, tmp4, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Second group of four output rows. */
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp5 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp6 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp7 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp8 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
629
630static void common_hv_2ht_2vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
631                                     uint8_t *RESTRICT dst, int32_t dst_stride,
632                                     const int8_t *filter_horiz,
633                                     const int8_t *filter_vert,
634                                     int32_t height) {
635  if (4 == height) {
636    common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
637                              filter_vert);
638  } else {
639    common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
640                                  filter_horiz, filter_vert, height);
641  }
642}
643
/* Combined horizontal + vertical 2-tap bilinear filter for 16-wide
 * blocks, four rows per iteration. Each 16-pixel row is filtered as
 * two 8-pixel halves; the hz_out0/hz_out2 (even rows) and
 * hz_out1/hz_out3 (odd rows) pairs alternate as "previous row"
 * intermediates, carried across iterations. */
static void common_hv_2ht_2vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
  v8i16 filt;

  /* 8-width shuffle mask: one row-half of adjacent pixel pairs. */
  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);
  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* Prime with the horizontal intermediates of the first source row. */
  LD_SB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Left halves in even regs, right halves in odd regs, 4 rows. */
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;
  }
}
707
708void vp8_bilinear_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
709                                 int32_t xoffset, int32_t yoffset,
710                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
711  const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
712  const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
713
714  if (yoffset) {
715    if (xoffset) {
716      common_hv_2ht_2vt_4w_msa(src, src_stride, dst, dst_stride, h_filter,
717                               v_filter, 4);
718    } else {
719      common_vt_2t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
720    }
721  } else {
722    if (xoffset) {
723      common_hz_2t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
724    } else {
725      uint32_t tp0, tp1, tp2, tp3;
726
727      LW4(src, src_stride, tp0, tp1, tp2, tp3);
728      SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
729    }
730  }
731}
732
733void vp8_bilinear_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
734                                 int32_t xoffset, int32_t yoffset,
735                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
736  const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
737  const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
738
739  if (yoffset) {
740    if (xoffset) {
741      common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride, h_filter,
742                               v_filter, 4);
743    } else {
744      common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
745    }
746  } else {
747    if (xoffset) {
748      common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
749    } else {
750      vp8_copy_mem8x4(src, src_stride, dst, dst_stride);
751    }
752  }
753}
754
755void vp8_bilinear_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
756                                 int32_t xoffset, int32_t yoffset,
757                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
758  const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
759  const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
760
761  if (yoffset) {
762    if (xoffset) {
763      common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride, h_filter,
764                               v_filter, 8);
765    } else {
766      common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8);
767    }
768  } else {
769    if (xoffset) {
770      common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8);
771    } else {
772      vp8_copy_mem8x8(src, src_stride, dst, dst_stride);
773    }
774  }
775}
776
777void vp8_bilinear_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
778                                   int32_t xoffset, int32_t yoffset,
779                                   uint8_t *RESTRICT dst, int32_t dst_stride) {
780  const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
781  const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
782
783  if (yoffset) {
784    if (xoffset) {
785      common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, h_filter,
786                                v_filter, 16);
787    } else {
788      common_vt_2t_16w_msa(src, src_stride, dst, dst_stride, v_filter, 16);
789    }
790  } else {
791    if (xoffset) {
792      common_hz_2t_16w_msa(src, src_stride, dst, dst_stride, h_filter, 16);
793    } else {
794      vp8_copy_mem16x16(src, src_stride, dst, dst_stride);
795    }
796  }
797}
798