/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11#include <assert.h>
12#include "./vpx_dsp_rtcd.h"
13#include "vpx_dsp/mips/vpx_convolve_msa.h"
14
/* Byte-shuffle index rows used by the convolve kernels: three rows of 16
 * indices each. The kernels in this file load the row at offset 0 (8-wide and
 * wider paths) or at offset 16 (4-wide paths); the 8-tap kernels derive their
 * remaining masks by adding 2, 4 and 6 to every index of the loaded row.
 * NOTE(review): indices >= 16 presumably address bytes of the second source
 * vector of the shuffle -- confirm against the filter macros. */
const uint8_t mc_filt_mask_arr[16 * 3] = {
  /* 8 width cases: adjacent byte pairs (0,1) .. (7,8) */
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  /* 4 width cases: pairs (0,1) .. (3,4), then (16,17) .. (19,20) */
  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
  /* 4 width cases: pairs (8,9) .. (11,12), then (24,25) .. (27,28) */
  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
23
24static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
25                                     uint8_t *dst, int32_t dst_stride,
26                                     int8_t *filter_horiz, int8_t *filter_vert,
27                                     int32_t height) {
28  uint32_t loop_cnt;
29  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
30  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
31  v16u8 mask0, mask1, mask2, mask3, out;
32  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
33  v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
34  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
35
36  mask0 = LD_UB(&mc_filt_mask_arr[16]);
37  src -= (3 + 3 * src_stride);
38
39  /* rearranging filter */
40  filt = LD_SH(filter_horiz);
41  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
42
43  mask1 = mask0 + 2;
44  mask2 = mask0 + 4;
45  mask3 = mask0 + 6;
46
47  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
48  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
49  src += (7 * src_stride);
50
51  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
52                            filt_hz1, filt_hz2, filt_hz3);
53  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
54                            filt_hz1, filt_hz2, filt_hz3);
55  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
56                            filt_hz1, filt_hz2, filt_hz3);
57  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
58                            filt_hz1, filt_hz2, filt_hz3);
59  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
60
61  filt = LD_SH(filter_vert);
62  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
63
64  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
65  out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
66
67  for (loop_cnt = (height >> 2); loop_cnt--;) {
68    LD_SB4(src, src_stride, src7, src8, src9, src10);
69    XORI_B4_128_SB(src7, src8, src9, src10);
70    src += (4 * src_stride);
71
72    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
73                              filt_hz1, filt_hz2, filt_hz3);
74    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
75    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
76    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
77                               filt_vt2, filt_vt3);
78
79    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
80                              filt_hz1, filt_hz2, filt_hz3);
81    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
82    out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
83    tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
84                               filt_vt2, filt_vt3);
85    SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
86    SAT_SH2_SH(tmp0, tmp1, 7);
87    out = PCKEV_XORI128_UB(tmp0, tmp1);
88    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
89    dst += (4 * dst_stride);
90
91    hz_out5 = hz_out9;
92    out0 = out2;
93    out1 = out3;
94    out2 = out4;
95  }
96}
97
98static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
99                                     uint8_t *dst, int32_t dst_stride,
100                                     int8_t *filter_horiz, int8_t *filter_vert,
101                                     int32_t height) {
102  uint32_t loop_cnt;
103  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
104  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
105  v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
106  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
107  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
108  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
109  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
110
111  mask0 = LD_UB(&mc_filt_mask_arr[0]);
112  src -= (3 + 3 * src_stride);
113
114  /* rearranging filter */
115  filt = LD_SH(filter_horiz);
116  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
117
118  mask1 = mask0 + 2;
119  mask2 = mask0 + 4;
120  mask3 = mask0 + 6;
121
122  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
123  src += (7 * src_stride);
124
125  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
126  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
127                            filt_hz1, filt_hz2, filt_hz3);
128  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
129                            filt_hz1, filt_hz2, filt_hz3);
130  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
131                            filt_hz1, filt_hz2, filt_hz3);
132  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
133                            filt_hz1, filt_hz2, filt_hz3);
134  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
135                            filt_hz1, filt_hz2, filt_hz3);
136  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
137                            filt_hz1, filt_hz2, filt_hz3);
138  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
139                            filt_hz1, filt_hz2, filt_hz3);
140
141  filt = LD_SH(filter_vert);
142  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
143
144  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
145  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
146  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
147
148  for (loop_cnt = (height >> 2); loop_cnt--;) {
149    LD_SB4(src, src_stride, src7, src8, src9, src10);
150    src += (4 * src_stride);
151
152    XORI_B4_128_SB(src7, src8, src9, src10);
153
154    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
155                              filt_hz1, filt_hz2, filt_hz3);
156    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
157    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
158                               filt_vt2, filt_vt3);
159
160    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
161                              filt_hz1, filt_hz2, filt_hz3);
162    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
163    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
164                               filt_vt2, filt_vt3);
165
166    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
167                              filt_hz1, filt_hz2, filt_hz3);
168    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
169    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
170                               filt_vt2, filt_vt3);
171
172    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
173                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
174    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
175    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
176                               filt_vt2, filt_vt3);
177    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
178    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
179    vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
180    vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
181    ST8x4_UB(vec0, vec1, dst, dst_stride);
182    dst += (4 * dst_stride);
183
184    hz_out6 = hz_out10;
185    out0 = out2;
186    out1 = out3;
187    out2 = out8;
188    out4 = out6;
189    out5 = out7;
190    out6 = out9;
191  }
192}
193
/* 8-tap horizontal + vertical filtering of a 16-wide block: runs the 8-wide
 * kernel on 2 adjacent 8-column strips, advancing src and dst by 8 bytes
 * between strips. All other arguments are forwarded unchanged. */
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 2; multiple8_cnt--;) {
    common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
                             filter_vert, height);
    src += 8;
    dst += 8;
  }
}
206
/* 8-tap horizontal + vertical filtering of a 32-wide block: runs the 8-wide
 * kernel on 4 adjacent 8-column strips, advancing src and dst by 8 bytes
 * between strips. All other arguments are forwarded unchanged. */
static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 4; multiple8_cnt--;) {
    common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
                             filter_vert, height);
    src += 8;
    dst += 8;
  }
}
219
/* 8-tap horizontal + vertical filtering of a 64-wide block: runs the 8-wide
 * kernel on 8 adjacent 8-column strips, advancing src and dst by 8 bytes
 * between strips. All other arguments are forwarded unchanged. */
static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 8; multiple8_cnt--;) {
    common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
                             filter_vert, height);
    src += 8;
    dst += 8;
  }
}
232
233static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
234                                      uint8_t *dst, int32_t dst_stride,
235                                      int8_t *filter_horiz,
236                                      int8_t *filter_vert) {
237  v16i8 src0, src1, src2, src3, src4, mask;
238  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
239  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
240
241  mask = LD_SB(&mc_filt_mask_arr[16]);
242
243  /* rearranging filter */
244  filt = LD_UH(filter_horiz);
245  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
246
247  filt = LD_UH(filter_vert);
248  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
249
250  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
251  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
252  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
253  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
254  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
255  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
256
257  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
258  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
259  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
260  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
261  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
262}
263
264static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
265                                      uint8_t *dst, int32_t dst_stride,
266                                      int8_t *filter_horiz,
267                                      int8_t *filter_vert) {
268  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
269  v16i8 res0, res1, res2, res3;
270  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
271  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
272  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
273
274  mask = LD_SB(&mc_filt_mask_arr[16]);
275
276  /* rearranging filter */
277  filt = LD_UH(filter_horiz);
278  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
279
280  filt = LD_UH(filter_vert);
281  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
282
283  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
284  src += (8 * src_stride);
285  src8 = LD_SB(src);
286
287  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
288  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
289  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
290  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
291  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
292  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
293             hz_out3, hz_out5, 8);
294  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
295
296  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
297  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
298  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
299              vec5, vec6, vec7);
300  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
301  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
302              res3);
303  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
304  dst += (4 * dst_stride);
305  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
306}
307
/* 2-tap horizontal + vertical filtering of a 4-wide block; dispatches on
 * height to the 4x4 or 4x8 kernel. Any other height is silently ignored and
 * nothing is written to dst. */
static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  } else if (8 == height) {
    common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  }
}
320
321static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
322                                      uint8_t *dst, int32_t dst_stride,
323                                      int8_t *filter_horiz,
324                                      int8_t *filter_vert) {
325  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
326  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
327  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
328  v8i16 filt;
329
330  mask = LD_SB(&mc_filt_mask_arr[0]);
331
332  /* rearranging filter */
333  filt = LD_SH(filter_horiz);
334  filt_hz = (v16u8)__msa_splati_h(filt, 0);
335
336  filt = LD_SH(filter_vert);
337  filt_vt = (v16u8)__msa_splati_h(filt, 0);
338
339  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
340
341  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
342  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
343  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
344  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
345
346  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
347  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
348  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
349
350  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
351  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
352  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
353
354  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
355  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
356  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
357
358  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
359  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
360  ST8x4_UB(out0, out1, dst, dst_stride);
361}
362
363static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
364                                          int32_t src_stride, uint8_t *dst,
365                                          int32_t dst_stride,
366                                          int8_t *filter_horiz,
367                                          int8_t *filter_vert, int32_t height) {
368  uint32_t loop_cnt;
369  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
370  v16u8 filt_hz, filt_vt, vec0;
371  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
372  v8i16 filt;
373
374  mask = LD_SB(&mc_filt_mask_arr[0]);
375
376  /* rearranging filter */
377  filt = LD_SH(filter_horiz);
378  filt_hz = (v16u8)__msa_splati_h(filt, 0);
379
380  filt = LD_SH(filter_vert);
381  filt_vt = (v16u8)__msa_splati_h(filt, 0);
382
383  src0 = LD_SB(src);
384  src += src_stride;
385
386  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
387
388  for (loop_cnt = (height >> 3); loop_cnt--;) {
389    LD_SB4(src, src_stride, src1, src2, src3, src4);
390    src += (4 * src_stride);
391
392    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
393    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
394    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
395
396    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
397    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
398    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
399
400    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
401
402    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
403    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
404    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
405
406    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
407    LD_SB4(src, src_stride, src1, src2, src3, src4);
408    src += (4 * src_stride);
409    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
410    tmp4 = __msa_dotp_u_h(vec0, filt_vt);
411
412    SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
413    PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
414    ST8x4_UB(out0, out1, dst, dst_stride);
415    dst += (4 * dst_stride);
416
417    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
418    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
419    tmp5 = __msa_dotp_u_h(vec0, filt_vt);
420
421    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
422    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
423    tmp6 = __msa_dotp_u_h(vec0, filt_vt);
424
425    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
426    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
427    tmp7 = __msa_dotp_u_h(vec0, filt_vt);
428
429    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
430    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
431    tmp8 = __msa_dotp_u_h(vec0, filt_vt);
432
433    SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
434    PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
435    ST8x4_UB(out0, out1, dst, dst_stride);
436    dst += (4 * dst_stride);
437  }
438}
439
/* 2-tap horizontal + vertical filtering of an 8-wide block: height 4 uses the
 * dedicated 8x4 kernel, every other height goes to the 8-rows-per-iteration
 * kernel (which processes height >> 3 groups of eight rows). */
static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  } else {
    common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
                                  filter_horiz, filter_vert, height);
  }
}
452
453static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
454                                      uint8_t *dst, int32_t dst_stride,
455                                      int8_t *filter_horiz, int8_t *filter_vert,
456                                      int32_t height) {
457  uint32_t loop_cnt;
458  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
459  v16u8 filt_hz, filt_vt, vec0, vec1;
460  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
461  v8i16 filt;
462
463  mask = LD_SB(&mc_filt_mask_arr[0]);
464
465  /* rearranging filter */
466  filt = LD_SH(filter_horiz);
467  filt_hz = (v16u8)__msa_splati_h(filt, 0);
468
469  filt = LD_SH(filter_vert);
470  filt_vt = (v16u8)__msa_splati_h(filt, 0);
471
472  LD_SB2(src, 8, src0, src1);
473  src += src_stride;
474
475  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
476  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
477
478  for (loop_cnt = (height >> 2); loop_cnt--;) {
479    LD_SB4(src, src_stride, src0, src2, src4, src6);
480    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
481    src += (4 * src_stride);
482
483    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
484    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
485    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
486    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
487    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
488    PCKEV_ST_SB(tmp1, tmp2, dst);
489    dst += dst_stride;
490
491    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
492    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
493    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
494    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
495    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
496    PCKEV_ST_SB(tmp1, tmp2, dst);
497    dst += dst_stride;
498
499    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
500    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
501    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
502    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
503    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
504    PCKEV_ST_SB(tmp1, tmp2, dst);
505    dst += dst_stride;
506
507    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
508    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
509    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
510    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
511    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
512    PCKEV_ST_SB(tmp1, tmp2, dst);
513    dst += dst_stride;
514  }
515}
516
/* 2-tap horizontal + vertical filtering of a 32-wide block: runs the 16-wide
 * kernel on 2 adjacent 16-column strips, advancing src and dst by 16 bytes
 * between strips. All other arguments are forwarded unchanged. */
static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 2; multiple8_cnt--;) {
    common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert, height);
    src += 16;
    dst += 16;
  }
}
529
/* 2-tap horizontal + vertical filtering of a 64-wide block: runs the 16-wide
 * kernel on 4 adjacent 16-column strips, advancing src and dst by 16 bytes
 * between strips. All other arguments are forwarded unchanged. */
static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 4; multiple8_cnt--;) {
    common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert, height);
    src += 16;
    dst += 16;
  }
}
542
543void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
544                       ptrdiff_t dst_stride, const InterpKernel *filter,
545                       int x0_q4, int32_t x_step_q4, int y0_q4,
546                       int32_t y_step_q4, int32_t w, int32_t h) {
547  const int16_t *const filter_x = filter[x0_q4];
548  const int16_t *const filter_y = filter[y0_q4];
549  int8_t cnt, filt_hor[8], filt_ver[8];
550
551  assert(x_step_q4 == 16);
552  assert(y_step_q4 == 16);
553  assert(((const int32_t *)filter_x)[1] != 0x800000);
554  assert(((const int32_t *)filter_y)[1] != 0x800000);
555
556  for (cnt = 0; cnt < 8; ++cnt) {
557    filt_hor[cnt] = filter_x[cnt];
558    filt_ver[cnt] = filter_y[cnt];
559  }
560
561  if (((const int32_t *)filter_x)[0] == 0 &&
562      ((const int32_t *)filter_y)[0] == 0) {
563    switch (w) {
564      case 4:
565        common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
566                                 (int32_t)dst_stride, &filt_hor[3],
567                                 &filt_ver[3], (int32_t)h);
568        break;
569      case 8:
570        common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst,
571                                 (int32_t)dst_stride, &filt_hor[3],
572                                 &filt_ver[3], (int32_t)h);
573        break;
574      case 16:
575        common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst,
576                                  (int32_t)dst_stride, &filt_hor[3],
577                                  &filt_ver[3], (int32_t)h);
578        break;
579      case 32:
580        common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst,
581                                  (int32_t)dst_stride, &filt_hor[3],
582                                  &filt_ver[3], (int32_t)h);
583        break;
584      case 64:
585        common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst,
586                                  (int32_t)dst_stride, &filt_hor[3],
587                                  &filt_ver[3], (int32_t)h);
588        break;
589      default:
590        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
591                        x_step_q4, y0_q4, y_step_q4, w, h);
592        break;
593    }
594  } else if (((const int32_t *)filter_x)[0] == 0 ||
595             ((const int32_t *)filter_y)[0] == 0) {
596    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
597                    y0_q4, y_step_q4, w, h);
598  } else {
599    switch (w) {
600      case 4:
601        common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst,
602                                 (int32_t)dst_stride, filt_hor, filt_ver,
603                                 (int32_t)h);
604        break;
605      case 8:
606        common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst,
607                                 (int32_t)dst_stride, filt_hor, filt_ver,
608                                 (int32_t)h);
609        break;
610      case 16:
611        common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst,
612                                  (int32_t)dst_stride, filt_hor, filt_ver,
613                                  (int32_t)h);
614        break;
615      case 32:
616        common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst,
617                                  (int32_t)dst_stride, filt_hor, filt_ver,
618                                  (int32_t)h);
619        break;
620      case 64:
621        common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst,
622                                  (int32_t)dst_stride, filt_hor, filt_ver,
623                                  (int32_t)h);
624        break;
625      default:
626        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
627                        x_step_q4, y0_q4, y_step_q4, w, h);
628        break;
629    }
630  }
631}
632
633static void filter_horiz_w4_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
634                                uint8_t *dst, const int16_t *x_filter) {
635  uint64_t srcd0, srcd1, srcd2, srcd3;
636  uint32_t res;
637  v16u8 src0 = { 0 }, src1 = { 0 }, dst0;
638  v16i8 out0, out1;
639  v16i8 shf1 = { 0, 8, 16, 24, 4, 12, 20, 28, 1, 9, 17, 25, 5, 13, 21, 29 };
640  v16i8 shf2 = shf1 + 2;
641  v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 };
642  v16i8 filt_shf1 = filt_shf0 + 2;
643  v16i8 filt_shf2 = filt_shf0 + 4;
644  v16i8 filt_shf3 = filt_shf0 + 6;
645  v8i16 filt, src0_h, src1_h, src2_h, src3_h, filt0, filt1, filt2, filt3;
646
647  LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
648  INSERT_D2_UB(srcd0, srcd1, src0);
649  INSERT_D2_UB(srcd2, srcd3, src1);
650  VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1);
651  XORI_B2_128_SB(out0, out1);
652  UNPCK_SB_SH(out0, src0_h, src1_h);
653  UNPCK_SB_SH(out1, src2_h, src3_h);
654
655  filt = LD_SH(x_filter);
656  VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1);
657  VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3);
658
659  src0_h *= filt0;
660  src0_h += src1_h * filt1;
661  src0_h += src2_h * filt2;
662  src0_h += src3_h * filt3;
663
664  src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8);
665
666  src0_h = __msa_adds_s_h(src0_h, src1_h);
667  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
668  src0_h = __msa_sat_s_h(src0_h, 7);
669  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
670  res = __msa_copy_u_w((v4i32)dst0, 0);
671  SW(res, dst);
672}
673
674static void filter_horiz_w8_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
675                                uint8_t *dst, const int16_t *x_filter) {
676  uint64_t srcd0, srcd1, srcd2, srcd3;
677  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
678  v16u8 tmp0, tmp1, tmp2, tmp3, dst0;
679  v16i8 out0, out1, out2, out3;
680  v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
681  v16i8 shf2 = shf1 + 4;
682  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
683  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
684
685  LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
686  INSERT_D2_UB(srcd0, srcd1, src0);
687  INSERT_D2_UB(srcd2, srcd3, src1);
688  LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
689  INSERT_D2_UB(srcd0, srcd1, src2);
690  INSERT_D2_UB(srcd2, srcd3, src3);
691
692  filt = LD_SH(x_filter);
693  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
694  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
695
696  // transpose
697  VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1);
698  VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3);
699  ILVRL_W2_SB(tmp2, tmp0, out0, out1);
700  ILVRL_W2_SB(tmp3, tmp1, out2, out3);
701
702  XORI_B4_128_SB(out0, out1, out2, out3);
703  UNPCK_SB_SH(out0, src0_h, src1_h);
704  UNPCK_SB_SH(out1, src2_h, src3_h);
705  UNPCK_SB_SH(out2, src4_h, src5_h);
706  UNPCK_SB_SH(out3, src6_h, src7_h);
707
708  src0_h *= filt0;
709  src4_h *= filt4;
710  src0_h += src1_h * filt1;
711  src4_h += src5_h * filt5;
712  src0_h += src2_h * filt2;
713  src4_h += src6_h * filt6;
714  src0_h += src3_h * filt3;
715  src4_h += src7_h * filt7;
716
717  src0_h = __msa_adds_s_h(src0_h, src4_h);
718  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
719  src0_h = __msa_sat_s_h(src0_h, 7);
720  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
721  ST8x1_UB(dst0, dst);
722}
723
// Applies the 8-tap kernel x_filter horizontally to 16 consecutive rows and
// writes one filtered byte per row (16 bytes in total) to dst.
//   src_x     - start of the 8 input bytes used for the first row.
//   src_pitch - distance in bytes between consecutive input rows.
//   dst       - receives the 16 output bytes, one per input row.
//   x_filter  - the 8 filter coefficients (loaded as one halfword vector).
static void filter_horiz_w16_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *x_filter) {
  uint64_t srcd0, srcd1, srcd2, srcd3;
  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
  v16u8 src4 = { 0 }, src5 = { 0 }, src6 = { 0 }, src7 = { 0 };
  v16u8 tmp0, tmp1, tmp2, tmp3, dst0;
  v16i8 out0, out1, out2, out3, out4, out5, out6, out7;
  // Byte-shuffle patterns used for the transpose below; shf2 is shf1 with
  // every index advanced by 4.
  v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
  v16i8 shf2 = shf1 + 4;
  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
  v8i16 dst0_h, dst1_h, dst2_h, dst3_h;

  // Gather the input: four groups of four rows, two rows per vector.
  // NOTE(review): assumes LD4 reads four 64-bit values src_pitch bytes apart
  // and INSERT_D2_UB packs two of them into one vector - confirm against the
  // MSA macro header.
  LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src0);
  INSERT_D2_UB(srcd2, srcd3, src1);
  LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src2);
  INSERT_D2_UB(srcd2, srcd3, src3);
  LD4(src_x + 8 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src4);
  INSERT_D2_UB(srcd2, srcd3, src5);
  LD4(src_x + 12 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src6);
  INSERT_D2_UB(srcd2, srcd3, src7);

  // Broadcast each of the 8 coefficients into its own vector (filt0..filt7).
  filt = LD_SH(x_filter);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);

  // transpose
  // First eight rows (src0..src3): rearranged so that, after unpacking,
  // srcK_h pairs with coefficient filtK in the multiplications below.
  VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1);
  VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3);
  ILVRL_W2_SB(tmp2, tmp0, out0, out1);
  ILVRL_W2_SB(tmp3, tmp1, out2, out3);
  // XOR with 128 - presumably re-biases the unsigned pixels to signed bytes;
  // the bias is undone by PCKEV_XORI128_UB at the end.
  XORI_B4_128_SB(out0, out1, out2, out3);

  // Widen the bytes of the first eight rows to 16-bit lanes.
  UNPCK_SB_SH(out0, src0_h, src1_h);
  UNPCK_SB_SH(out1, src2_h, src3_h);
  UNPCK_SB_SH(out2, src4_h, src5_h);
  UNPCK_SB_SH(out3, src6_h, src7_h);

  // Same transpose for the last eight rows (src4..src7); tmp0..tmp3 are
  // reused, the results are kept in out4..out7 until the first half has been
  // accumulated.
  VSHF_B2_UB(src4, src5, src4, src5, shf1, shf2, tmp0, tmp1);
  VSHF_B2_UB(src6, src7, src6, src7, shf1, shf2, tmp2, tmp3);
  ILVRL_W2_SB(tmp2, tmp0, out4, out5);
  ILVRL_W2_SB(tmp3, tmp1, out6, out7);
  XORI_B4_128_SB(out4, out5, out6, out7);

  // First eight rows: taps 0-3 accumulate in dst0_h and taps 4-7 in dst1_h.
  // The two partial sums are kept apart so they can be combined with a
  // saturating add further down.
  dst0_h = src0_h * filt0;
  dst1_h = src4_h * filt4;
  dst0_h += src1_h * filt1;
  dst1_h += src5_h * filt5;
  dst0_h += src2_h * filt2;
  dst1_h += src6_h * filt6;
  dst0_h += src3_h * filt3;
  dst1_h += src7_h * filt7;

  // The srcK_h registers are now free; reuse them for the last eight rows.
  UNPCK_SB_SH(out4, src0_h, src1_h);
  UNPCK_SB_SH(out5, src2_h, src3_h);
  UNPCK_SB_SH(out6, src4_h, src5_h);
  UNPCK_SB_SH(out7, src6_h, src7_h);

  // Last eight rows: taps 0-3 accumulate in dst2_h and taps 4-7 in dst3_h.
  dst2_h = src0_h * filt0;
  dst3_h = src4_h * filt4;
  dst2_h += src1_h * filt1;
  dst3_h += src5_h * filt5;
  dst2_h += src2_h * filt2;
  dst3_h += src6_h * filt6;
  dst2_h += src3_h * filt3;
  dst3_h += src7_h * filt7;

  // Combine the partial sums (dst0_h += dst1_h, dst2_h += dst3_h), round by
  // FILTER_BITS, saturate, pack both halves back to bytes and store 16 bytes.
  ADDS_SH2_SH(dst0_h, dst1_h, dst2_h, dst3_h, dst0_h, dst2_h);
  SRARI_H2_SH(dst0_h, dst2_h, FILTER_BITS);
  SAT_SH2_SH(dst0_h, dst2_h, 7);
  dst0 = PCKEV_XORI128_UB(dst0_h, dst2_h);
  ST_UB(dst0, dst);
}
801
802static void transpose4x4_to_dst(const uint8_t *src, uint8_t *dst,
803                                ptrdiff_t dst_stride) {
804  v16u8 in0;
805  v16i8 out0 = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
806
807  in0 = LD_UB(src);
808  out0 = __msa_vshf_b(out0, (v16i8)in0, (v16i8)in0);
809  ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
810}
811
812static void transpose8x8_to_dst(const uint8_t *src, uint8_t *dst,
813                                ptrdiff_t dst_stride) {
814  v16u8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
815  v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
816  v16i8 shf2 = shf1 + 4;
817
818  LD_UB4(src, 16, in0, in1, in2, in3);
819  VSHF_B2_UB(in0, in1, in0, in1, shf1, shf2, tmp0, tmp1);
820  VSHF_B2_UB(in2, in3, in2, in3, shf1, shf2, tmp2, tmp3);
821  ILVRL_W2_UB(tmp2, tmp0, out0, out1);
822  ILVRL_W2_UB(tmp3, tmp1, out2, out3);
823  ST8x4_UB(out0, out1, dst, dst_stride);
824  ST8x4_UB(out2, out3, dst + 4 * dst_stride, dst_stride);
825}
826
827static void transpose16x16_to_dst(const uint8_t *src, uint8_t *dst,
828                                  ptrdiff_t dst_stride) {
829  v16u8 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12;
830  v16u8 in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7, out8;
831  v16u8 out9, out10, out11, out12, out13, out14, out15;
832
833  LD_UB8(src, 16, in0, in1, in2, in3, in4, in5, in6, in7);
834  LD_UB8(src + 16 * 8, 16, in8, in9, in10, in11, in12, in13, in14, in15);
835
836  TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
837                      in11, in12, in13, in14, in15, out0, out1, out2, out3,
838                      out4, out5, out6, out7);
839  ST_UB8(out0, out1, out2, out3, out4, out5, out6, out7, dst, dst_stride);
840  dst += 8 * dst_stride;
841
842  SLDI_B4_0_UB(in0, in1, in2, in3, in0, in1, in2, in3, 8);
843  SLDI_B4_0_UB(in4, in5, in6, in7, in4, in5, in6, in7, 8);
844  SLDI_B4_0_UB(in8, in9, in10, in11, in8, in9, in10, in11, 8);
845  SLDI_B4_0_UB(in12, in13, in14, in15, in12, in13, in14, in15, 8);
846
847  TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
848                      in11, in12, in13, in14, in15, out8, out9, out10, out11,
849                      out12, out13, out14, out15);
850  ST_UB8(out8, out9, out10, out11, out12, out13, out14, out15, dst, dst_stride);
851}
852
853static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
854                                    uint8_t *dst, ptrdiff_t dst_stride,
855                                    const InterpKernel *x_filters, int x0_q4,
856                                    int x_step_q4, int h) {
857  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
858  int y, z, i;
859  src -= SUBPEL_TAPS / 2 - 1;
860
861  for (y = 0; y < h; y += 4) {
862    int x_q4 = x0_q4;
863    for (z = 0; z < 4; ++z) {
864      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
865      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
866
867      if (x_q4 & SUBPEL_MASK) {
868        filter_horiz_w4_msa(src_x, src_stride, temp + (z * 4), x_filter);
869      } else {
870        for (i = 0; i < 4; ++i) {
871          temp[z * 4 + i] = src_x[i * src_stride + 3];
872        }
873      }
874
875      x_q4 += x_step_q4;
876    }
877
878    transpose4x4_to_dst(temp, dst, dst_stride);
879
880    src += src_stride * 4;
881    dst += dst_stride * 4;
882  }
883}
884
885static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
886                                    uint8_t *dst, ptrdiff_t dst_stride,
887                                    const InterpKernel *x_filters, int x0_q4,
888                                    int x_step_q4, int h) {
889  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
890  int y, z, i;
891  src -= SUBPEL_TAPS / 2 - 1;
892
893  // This function processes 8x8 areas. The intermediate height is not always
894  // a multiple of 8, so force it to be a multiple of 8 here.
895  y = h + (8 - (h & 0x7));
896
897  do {
898    int x_q4 = x0_q4;
899    for (z = 0; z < 8; ++z) {
900      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
901      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
902
903      if (x_q4 & SUBPEL_MASK) {
904        filter_horiz_w8_msa(src_x, src_stride, temp + (z * 8), x_filter);
905      } else {
906        for (i = 0; i < 8; ++i) {
907          temp[z * 8 + i] = src_x[3 + i * src_stride];
908        }
909      }
910
911      x_q4 += x_step_q4;
912    }
913
914    transpose8x8_to_dst(temp, dst, dst_stride);
915
916    src += src_stride * 8;
917    dst += dst_stride * 8;
918  } while (y -= 8);
919}
920
921static void scaledconvolve_horiz_mul16(const uint8_t *src, ptrdiff_t src_stride,
922                                       uint8_t *dst, ptrdiff_t dst_stride,
923                                       const InterpKernel *x_filters, int x0_q4,
924                                       int x_step_q4, int w, int h) {
925  DECLARE_ALIGNED(16, uint8_t, temp[16 * 16]);
926  int x, y, z, i;
927
928  src -= SUBPEL_TAPS / 2 - 1;
929
930  // This function processes 16x16 areas.  The intermediate height is not always
931  // a multiple of 16, so force it to be a multiple of 8 here.
932  y = h + (16 - (h & 0xF));
933
934  do {
935    int x_q4 = x0_q4;
936    for (x = 0; x < w; x += 16) {
937      for (z = 0; z < 16; ++z) {
938        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
939        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
940
941        if (x_q4 & SUBPEL_MASK) {
942          filter_horiz_w16_msa(src_x, src_stride, temp + (z * 16), x_filter);
943        } else {
944          for (i = 0; i < 16; ++i) {
945            temp[z * 16 + i] = src_x[3 + i * src_stride];
946          }
947        }
948
949        x_q4 += x_step_q4;
950      }
951
952      transpose16x16_to_dst(temp, dst + x, dst_stride);
953    }
954
955    src += src_stride * 16;
956    dst += dst_stride * 16;
957  } while (y -= 16);
958}
959
// Applies the 8-tap kernel y_filter vertically to a 4-pixel-wide column and
// writes the 4 filtered bytes to dst.
//   src_y     - first of the 8 input rows (4 bytes are read from each row).
//   src_pitch - distance in bytes between consecutive input rows.
//   dst       - receives the 4 output bytes.
//   y_filter  - the 8 filter coefficients.
static void filter_vert_w4_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
                               uint8_t *dst, const int16_t *y_filter) {
  uint32_t srcw0, srcw1, srcw2, srcw3, srcw4, srcw5, srcw6, srcw7;
  uint32_t res;
  v16u8 src0 = { 0 }, src1 = { 0 }, dst0;
  v16i8 out0, out1;
  // Pixel shuffle patterns: they alternate 4-byte groups taken from the two
  // source vectors (indices 0-15 and 16-31), i.e. rows 0,4,1,5 for shf1 and
  // rows 2,6,3,7 for shf2 (shf1 advanced by 8).
  v16i8 shf1 = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 };
  v16i8 shf2 = shf1 + 8;
  // Coefficient shuffle patterns: the low four halfword lanes replicate tap N
  // and the high four lanes replicate tap N + 4 (N = 0..3 for shf0..shf3).
  v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 };
  v16i8 filt_shf1 = filt_shf0 + 2;
  v16i8 filt_shf2 = filt_shf0 + 4;
  v16i8 filt_shf3 = filt_shf0 + 6;
  v8i16 filt, src0_h, src1_h, src2_h, src3_h;
  v8i16 filt0, filt1, filt2, filt3;

  // Load one 32-bit word from each of the 8 rows; rows 0-3 go to src0 and
  // rows 4-7 to src1.
  LW4(src_y, src_pitch, srcw0, srcw1, srcw2, srcw3);
  LW4(src_y + 4 * src_pitch, src_pitch, srcw4, srcw5, srcw6, srcw7);
  INSERT_W4_UB(srcw0, srcw1, srcw2, srcw3, src0);
  INSERT_W4_UB(srcw4, srcw5, srcw6, srcw7, src1);
  // Pair row N with row N + 4, then widen to 16 bits.
  // XOR with 128 - presumably re-biases the unsigned pixels to signed bytes;
  // the bias is undone by PCKEV_XORI128_UB at the end.
  VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1);
  XORI_B2_128_SB(out0, out1);
  UNPCK_SB_SH(out0, src0_h, src1_h);
  UNPCK_SB_SH(out1, src2_h, src3_h);

  // Build the paired coefficient vectors described above.
  filt = LD_SH(y_filter);
  VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1);
  VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3);

  // Multiply-accumulate; the two halves of src0_h hold the partial sums for
  // the two tap groups selected by the coefficient vectors.
  src0_h *= filt0;
  src0_h += src1_h * filt1;
  src0_h += src2_h * filt2;
  src0_h += src3_h * filt3;

  // Slide the vector by 8 bytes so that the other half lines up with the
  // first four lanes.
  src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8);

  // Combine both partial sums with a saturating add, round by FILTER_BITS,
  // saturate, pack to bytes and store the first 32-bit word (4 pixels).
  src0_h = __msa_adds_s_h(src0_h, src1_h);
  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
  src0_h = __msa_sat_s_h(src0_h, 7);
  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
  res = __msa_copy_u_w((v4i32)dst0, 0);
  SW(res, dst);
}
1002
// Applies the 8-tap kernel y_filter vertically to an 8-pixel-wide column and
// writes the 8 filtered bytes to dst.
//   src_y     - first of the 8 input rows (8 bytes are read from each row).
//   src_pitch - distance in bytes between consecutive input rows.
//   dst       - receives the 8 output bytes.
//   y_filter  - the 8 filter coefficients.
static void filter_vert_w8_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
                               uint8_t *dst, const int16_t *y_filter) {
  uint64_t srcd0, srcd1, srcd2, srcd3;
  v16u8 dst0;
  v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;

  // Load 8 rows of 8 bytes and pack two consecutive rows into each vector.
  // NOTE(review): assumes LD4 reads four 64-bit values src_pitch bytes apart
  // and INSERT_D2_SB packs two of them into one vector - confirm against the
  // MSA macro header.
  LD4(src_y, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_SB(srcd0, srcd1, src0);
  INSERT_D2_SB(srcd2, srcd3, src1);
  LD4(src_y + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_SB(srcd0, srcd1, src2);
  INSERT_D2_SB(srcd2, srcd3, src3);

  // Broadcast each of the 8 coefficients into its own vector (filt0..filt7).
  filt = LD_SH(y_filter);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);

  // XOR with 128 - presumably re-biases the unsigned pixels to signed bytes;
  // the bias is undone by PCKEV_XORI128_UB at the end. Each two-row vector
  // is then widened into two halfword vectors, so srcK_h pairs with filtK.
  XORI_B4_128_SB(src0, src1, src2, src3);
  UNPCK_SB_SH(src0, src0_h, src1_h);
  UNPCK_SB_SH(src1, src2_h, src3_h);
  UNPCK_SB_SH(src2, src4_h, src5_h);
  UNPCK_SB_SH(src3, src6_h, src7_h);

  // Taps 0-3 accumulate in src0_h and taps 4-7 in src4_h; the two partial
  // sums are kept apart so they can be combined with a saturating add.
  src0_h *= filt0;
  src4_h *= filt4;
  src0_h += src1_h * filt1;
  src4_h += src5_h * filt5;
  src0_h += src2_h * filt2;
  src4_h += src6_h * filt6;
  src0_h += src3_h * filt3;
  src4_h += src7_h * filt7;

  // Combine, round by FILTER_BITS, saturate, pack to bytes and store 8 bytes.
  src0_h = __msa_adds_s_h(src0_h, src4_h);
  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
  src0_h = __msa_sat_s_h(src0_h, 7);
  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
  ST8x1_UB(dst0, dst);
}
1043
// Applies the 8-tap kernel y_filter vertically to one output row that is w
// pixels wide, processing 16 pixels per iteration.
//   src_y     - first of the 8 input rows.
//   src_pitch - distance in bytes between consecutive input rows.
//   dst       - receives the output row; 16 bytes are stored per iteration,
//               so a width that is not a multiple of 16 is rounded up.
//   y_filter  - the 8 filter coefficients.
//   w         - width in pixels.
static void filter_vert_mul_w16_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
                                    uint8_t *dst, const int16_t *y_filter,
                                    int w) {
  int x;
  v16u8 dst0;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
  v8i16 src8_h, src9_h, src10_h, src11_h, src12_h, src13_h, src14_h, src15_h;
  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;

  // Broadcast each of the 8 coefficients into its own vector (filt0..filt7);
  // done once, outside the column loop.
  filt = LD_SH(y_filter);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);

  for (x = 0; x < w; x += 16) {
    // Load 16 bytes from each of the 8 input rows, then advance to the next
    // 16-pixel column.
    LD_SB8(src_y, src_pitch, src0, src1, src2, src3, src4, src5, src6, src7);
    src_y += 16;

    // XOR with 128 - presumably re-biases the unsigned pixels to signed
    // bytes; the bias is undone by PCKEV_XORI128_UB below. Every row is then
    // widened into two halfword vectors: row N ends up in src(2N)_h and
    // src(2N+1)_h.
    XORI_B4_128_SB(src0, src1, src2, src3);
    XORI_B4_128_SB(src4, src5, src6, src7);
    UNPCK_SB_SH(src0, src0_h, src1_h);
    UNPCK_SB_SH(src1, src2_h, src3_h);
    UNPCK_SB_SH(src2, src4_h, src5_h);
    UNPCK_SB_SH(src3, src6_h, src7_h);
    UNPCK_SB_SH(src4, src8_h, src9_h);
    UNPCK_SB_SH(src5, src10_h, src11_h);
    UNPCK_SB_SH(src6, src12_h, src13_h);
    UNPCK_SB_SH(src7, src14_h, src15_h);

    // Rows 0-3 (taps 0-3) accumulate in src0_h / src1_h and rows 4-7
    // (taps 4-7) in src8_h / src9_h, one accumulator per 8-pixel half.
    src0_h *= filt0;
    src1_h *= filt0;
    src8_h *= filt4;
    src9_h *= filt4;
    src0_h += src2_h * filt1;
    src1_h += src3_h * filt1;
    src8_h += src10_h * filt5;
    src9_h += src11_h * filt5;
    src0_h += src4_h * filt2;
    src1_h += src5_h * filt2;
    src8_h += src12_h * filt6;
    src9_h += src13_h * filt6;
    src0_h += src6_h * filt3;
    src1_h += src7_h * filt3;
    src8_h += src14_h * filt7;
    src9_h += src15_h * filt7;

    // Combine the partial sums (src0_h += src8_h, src1_h += src9_h), round
    // by FILTER_BITS, saturate, pack both halves to bytes and store 16
    // output pixels.
    ADDS_SH2_SH(src0_h, src8_h, src1_h, src9_h, src0_h, src1_h);
    SRARI_H2_SH(src0_h, src1_h, FILTER_BITS);
    SAT_SH2_SH(src0_h, src1_h, 7);
    dst0 = PCKEV_XORI128_UB(src0_h, src1_h);
    ST_UB(dst0, dst);
    dst += 16;
  }
}
1098
1099static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
1100                                   uint8_t *dst, ptrdiff_t dst_stride,
1101                                   const InterpKernel *y_filters, int y0_q4,
1102                                   int y_step_q4, int h) {
1103  int y;
1104  int y_q4 = y0_q4;
1105
1106  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1107
1108  for (y = 0; y < h; ++y) {
1109    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1110    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1111
1112    if (y_q4 & SUBPEL_MASK) {
1113      filter_vert_w4_msa(src_y, src_stride, &dst[y * dst_stride], y_filter);
1114    } else {
1115      uint32_t srcd = LW(src_y + 3 * src_stride);
1116      SW(srcd, dst + y * dst_stride);
1117    }
1118
1119    y_q4 += y_step_q4;
1120  }
1121}
1122
1123static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
1124                                   uint8_t *dst, ptrdiff_t dst_stride,
1125                                   const InterpKernel *y_filters, int y0_q4,
1126                                   int y_step_q4, int h) {
1127  int y;
1128  int y_q4 = y0_q4;
1129
1130  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1131
1132  for (y = 0; y < h; ++y) {
1133    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1134    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1135
1136    if (y_q4 & SUBPEL_MASK) {
1137      filter_vert_w8_msa(src_y, src_stride, &dst[y * dst_stride], y_filter);
1138    } else {
1139      uint64_t srcd = LD(src_y + 3 * src_stride);
1140      SD(srcd, dst + y * dst_stride);
1141    }
1142
1143    y_q4 += y_step_q4;
1144  }
1145}
1146
1147static void scaledconvolve_vert_mul16(const uint8_t *src, ptrdiff_t src_stride,
1148                                      uint8_t *dst, ptrdiff_t dst_stride,
1149                                      const InterpKernel *y_filters, int y0_q4,
1150                                      int y_step_q4, int w, int h) {
1151  int x, y;
1152  int y_q4 = y0_q4;
1153  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1154
1155  for (y = 0; y < h; ++y) {
1156    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1157    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1158
1159    if (y_q4 & SUBPEL_MASK) {
1160      filter_vert_mul_w16_msa(src_y, src_stride, &dst[y * dst_stride], y_filter,
1161                              w);
1162    } else {
1163      for (x = 0; x < w; ++x) {
1164        dst[x + y * dst_stride] = src_y[x + 3 * src_stride];
1165      }
1166    }
1167
1168    y_q4 += y_step_q4;
1169  }
1170}
1171
1172void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1173                       ptrdiff_t dst_stride, const InterpKernel *filter,
1174                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
1175                       int w, int h) {
1176  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
1177  // 2d filtering proceeds in 2 steps:
1178  //   (1) Interpolate horizontally into an intermediate buffer, temp.
1179  //   (2) Interpolate temp vertically to derive the sub-pixel result.
1180  // Deriving the maximum number of rows in the temp buffer (135):
1181  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1182  // --Largest block size is 64x64 pixels.
1183  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
1184  //   original frame (in 1/16th pixel units).
1185  // --Must round-up because block may be located at sub-pixel position.
1186  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1187  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
1188  // --Require an additional 8 rows for the horiz_w8 transpose tail.
1189  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
1190  const int intermediate_height =
1191      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1192
1193  assert(w <= 64);
1194  assert(h <= 64);
1195  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
1196  assert(x_step_q4 <= 64);
1197
1198  if ((0 == x0_q4) && (16 == x_step_q4) && (0 == y0_q4) && (16 == y_step_q4)) {
1199    vpx_convolve_copy_msa(src, src_stride, dst, dst_stride, filter, x0_q4,
1200                          x_step_q4, y0_q4, y_step_q4, w, h);
1201  } else {
1202    if (w >= 16) {
1203      scaledconvolve_horiz_mul16(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1204                                 src_stride, temp, 64, filter, x0_q4, x_step_q4,
1205                                 w, intermediate_height);
1206    } else if (w == 8) {
1207      scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1208                              src_stride, temp, 64, filter, x0_q4, x_step_q4,
1209                              intermediate_height);
1210    } else {
1211      scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1212                              src_stride, temp, 64, filter, x0_q4, x_step_q4,
1213                              intermediate_height);
1214    }
1215
1216    if (w >= 16) {
1217      scaledconvolve_vert_mul16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1218                                dst_stride, filter, y0_q4, y_step_q4, w, h);
1219    } else if (w == 8) {
1220      scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1221                             dst_stride, filter, y0_q4, y_step_q4, h);
1222    } else {
1223      scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1224                             dst_stride, filter, y0_q4, y_step_q4, h);
1225    }
1226  }
1227}
1228