/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"

static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
  v16i8 src10998, filt0, filt1, filt2, filt3;
  v16u8 out;
  v8i16 filt, out10, out32;

  /* point at the first of the eight rows contributing to output row 0 */
  src -= (3 * src_stride);

  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
             src54_r, src21_r);
  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
             src4332, src6554);
  XORI_B3_128_SB(src2110, src4332, src6554);

  /* four output rows per iteration */
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    src += (4 * src_stride);

    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
    XORI_B2_128_SB(src8776, src10998);
    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
                                filt1, filt2, filt3);
    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
                                filt1, filt2, filt3);
    SRARI_H2_SH(out10, out32, FILTER_BITS);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    src2110 = src6554;
    src4332 = src8776;
    src6554 = src10998;
    src6 = src10;
  }
}
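
/* A note on the 4-wide 8-tap path above: ILVR_D3_SB packs two 4-pixel
 * columns into each MSA register, so every FILT_8TAP_DPADD_S_H call
 * produces two output rows at once. The XORI_*_128 steps flip unsigned
 * pixels to signed bytes so the signed dot product applies, and
 * PCKEV_XORI128_UB flips the result back. For the normalized kernels vpx
 * uses (taps summing to 1 << FILTER_BITS), the per-pixel effect matches the
 * scalar sketch below, a hypothetical helper given purely for illustration,
 * with src pointing at the output row and taps covering rows -3..+4:
 *
 *   static uint8_t vt_8tap_pixel(const uint8_t *src, int stride,
 *                                const int8_t *filter) {
 *     int k, sum = 0;
 *     for (k = 0; k < 8; ++k) sum += filter[k] * src[(k - 3) * stride];
 *     sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;   // SRARI
 *     return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum)); // SAT/PCKEV
 *   }
 */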

static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
  v16u8 tmp0, tmp1;
  v8i16 filt, out0_r, out1_r, out2_r, out3_r;

  src -= (3 * src_stride);

  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);
  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
             src54_r, src21_r);
  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
    tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);

    src10_r = src54_r;
    src32_r = src76_r;
    src54_r = src98_r;
    src21_r = src65_r;
    src43_r = src87_r;
    src65_r = src109_r;
    src6 = src10;
  }
}
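
/* The register rotation at the bottom of the loop above is the sliding
 * window for the vertical filter: of the eleven source rows feeding four
 * output rows, seven remain live for the next iteration, so only four new
 * rows are loaded per pass and the interleaved row pairs are renamed
 * rather than rebuilt. */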

static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 filt0, filt1, filt2, filt3;
  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
  v16u8 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

  src -= (3 * src_stride);

  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);
  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
             src54_r, src21_r);
  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
             src54_l, src21_l);
  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
               src87_l, src98_l, src109_l);
    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);
    out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
                                 filt1, filt2, filt3);
    out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
                                 filt1, filt2, filt3);
    out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
                                 filt1, filt2, filt3);
    out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
                                 filt1, filt2, filt3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
    SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
    PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
                tmp0, tmp1, tmp2, tmp3);
    XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    dst += (4 * dst_stride);

    src10_r = src54_r;
    src32_r = src76_r;
    src54_r = src98_r;
    src21_r = src65_r;
    src43_r = src87_r;
    src65_r = src109_r;
    src10_l = src54_l;
    src32_l = src76_l;
    src54_l = src98_l;
    src21_l = src65_l;
    src43_l = src87_l;
    src65_l = src109_l;
    src6 = src10;
  }
}
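
/* For 16 columns the byte interleaves no longer fit one register, so the
 * 16w variant runs the same recurrence twice per row: ILVR_* handles the
 * low eight columns (the *_r vectors), ILVL_* the high eight (the *_l
 * vectors), and PCKEV_B4_UB stitches the two halves back into 16-byte
 * output rows. */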

static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter, int32_t height,
                                      int32_t width) {
  const uint8_t *src_tmp;
  uint8_t *dst_tmp;
  uint32_t loop_cnt, cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 filt0, filt1, filt2, filt3;
  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
  v16u8 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

  src -= (3 * src_stride);

  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  for (cnt = (width >> 4); cnt--;) {
    src_tmp = src;
    dst_tmp = dst;

    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src_tmp += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
      LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
      XORI_B4_128_SB(src7, src8, src9, src10);
      src_tmp += (4 * src_stride);
      ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                 src87_r, src98_r, src109_r);
      ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                 src87_l, src98_l, src109_l);
      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
      SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
      SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
      SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
      SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
      PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                  out3_r, tmp0, tmp1, tmp2, tmp3);
      XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
      ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
      dst_tmp += (4 * dst_stride);

      src10_r = src54_r;
      src32_r = src76_r;
      src54_r = src98_r;
      src21_r = src65_r;
      src43_r = src87_r;
      src65_r = src109_r;
      src10_l = src54_l;
      src32_l = src76_l;
      src54_l = src98_l;
      src21_l = src65_l;
      src43_l = src87_l;
      src65_l = src109_l;
      src6 = src10;
    }

    src += 16;
    dst += 16;
  }
}

static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                            32);
}

static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                            64);
}
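
/* common_vt_8t_16w_mult_msa walks the block in 16-column tiles, rerunning
 * the full vertical recurrence per tile; the 32- and 64-wide entry points
 * above differ only in the width they pass down. */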

static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4;
  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
  v16u8 filt0;
  v8i16 filt;
  v8u16 tmp0, tmp1;

  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
}
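
/* The common_vt_2t_* family handles bilinear kernels: only two adjacent
 * taps are live, and for vpx's bilinear kernels both are non-negative and
 * sum to 1 << FILTER_BITS, so DOTP_UB*_UH can run on unsigned bytes with
 * no XOR-128 trick and no saturation step. Per pixel this matches the
 * scalar sketch below, a hypothetical helper for illustration only, with
 * filter pointing at the two live taps as the callers arrange via
 * &filt_ver[3]:
 *
 *   static uint8_t vt_2tap_pixel(const uint8_t *src, int stride,
 *                                const int8_t *filter) {
 *     int sum = filter[0] * src[0] + filter[1] * src[stride];
 *     return (uint8_t)((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
 *   }
 */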

static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 filt;

  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);

  src8 = LD_SB(src);
  src += src_stride;

  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
             src76_r, src87_r);
  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
             src76_r, src2110, src4332, src6554, src8776);
  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
              tmp0, tmp1, tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
  v16i8 out0, out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* rearranging filter_y */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
              tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
}

static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v16i8 out0, out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* rearranging filter_y */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 3); loop_cnt--;) {
    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
    src += (8 * src_stride);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
               vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    src0 = src8;
  }
}

static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}

static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* rearranging filter_y */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst);
    dst += dst_stride;

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst);
    dst += dst_stride;

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst);
    dst += dst_stride;

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst);
    dst += dst_stride;

    src0 = src4;
  }
}
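
/* In the 16-wide bilinear loop above, each pass interleaves five rows (one
 * carried, four freshly loaded) into right/left byte pairs up front, then
 * retires four output rows; carrying src4 over as src0 keeps the load
 * count at four rows per four rows written. */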

static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* rearranging filter_y */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  src0 = LD_UB(src);
  src5 = LD_UB(src + 16);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);

    LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
    src += (4 * src_stride);

    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);

    ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
    ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst + 16);

    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);

    ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
    ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
    dst += (4 * dst_stride);

    src0 = src4;
    src5 = src9;
  }
}
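
/* The 32-wide path above and the 64-wide path below split each row into
 * independent 16-byte segments (dst, dst + 16, ...), each with its own
 * carried source row (src0/src5 for 32 columns; src0, src3, src6, src9
 * for 64), so the bilinear recurrence never crosses a segment boundary. */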

static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 filt;

  /* rearranging filter_y */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_UB4(src, 16, src0, src3, src6, src9);
  src += src_stride;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    LD_UB2(src, src_stride, src1, src2);
    LD_UB2(src + 16, src_stride, src4, src5);
    LD_UB2(src + 32, src_stride, src7, src8);
    LD_UB2(src + 48, src_stride, src10, src11);
    src += (2 * src_stride);

    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst);

    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);

    ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
    ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
    PCKEV_ST_SB(tmp4, tmp5, dst + 16);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
    PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);

    ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
    ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst + 32);

    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);

    ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
    ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
    PCKEV_ST_SB(tmp4, tmp5, dst + 48);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
    PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
    dst += (2 * dst_stride);

    src0 = src2;
    src3 = src5;
    src6 = src8;
    src9 = src11;
  }
}
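
/* Top-level vertical convolve dispatch. filter_y is one 8-tap phase of the
 * InterpKernel table. Reading its first two int16 taps as an int32 and
 * testing for zero detects a bilinear kernel (in the vpx sub-pel filter
 * tables only taps 3 and 4 are nonzero in that case), which is why the 2t
 * paths receive &filt_ver[3]. The second assert rejects the pure copy
 * kernel {0, 0, 0, 128, 0, 0, 0, 0} (0x800000 being taps 2 and 3 viewed as
 * a little-endian int32); callers are expected to route that through
 * vpx_convolve_copy instead. */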
void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *filter, int x0_q4,
                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  const int16_t *const filter_y = filter[y0_q4];
  int8_t cnt, filt_ver[8];

  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  /* narrow the int16 taps to int8 for the MSA byte dot products */
  for (cnt = 8; cnt--;) {
    filt_ver[cnt] = filter_y[cnt];
  }

  if (((const int32_t *)filter_y)[0] == 0) {
    /* bilinear kernel: only taps 3 and 4 are nonzero */
    switch (w) {
      case 4:
        common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_ver[3], h);
        break;
      case 8:
        common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_ver[3], h);
        break;
      case 16:
        common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_ver[3], h);
        break;
      case 32:
        common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_ver[3], h);
        break;
      case 64:
        common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_ver[3], h);
        break;
      default:
        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                             x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_ver, h);
        break;
      case 8:
        common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_ver, h);
        break;
      case 16:
        common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_ver, h);
        break;
      case 32:
        common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_ver, h);
        break;
      case 64:
        common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_ver, h);
        break;
      default:
        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                             x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}