1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include "./vpx_dsp_rtcd.h"
13#include "vpx_dsp/mips/vpx_convolve_msa.h"
14
/* 8-tap horizontal interpolation of a 4x4 block.
 * src is moved back 3 columns so the mask-driven shuffles can gather all
 * eight taps surrounding each output pixel. */
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 mask0, mask1, mask2, mask3, out;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v8i16 filt, out0, out1;

  /* shuffle masks for 4-pixel-wide filtering start at offset 16 */
  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  /* derived masks select byte pairs shifted by 2, 4 and 6 positions */
  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  /* flip sign bit so unsigned pixels work with the signed dot product */
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1);
  /* round, saturate to signed 8-bit range, then pack and flip back */
  SRARI_H2_SH(out0, out1, FILTER_BITS);
  SAT_SH2_SH(out0, out1, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
42
/* 8-tap horizontal interpolation of a 4x8 block: processes two batches of
 * four rows, producing all eight output rows. */
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 filt0, filt1, filt2, filt3;
  v16i8 src0, src1, src2, src3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  /* shuffle masks for 4-pixel-wide filtering start at offset 16 */
  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* first four rows */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  src += (4 * src_stride);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1);
  /* second four rows */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out2, out3);
  /* round, saturate, pack and store both halves */
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);
  out = PCKEV_XORI128_UB(out2, out3);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
79
80static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
81                                uint8_t *dst, int32_t dst_stride,
82                                int8_t *filter, int32_t height) {
83  if (4 == height) {
84    common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
85  } else if (8 == height) {
86    common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
87  }
88}
89
/* 8-tap horizontal interpolation of an 8x4 block: one batch of four rows,
 * each producing 8 output pixels. */
static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  /* shuffle masks for 8-pixel-wide filtering start at offset 0 */
  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  /* flip sign bit for the signed dot products */
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1, out2,
                             out3);
  /* round, saturate, pack and store four 8-byte rows */
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  tmp0 = PCKEV_XORI128_UB(out0, out1);
  tmp1 = PCKEV_XORI128_UB(out2, out3);
  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
}
119
/* 8-tap horizontal interpolation of an 8-wide block whose height is a
 * multiple of 4 (used for heights other than 4); iterates four rows per
 * loop pass. */
static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* four output rows per iteration */
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
154
155static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
156                                uint8_t *dst, int32_t dst_stride,
157                                int8_t *filter, int32_t height) {
158  if (4 == height) {
159    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
160  } else {
161    common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
162  }
163}
164
/* 8-tap horizontal interpolation of a 16-wide block; processes two rows
 * per loop pass, each row split into two 8-pixel halves. */
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    /* even vectors hold the left halves, odd vectors the right halves */
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 8, src_stride, src1, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (2 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    dst += dst_stride;
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst);
    dst += dst_stride;
  }
}
202
/* 8-tap horizontal interpolation of a 32-wide block, two rows per loop
 * pass.  The second row's loads are issued before the first row's stores
 * (software pipelining) to hide load latency. */
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    /* first row: src1 is assembled from bytes 8..23 via an element shift */
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);

    /* prefetch the second row before storing the first */
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;

    /* second row */
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;
  }
}
260
/* 8-tap horizontal interpolation of a 64-wide block: one row per loop
 * pass, filtered as two independent 32-pixel halves. */
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    /* left 32 pixels; src1 spans bytes 8..23 via an element shift */
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);

    /* right 32 pixels */
    src0 = LD_SB(src + 32);
    src2 = LD_SB(src + 48);
    src3 = LD_SB(src + 56);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst + 32);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 48);
    dst += dst_stride;
  }
}
316
/* 2-tap (bilinear) horizontal interpolation of a 4x4 block.  Only one
 * filter coefficient vector is needed since the taps are adjacent. */
static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  /* gather adjacent byte pairs, dot-product with the tap pair, round */
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
337
/* 2-tap (bilinear) horizontal interpolation of a 4x8 block: all eight
 * rows are loaded and filtered in one pass. */
static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 vec0, vec1, vec2, vec3, filt0;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16i8 res0, res1, res2, res3;
  v8u16 vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  /* each shuffle packs two rows' tap pairs into one vector */
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
              res2, res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}
364
365static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
366                                uint8_t *dst, int32_t dst_stride,
367                                int8_t *filter, int32_t height) {
368  if (4 == height) {
369    common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
370  } else if (8 == height) {
371    common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
372  }
373}
374
/* 2-tap (bilinear) horizontal interpolation of an 8x4 block. */
static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask;
  v8u16 vec0, vec1, vec2, vec3, filt;

  /* shuffle mask for 8-pixel-wide filtering */
  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  /* gather tap pairs per row, dot-product, round, pack to bytes */
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
  ST8x4_UB(src0, src1, dst, dst_stride);
}
397
/* 2-tap (bilinear) horizontal interpolation of an 8-wide block of height
 * 8 or 16.  The body is fully unrolled: two 4-row batches always run, and
 * the `16 == height` branch runs two more.  Loads for the next batch are
 * issued before the current batch's stores to hide latency. */
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask, out0, out1;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

  /* rows 0-3 */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);

  /* prefetch rows 4-7 before storing rows 0-3 */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  /* rows 4-7 */
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    /* rows 8-11 */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    /* prefetch rows 12-15 */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);

    /* rows 12-15 */
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
  }
}
460
461static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
462                                uint8_t *dst, int32_t dst_stride,
463                                int8_t *filter, int32_t height) {
464  if (4 == height) {
465    common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
466  } else {
467    common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
468  }
469}
470
/* 2-tap (bilinear) horizontal interpolation of a 16-wide block, four rows
 * per pass.  The first batch is peeled out of the loop, so loop_cnt is
 * (height / 4) - 1. */
static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* first 4-row batch is handled before the loop */
  loop_cnt = (height >> 2) - 1;

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

  /* even vectors hold left 8 pixels, odd vectors the right 8 */
  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
              out2, out3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
              out6, out7);
  SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
  SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
  PCKEV_ST_SB(out0, out1, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out2, out3, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out4, out5, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out6, out7, dst);
  dst += dst_stride;

  /* remaining 4-row batches */
  for (; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out2, out3, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out6, out7, dst);
    dst += dst_stride;
  }
}
535
/* 2-tap (bilinear) horizontal interpolation of a 32-wide block, two rows
 * per loop pass. */
static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

  for (loop_cnt = height >> 1; loop_cnt--;) {
    /* row 0: middle vector assembled from bytes 8..23 via element shift */
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    /* row 1 */
    src4 = LD_SB(src);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src5 = __msa_sldi_b(src6, src4, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    PCKEV_ST_SB(out6, out7, dst + 16);
    dst += dst_stride;
  }
}
580
/* 2-tap (bilinear) horizontal interpolation of a 64-wide block, one row
 * per loop pass. */
static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

  for (loop_cnt = height; loop_cnt--;) {
    /* load even 16-byte groups; odd groups come from element shifts */
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src4 = LD_SB(src + 32);
    src6 = LD_SB(src + 48);
    src7 = LD_SB(src + 56);
    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    PCKEV_ST_SB(out4, out5, dst + 32);
    PCKEV_ST_SB(out6, out7, dst + 48);
    dst += dst_stride;
  }
}
621
/* MSA entry point for 8-tap horizontal convolution (vpx_convolve8_horiz).
 * Copies the 16-bit filter taps into int8_t and dispatches by width.
 * If the first two taps (read as one int32) are zero, the filter is
 * treated as 2-tap/bilinear and only the middle taps at filt_hor[3] are
 * used; otherwise the full 8-tap path runs.  Unsupported widths fall back
 * to the C implementation. */
void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  int8_t cnt, filt_hor[8];

  /* only unit-step (no scaling) filtering is supported */
  assert(x_step_q4 == 16);
  /* NOTE(review): filter taps are inspected via int32_t casts; this relies
   * on the project's established aliasing convention — confirm alignment
   * of filter_x if reusing elsewhere. */
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  /* narrow taps to int8 for the MSA kernels */
  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
  }

  if (((const int32_t *)filter_x)[0] == 0) {
    /* bilinear (2-tap) path */
    switch (w) {
      case 4:
        common_hz_2t_4w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_8w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_16w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      case 32:
        common_hz_2t_32w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      case 64:
        common_hz_2t_64w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
        break;
    }
  } else {
    /* full 8-tap path */
    switch (w) {
      case 4:
        common_hz_8t_4w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 8:
        common_hz_8t_8w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 16:
        common_hz_8t_16w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      case 32:
        common_hz_8t_32w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      case 64:
        common_hz_8t_64w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
        break;
    }
  }
}
704