vpx_convolve_msa.h revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11#ifndef VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
12#define VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
13
14#include "vpx_dsp/mips/macros_msa.h"
15#include "vpx_dsp/vpx_filter.h"
16
17extern const uint8_t mc_filt_mask_arr[16 * 3];
18
/* Accumulates four signed byte dot products into one vector of 16-bit sums.
 *
 * vec0..vec3    source vectors, reinterpreted as v16i8.
 * filt0..filt3  coefficient vectors, reinterpreted as v16i8.
 *
 * Two independent partial sums are built:
 *   vec0 . filt0 + vec1 . filt1   and   vec2 . filt2 + vec3 . filt3
 * and then combined with the saturating add __msa_adds_s_h. The macro is a
 * statement expression that evaluates to the resulting v8i16; no rounding or
 * narrowing is applied here.
 *
 * The temporaries use the `_m` suffix shared by the other macros in this file
 * so that a caller passing its own `tmp0`/`tmp1` as an argument is not
 * shadowed inside the expansion. Arguments are parenthesised under the casts
 * so that expression arguments are cast as a whole.
 */
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,                    \
                            filt0, filt1, filt2, filt3) ({             \
  v8i16 sum01_m, sum23_m;                                              \
                                                                       \
  sum01_m = __msa_dotp_s_h((v16i8)(vec0), (v16i8)(filt0));             \
  sum01_m = __msa_dpadd_s_h(sum01_m, (v16i8)(vec1), (v16i8)(filt1));   \
  sum23_m = __msa_dotp_s_h((v16i8)(vec2), (v16i8)(filt2));             \
  sum23_m = __msa_dpadd_s_h(sum23_m, (v16i8)(vec3), (v16i8)(filt3));   \
                                                                       \
  __msa_adds_s_h(sum01_m, sum23_m);                                    \
})
31
/* Horizontal 8-tap filter over one pair of source vectors.
 *
 * src0, src1        source vectors handed to VSHF_B4_SB.
 * mask0..mask3      shuffle masks, one per FILT_8TAP_DPADD_S_H operand.
 * filt_h0..filt_h3  coefficient vectors forwarded to FILT_8TAP_DPADD_S_H.
 *
 * Steps: shuffle the sources with the four masks, accumulate the products
 * with FILT_8TAP_DPADD_S_H, round-shift right by FILTER_BITS
 * (__msa_srari_h), then saturate with __msa_sat_s_h(..., 7), i.e. to the
 * signed 8-bit range. The macro is a statement expression yielding the
 * resulting v8i16.
 */
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,           \
                        filt_h0, filt_h1, filt_h2, filt_h3) ({            \
  v16i8 shf0_m, shf1_m, shf2_m, shf3_m;                                   \
  v8i16 filt_sum_m;                                                       \
                                                                          \
  VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                      \
             shf0_m, shf1_m, shf2_m, shf3_m);                             \
  filt_sum_m = FILT_8TAP_DPADD_S_H(shf0_m, shf1_m, shf2_m, shf3_m,        \
                                   filt_h0, filt_h1, filt_h2, filt_h3);   \
                                                                          \
  __msa_sat_s_h(__msa_srari_h(filt_sum_m, FILTER_BITS), 7);               \
})
47
/* Horizontal 8-tap filter producing two result vectors from four sources
 * (the "4WID" variant).
 *
 * src0..src3    source vectors; shuffled as the pairs (src0, src1) and
 *               (src2, src3).
 * mask0..mask3  shuffle masks; mask N prepares the operands for filtN.
 * filt0..filt3  coefficient vectors.
 * out0, out1    receive the two v8i16 results.
 *
 * For each output the macro forms mask0.filt0 + mask1.filt1 and
 * mask2.filt2 + mask3.filt3 and joins them with ADDS_SH2_SH. No rounding or
 * saturation to pixel range happens here; that is left to the caller.
 * Expands to a brace block, not an expression.
 */
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2, mask3,              \
                                   filt0, filt1, filt2, filt3,              \
                                   out0, out1) {                            \
  v16i8 shf0a_m, shf0b_m, shf1a_m, shf1b_m;                                 \
  v16i8 shf2a_m, shf2b_m, shf3a_m, shf3b_m;                                 \
  v8i16 acc01a_m, acc01b_m, acc23a_m, acc23b_m;                             \
                                                                            \
  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, shf0a_m, shf0b_m);       \
  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, shf1a_m, shf1b_m);       \
  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, shf2a_m, shf2b_m);       \
  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, shf3a_m, shf3b_m);       \
                                                                            \
  DOTP_SB2_SH(shf0a_m, shf0b_m, filt0, filt0, acc01a_m, acc01b_m);          \
  DPADD_SB2_SH(shf1a_m, shf1b_m, filt1, filt1, acc01a_m, acc01b_m);         \
  DOTP_SB2_SH(shf2a_m, shf2b_m, filt2, filt2, acc23a_m, acc23b_m);          \
  DPADD_SB2_SH(shf3a_m, shf3b_m, filt3, filt3, acc23a_m, acc23b_m);         \
                                                                            \
  ADDS_SH2_SH(acc01a_m, acc23a_m, acc01b_m, acc23b_m, out0, out1);          \
}
65
/* Horizontal 8-tap filter producing four result vectors, one per source
 * vector (the "8WID" variant).
 *
 * src0..src3    source vectors; each is shuffled against itself.
 * mask0..mask3  shuffle masks; mask N prepares the operands for filtN.
 * filt0..filt3  coefficient vectors.
 * out0..out3    receive the four v8i16 results (outN comes from srcN).
 *
 * Two accumulation chains are built per source, filt0 + filt1 and
 * filt2 + filt3, and joined with ADDS_SH4_SH. No rounding or saturation to
 * pixel range happens here; that is left to the caller. Expands to a brace
 * block, not an expression.
 */
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, mask2, mask3,               \
                                   filt0, filt1, filt2, filt3,               \
                                   out0, out1, out2, out3) {                 \
  v16i8 shf0_m, shf1_m, shf2_m, shf3_m;                                      \
  v8i16 acc_a0_m, acc_a1_m, acc_a2_m, acc_a3_m;                              \
  v8i16 acc_b0_m, acc_b1_m, acc_b2_m, acc_b3_m;                              \
                                                                             \
  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, shf0_m, shf1_m);          \
  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, shf2_m, shf3_m);          \
  DOTP_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m, filt0, filt0, filt0, filt0,    \
              acc_a0_m, acc_a1_m, acc_a2_m, acc_a3_m);                       \
  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, shf0_m, shf1_m);          \
  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, shf2_m, shf3_m);          \
  DPADD_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m, filt1, filt1, filt1, filt1,   \
               acc_a0_m, acc_a1_m, acc_a2_m, acc_a3_m);                      \
                                                                             \
  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, shf0_m, shf1_m);          \
  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, shf2_m, shf3_m);          \
  DOTP_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m, filt2, filt2, filt2, filt2,    \
              acc_b0_m, acc_b1_m, acc_b2_m, acc_b3_m);                       \
  VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, shf0_m, shf1_m);          \
  VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, shf2_m, shf3_m);          \
  DPADD_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m, filt3, filt3, filt3, filt3,   \
               acc_b0_m, acc_b1_m, acc_b2_m, acc_b3_m);                      \
                                                                             \
  ADDS_SH4_SH(acc_a0_m, acc_b0_m, acc_a1_m, acc_b1_m, acc_a2_m, acc_b2_m,    \
              acc_a3_m, acc_b3_m, out0, out1, out2, out3);                   \
}
92
/* Packs two filtered vectors to bytes, averages them with existing
 * destination data and stores the 16-byte result.
 *
 * in0, in1  vectors to pack; they are forwarded to PCKEV_XORI128_UB in the
 *           order (in1, in0). That macro is defined elsewhere, so the operand
 *           order it expects is not visible here.
 * dst       vector of bytes already at the destination; reinterpreted as
 *           v16u8.
 * pdst      address the averaged vector is written to with ST_UB.
 *
 * The average is __msa_aver_u_b, the rounding unsigned byte average.
 * `dst` is parenthesised under the cast so an expression argument is cast as
 * a whole. Expands to a brace block, not an expression.
 */
#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) {  \
  v16u8 tmp_m;                                          \
                                                        \
  tmp_m = PCKEV_XORI128_UB(in1, in0);                   \
  tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst));          \
  ST_UB(tmp_m, (pdst));                                 \
}
100
/* Packs the even bytes of two vectors, averages them with existing
 * destination data and stores the 16-byte result.
 *
 * in0, in1  vectors to pack with __msa_pckev_b(in0, in1); per the MSA PCKEV
 *           definition the even-indexed bytes of in1 land in the lower half
 *           of the result and those of in0 in the upper half.
 * dst       vector of bytes already at the destination; reinterpreted as
 *           v16u8.
 * pdst      address the averaged vector is written to with ST_UB.
 *
 * The average is __msa_aver_u_b, the rounding unsigned byte average.
 * All vector arguments are parenthesised under their casts so an expression
 * argument is cast as a whole. Expands to a brace block, not an expression.
 */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) {               \
  v16u8 tmp_m;                                               \
                                                             \
  tmp_m = (v16u8)__msa_pckev_b((v16i8)(in0), (v16i8)(in1));  \
  tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst));               \
  ST_UB(tmp_m, (pdst));                                      \
}
108
/* Packs four filtered rows to bytes, averages them with four destination
 * rows and stores the result through ST8x4_UB.
 *
 * in1..in4    filtered row vectors; packed pairwise with PCKEV_B2_UB as
 *             (in2, in1) and (in4, in3).
 * dst0..dst3  destination row vectors; packed pairwise with PCKEV_D2_UB as
 *             (dst1, dst0) and (dst3, dst2).
 * pdst        destination address; evaluated once and cast to uint8_t *.
 * stride      forwarded unchanged to ST8x4_UB.
 *
 * Note the interleaved parameter order (in1, dst0, in2, dst1, ...).
 * Expands to a brace block, not an expression.
 */
#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
                           pdst, stride) {                              \
  uint8_t *dst_ptr_m = (uint8_t *)(pdst);                               \
  v16u8 res01_m, res23_m, ref01_m, ref23_m;                             \
                                                                        \
  PCKEV_D2_UB(dst1, dst0, dst3, dst2, ref01_m, ref23_m);                \
  PCKEV_B2_UB(in2, in1, in4, in3, res01_m, res23_m);                    \
  AVER_UB2_UB(res01_m, ref01_m, res23_m, ref23_m, res01_m, res23_m);    \
  ST8x4_UB(res01_m, res23_m, dst_ptr_m, stride);                        \
}
119#endif  /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */
120