/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_LOOPFILTER_MSA_H_
#define VPX_DSP_LOOPFILTER_MSA_H_

#include "vpx_dsp/mips/macros_msa.h"

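/* Four-tap VP9 loop filter applied to the two pixels on each side of the
 * edge (p1, p0 | q0, q1).  The pixels are toggled into the signed domain
 * with xori 0x80, the filter value (p1 - q1 gated by hev, plus
 * 3 * (q0 - p0), saturated to 8 bits and gated by mask) is formed with
 * 16-bit intermediates, and p0/q0 (plus p1/q1 where hev is clear) are
 * adjusted and toggled back.  Only the right (low) half of the widened
 * arithmetic is computed here, so this variant yields valid results for
 * 8 pixels per vector; VP9_LPF_FILTER4_4W below covers all 16 lanes.
 * Note that hev_in is inverted in place as a side effect.
 *
 * Typical use (sketch only; the vector names are illustrative and are
 * loaded/stored by the caller):
 *
 *   LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
 *                hev, mask, flat);
 *   VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev,
 *                      p1_out, p0_out, q0_out, q1_out);
 */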
#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
                           p1_out, p0_out, q0_out, q1_out) {             \
  v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                    \
  v16i8 filt, filt1, filt2, cnst4b, cnst3b;                              \
  v8i16 q0_sub_p0_r, filt_r, cnst3h;                                     \
                                                                         \
  p1_m = (v16i8)__msa_xori_b(p1_in, 0x80);                               \
  p0_m = (v16i8)__msa_xori_b(p0_in, 0x80);                               \
  q0_m = (v16i8)__msa_xori_b(q0_in, 0x80);                               \
  q1_m = (v16i8)__msa_xori_b(q1_in, 0x80);                               \
                                                                         \
  filt = __msa_subs_s_b(p1_m, q1_m);                                     \
  filt = filt & (v16i8)hev_in;                                           \
  q0_sub_p0 = q0_m - p0_m;                                               \
  filt_sign = __msa_clti_s_b(filt, 0);                                   \
                                                                         \
  cnst3h = __msa_ldi_h(3);                                               \
  q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0);               \
  q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h);       \
  filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                         \
  filt_r += q0_sub_p0_r;                                                 \
  filt_r = __msa_sat_s_h(filt_r, 7);                                     \
                                                                         \
  /* combine left and right part */                                      \
  filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r);                    \
                                                                         \
  filt = filt & (v16i8)mask_in;                                          \
  cnst4b = __msa_ldi_b(4);                                               \
  filt1 = __msa_adds_s_b(filt, cnst4b);                                  \
  filt1 >>= 3;                                                           \
                                                                         \
  cnst3b = __msa_ldi_b(3);                                               \
  filt2 = __msa_adds_s_b(filt, cnst3b);                                  \
  filt2 >>= 3;                                                           \
                                                                         \
  q0_m = __msa_subs_s_b(q0_m, filt1);                                    \
  q0_out = __msa_xori_b((v16u8)q0_m, 0x80);                              \
  p0_m = __msa_adds_s_b(p0_m, filt2);                                    \
  p0_out = __msa_xori_b((v16u8)p0_m, 0x80);                              \
                                                                         \
  filt = __msa_srari_b(filt1, 1);                                        \
  hev_in = __msa_xori_b((v16u8)hev_in, 0xff);                            \
  filt = filt & (v16i8)hev_in;                                           \
                                                                         \
  q1_m = __msa_subs_s_b(q1_m, filt);                                     \
  q1_out = __msa_xori_b((v16u8)q1_m, 0x80);                              \
  p1_m = __msa_adds_s_b(p1_m, filt);                                     \
  p1_out = __msa_xori_b((v16u8)p1_m, 0x80);                              \
}

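/* Same four-tap filter as VP9_LPF_FILTER4_8W, but the widened 16-bit
 * arithmetic is evaluated for both the right and left halves before being
 * packed back together, so all 16 byte lanes are filtered.  hev_in is
 * inverted in place here as well.
 */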
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
                           p1_out, p0_out, q0_out, q1_out) {             \
  v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                    \
  v16i8 filt, filt1, filt2, cnst4b, cnst3b;                              \
  v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;                \
                                                                         \
  p1_m = (v16i8)__msa_xori_b(p1_in, 0x80);                               \
  p0_m = (v16i8)__msa_xori_b(p0_in, 0x80);                               \
  q0_m = (v16i8)__msa_xori_b(q0_in, 0x80);                               \
  q1_m = (v16i8)__msa_xori_b(q1_in, 0x80);                               \
                                                                         \
  filt = __msa_subs_s_b(p1_m, q1_m);                                     \
                                                                         \
  filt = filt & (v16i8)hev_in;                                           \
                                                                         \
  q0_sub_p0 = q0_m - p0_m;                                               \
  filt_sign = __msa_clti_s_b(filt, 0);                                   \
                                                                         \
  cnst3h = __msa_ldi_h(3);                                               \
  q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0);               \
  q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h);       \
  filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                         \
  filt_r += q0_sub_p0_r;                                                 \
  filt_r = __msa_sat_s_h(filt_r, 7);                                     \
                                                                         \
  q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0);               \
  q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h);       \
  filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                         \
  filt_l += q0_sub_p0_l;                                                 \
  filt_l = __msa_sat_s_h(filt_l, 7);                                     \
                                                                         \
  filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);                    \
  filt = filt & (v16i8)mask_in;                                          \
                                                                         \
  cnst4b = __msa_ldi_b(4);                                               \
  filt1 = __msa_adds_s_b(filt, cnst4b);                                  \
  filt1 >>= 3;                                                           \
                                                                         \
  cnst3b = __msa_ldi_b(3);                                               \
  filt2 = __msa_adds_s_b(filt, cnst3b);                                  \
  filt2 >>= 3;                                                           \
                                                                         \
  q0_m = __msa_subs_s_b(q0_m, filt1);                                    \
  q0_out = __msa_xori_b((v16u8)q0_m, 0x80);                              \
  p0_m = __msa_adds_s_b(p0_m, filt2);                                    \
  p0_out = __msa_xori_b((v16u8)p0_m, 0x80);                              \
                                                                         \
  filt = __msa_srari_b(filt1, 1);                                        \
  hev_in = __msa_xori_b((v16u8)hev_in, 0xff);                            \
  filt = filt & (v16i8)hev_in;                                           \
                                                                         \
  q1_m = __msa_subs_s_b(q1_m, filt);                                     \
  q1_out = __msa_xori_b((v16u8)q1_m, 0x80);                              \
  p1_m = __msa_adds_s_b(p1_m, filt);                                     \
  p1_out = __msa_xori_b((v16u8)p1_m, 0x80);                              \
}

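/* "flat" decision for the 8-tap filter path.  flat_out is both input and
 * output: the caller seeds it with max(|p1 - p0|, |q1 - q0|) as left behind
 * by LPF_MASK_HEV, and a lane comes out as 0xff only when that value and
 * |p2 - p0|, |q2 - q0|, |p3 - p0|, |q3 - q0| are all <= 1.  Note that the
 * result is also ANDed with a variable named mask picked up from the
 * enclosing scope, not from a macro argument.
 */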
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) {  \
  v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0;         \
  v16u8 zero_in = { 0 };                                                 \
                                                                         \
  tmp = __msa_ori_b(zero_in, 1);                                         \
  p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in);                            \
  q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in);                            \
  p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in);                            \
  q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in);                            \
                                                                         \
  p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0);                 \
  flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out);                       \
  p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0);                 \
  flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out);                       \
                                                                         \
  flat_out = (tmp < (v16u8)flat_out);                                    \
  flat_out = __msa_xori_b(flat_out, 0xff);                               \
  flat_out = flat_out & (mask);                                          \
}

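/* "flat2" decision for the 16-wide filter path: a lane is 0xff when
 * |p4 - p0| ... |p7 - p0| and |q4 - q0| ... |q7 - q0| are all <= 1,
 * ANDed with flat_in.
 */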
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,  \
                  q5_in, q6_in, q7_in, flat_in, flat2_out) {        \
  v16u8 tmp, zero_in = { 0 };                                       \
  v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;         \
  v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;         \
                                                                    \
  tmp = __msa_ori_b(zero_in, 1);                                    \
  p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in);                       \
  q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in);                       \
  p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in);                       \
  q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in);                       \
  p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in);                       \
  q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in);                       \
  p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in);                       \
  q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in);                       \
                                                                    \
  p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0);            \
  flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0);              \
  flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out);                \
  p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0);            \
  flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out);                \
  p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0);            \
  flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out);                \
                                                                    \
  flat2_out = (tmp < (v16u8)flat2_out);                             \
  flat2_out = __msa_xori_b(flat2_out, 0xff);                        \
  flat2_out = flat2_out & flat_in;                                  \
}

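/* 8-tap smoothing filter used on the flat path.  Inputs are unsigned
 * halfword vectors (8 pixels already widened to 16 bits); each output is
 * the rounded 3-bit shift of an 8-tap weighted sum, e.g.
 * p2_filt8_out = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3.
 */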
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,                  \
                    q0_in, q1_in, q2_in, q3_in,                  \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out,    \
                    q0_filt8_out, q1_filt8_out, q2_filt8_out) {  \
  v8u16 tmp0, tmp1, tmp2;                                        \
                                                                 \
  tmp2 = p2_in + p1_in + p0_in;                                  \
  tmp0 = p3_in << 1;                                             \
                                                                 \
  tmp0 = tmp0 + tmp2 + q0_in;                                    \
  tmp1 = tmp0 + p3_in + p2_in;                                   \
  p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);           \
                                                                 \
  tmp1 = tmp0 + p1_in + q1_in;                                   \
  p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);           \
                                                                 \
  tmp1 = q2_in + q1_in + q0_in;                                  \
  tmp2 = tmp2 + tmp1;                                            \
  tmp0 = tmp2 + (p0_in);                                         \
  tmp0 = tmp0 + (p3_in);                                         \
  p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3);           \
                                                                 \
  tmp0 = q2_in + q3_in;                                          \
  tmp0 = p0_in + tmp1 + tmp0;                                    \
  tmp1 = q3_in + q3_in;                                          \
  tmp1 = tmp1 + tmp0;                                            \
  q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);           \
                                                                 \
  tmp0 = tmp2 + q3_in;                                           \
  tmp1 = tmp0 + q0_in;                                           \
  q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);           \
                                                                 \
  tmp1 = tmp0 - p2_in;                                           \
  tmp0 = q1_in + q3_in;                                          \
  tmp1 = tmp0 + tmp1;                                            \
  q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);           \
}

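/* Per-lane filter decisions.  hev_out is 0xff where
 * max(|p1 - p0|, |q1 - q0|) > thresh (high edge variance).  mask_out is
 * 0xff where |p0 - q0| * 2 + |p1 - q1| / 2 <= b_limit and every
 * neighbouring difference (|p3 - p2|, |p2 - p1|, |p1 - p0|, |q1 - q0|,
 * |q2 - q1|, |q3 - q2|) is <= limit.  flat_out is left holding
 * max(|p1 - p0|, |q1 - q0|) for reuse by VP9_FLAT4.
 */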
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                 \
                     q0_in, q1_in, q2_in, q3_in,                 \
                     limit_in, b_limit_in, thresh_in,            \
                     hev_out, mask_out, flat_out) {              \
  v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
  v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
                                                                 \
  /* absolute subtraction of pixel values */                     \
  p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
  p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
  p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
  q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
  q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
  q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
  p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
  p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
                                                                 \
  /* calculation of hev */                                       \
  flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
  hev_out = thresh_in < (v16u8)flat_out;                         \
                                                                 \
  /* calculation of mask */                                      \
  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
  p1_asub_q1_m >>= 1;                                            \
  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
                                                                 \
  mask_out = b_limit_in < p0_asub_q0_m;                          \
  mask_out = __msa_max_u_b(flat_out, mask_out);                  \
  p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
  mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
  q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
  mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
                                                                 \
  mask_out = limit_in < (v16u8)mask_out;                         \
  mask_out = __msa_xori_b(mask_out, 0xff);                       \
}
#endif  /* VPX_DSP_LOOPFILTER_MSA_H_ */