/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_LOOPFILTER_MSA_H_
#define VPX_DSP_LOOPFILTER_MSA_H_

#include "vpx_dsp/mips/macros_msa.h"
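
/* Core VP9 "filter4" update, applied to 16 pixel columns at once. Inputs
 * and outputs are v16u8 vectors holding the rows p1..q1; mask and hev are
 * the per-byte masks produced by LPF_MASK_HEV below. Pixels are biased by
 * 128 (xor 0x80) so the taps can use saturating signed-byte arithmetic.
 */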
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \
                           p0_out, q0_out, q1_out)                        \
  {                                                                       \
    v16i8 p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2;                \
    const v16i8 cnst4b = __msa_ldi_b(4);                                  \
    const v16i8 cnst3b = __msa_ldi_b(3);                                  \
                                                                          \
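    /* bias the pixels by 128 so saturating signed arithmetic applies */  \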
    p1_m = (v16i8)__msa_xori_b(p1_in, 0x80);                              \
    p0_m = (v16i8)__msa_xori_b(p0_in, 0x80);                              \
    q0_m = (v16i8)__msa_xori_b(q0_in, 0x80);                              \
    q1_m = (v16i8)__msa_xori_b(q1_in, 0x80);                              \
                                                                          \
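    /* filt = clamp(p1 - q1) & hev, then clamp(filt + 3*(q0 - p0)) & mask */ \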
    filt = __msa_subs_s_b(p1_m, q1_m);                                    \
    filt &= hev;                                                          \
    q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m);                               \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                               \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                               \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                               \
    filt &= mask;                                                         \
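    /* inner taps: q0 -= clamp(filt + 4) >> 3, p0 += clamp(filt + 3) >> 3 */ \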
    t1 = __msa_adds_s_b(filt, cnst4b);                                    \
    t1 >>= cnst3b;                                                        \
    t2 = __msa_adds_s_b(filt, cnst3b);                                    \
    t2 >>= cnst3b;                                                        \
    q0_m = __msa_subs_s_b(q0_m, t1);                                      \
    q0_out = __msa_xori_b((v16u8)q0_m, 0x80);                             \
    p0_m = __msa_adds_s_b(p0_m, t2);                                      \
    p0_out = __msa_xori_b((v16u8)p0_m, 0x80);                             \
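    /* outer taps: p1/q1 move by (t1 + 1) >> 1, only where hev is off */  \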
    filt = __msa_srari_b(t1, 1);                                          \
    hev = __msa_xori_b(hev, 0xff);                                        \
    filt &= hev;                                                          \
    q1_m = __msa_subs_s_b(q1_m, filt);                                    \
    q1_out = __msa_xori_b((v16u8)q1_m, 0x80);                             \
    p1_m = __msa_adds_s_b(p1_m, filt);                                    \
    p1_out = __msa_xori_b((v16u8)p1_m, 0x80);                             \
  }
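
/* VP9 "flat" test for the 8-tap filter: a byte lane stays selected only
 * if |p2-p0|, |q2-q0|, |p3-p0| and |q3-q0| are all <= 1. flat_out comes
 * in holding max(|p1-p0|, |q1-q0|) from LPF_MASK_HEV. Note that the macro
 * also reads a variable named "mask" from the expansion site.
 */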
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)    \
  {                                                                      \
    v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
    v16u8 zero_in = { 0 };                                               \
                                                                         \
    tmp_flat4 = __msa_ori_b(zero_in, 1);                                 \
    p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in);                          \
    q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in);                          \
    p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in);                          \
    q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in);                          \
                                                                         \
    p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0);               \
    flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out);                     \
    p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0);               \
    flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out);                     \
                                                                         \
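    /* lanes are flat iff the running maximum is <= 1, i.e. !(1 < max) */ \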
    flat_out = (tmp_flat4 < (v16u8)flat_out);                            \
    flat_out = __msa_xori_b(flat_out, 0xff);                             \
    flat_out = flat_out & (mask);                                        \
  }
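
/* Extends the flat test to the wide taps p7..p4 / q4..q7 and produces
 * the "flat2" mask for the 16-wide filter: a lane is selected only when
 * all eight wide differences are <= 1 and flat_in is already set.
 */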
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \
                  q6_in, q7_in, flat_in, flat2_out)                       \
  {                                                                       \
    v16u8 tmp_flat5, zero_in = { 0 };                                     \
    v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;             \
    v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;             \
                                                                          \
    tmp_flat5 = __msa_ori_b(zero_in, 1);                                  \
    p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in);                           \
    q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in);                           \
    p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in);                           \
    q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in);                           \
    p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in);                           \
    q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in);                           \
    p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in);                           \
    q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in);                           \
                                                                          \
    p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0);                \
    flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0);                  \
    flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out);                    \
    p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0);                \
    flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out);                    \
    p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0);                \
    flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out);                    \
                                                                          \
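    /* flat2 iff every wide difference is <= 1, restricted to flat_in */  \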
    flat2_out = (tmp_flat5 < (v16u8)flat2_out);                           \
    flat2_out = __msa_xori_b(flat2_out, 0xff);                            \
    flat2_out = flat2_out & flat_in;                                      \
  }
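
/* 7-tap smoothing filter applied where the flat mask is set. Each output
 * is ROUND_POWER_OF_TWO of an 8-term sum over p3..q3, computed on v8u16
 * lanes (eight 16-bit-widened pixels at a time); the per-output sums are
 * noted inline below.
 */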
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
                    q1_filt8_out, q2_filt8_out)                             \
  {                                                                         \
    v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2;                            \
                                                                            \
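    /* p2' = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3 */                   \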
    tmp_filt8_2 = p2_in + p1_in + p0_in;                                    \
    tmp_filt8_0 = p3_in << 1;                                               \
                                                                            \
    tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in;                        \
    tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in;                              \
    p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3);             \
                                                                            \
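    /* p1' = (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3 */              \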
    tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in;                              \
    p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3);             \
                                                                            \
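    /* p0' = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3 */             \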
    tmp_filt8_1 = q2_in + q1_in + q0_in;                                    \
    tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1;                                \
    tmp_filt8_0 = tmp_filt8_2 + (p0_in);                                    \
    tmp_filt8_0 = tmp_filt8_0 + (p3_in);                                    \
    p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3);             \
                                                                            \
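    /* q2' = (p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4) >> 3 */                   \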
    tmp_filt8_0 = q2_in + q3_in;                                            \
    tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0;                        \
    tmp_filt8_1 = q3_in + q3_in;                                            \
    tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0;                                \
    q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3);             \
                                                                            \
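    /* q0' = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3 */             \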
    tmp_filt8_0 = tmp_filt8_2 + q3_in;                                      \
    tmp_filt8_1 = tmp_filt8_0 + q0_in;                                      \
    q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3);             \
                                                                            \
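    /* q1' = (p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4) >> 3 */              \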
    tmp_filt8_1 = tmp_filt8_0 - p2_in;                                      \
    tmp_filt8_0 = q1_in + q3_in;                                            \
    tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1;                                \
    q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3);             \
  }
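
/* Builds the three per-lane control masks from the limit, b_limit and
 * thresh thresholds: hev_out (high edge variance), mask_out (whether the
 * lane is filtered at all) and flat_out = max(|p1-p0|, |q1-q0|), which
 * VP9_FLAT4 then reuses as the seed of its flatness test.
 */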
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
                     limit_in, b_limit_in, thresh_in, hev_out, mask_out,     \
                     flat_out)                                               \
  {                                                                          \
    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;            \
    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;            \
                                                                             \
    /* absolute subtraction of pixel values */                               \
    p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                             \
    p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                             \
    p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                             \
    q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                             \
    q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                             \
    q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                             \
    p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                             \
    p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                             \
                                                                             \
    /* calculation of hev */                                                 \
    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);                    \
    hev_out = thresh_in < (v16u8)flat_out;                                   \
                                                                             \
    /* calculation of mask */                                                \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);               \
    p1_asub_q1_m >>= 1;                                                      \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);               \
                                                                             \
    mask_out = b_limit_in < p0_asub_q0_m;                                    \
    mask_out = __msa_max_u_b(flat_out, mask_out);                            \
    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);                \
    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);                        \
    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);                \
    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);                        \
                                                                             \
    mask_out = limit_in < (v16u8)mask_out;                                   \
    mask_out = __msa_xori_b(mask_out, 0xff);                                 \
  }
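
/* A sketch of the typical call sequence for a 4-tap edge, as used by the
 * MSA loop-filter sources (variable names here are illustrative only):
 *
 *   v16u8 mask, hev, flat;
 *   LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
 *                hev, mask, flat);
 *   VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
 *                      q1_out);
 */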
#endif /* VPX_DSP_LOOPFILTER_MSA_H_ */