/* loopfilter_msa.h — revision 7ce0a1d1337c01056ba24006efab21f00e179e04 */
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_LOOPFILTER_MSA_H_
#define VPX_DSP_LOOPFILTER_MSA_H_

#include "vpx_dsp/mips/macros_msa.h"

/* VP9 4-tap loop filter, right-half ("8-wide") variant.
 *
 * Applies the standard VP9 narrow filter to the low (right) 8 bytes of the
 * 16-byte input vectors p1..q1; the 8 filtered results are packed into both
 * halves of the outputs via pckev.  Pixels are XORed with 0x80 so unsigned
 * 8-bit samples can be processed with signed-saturating arithmetic.
 *
 * In:  p1_in..q1_in (v16u8 pixels), mask_in (filter on/off per byte),
 *      hev_in (high-edge-variance selector per byte).
 * Out: p1_out..q1_out (v16u8 filtered pixels).
 *
 * NOTE: hev_in is CLOBBERED — it is inverted in place near the end so the
 * p1/q1 adjustment applies only where hev was 0.
 */
#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
                           p1_out, p0_out, q0_out, q1_out) { \
  v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
  v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
  v8i16 q0_sub_p0_r, filt_r, cnst3h; \
  \
  /* move pixels into signed range: x ^ 0x80 == x - 128 */ \
  p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
  p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
  q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
  q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
  \
  /* filt = clip(p1 - q1), kept only where hev is set */ \
  filt = __msa_subs_s_b(p1_m, q1_m); \
  filt = filt & (v16i8)hev_in; \
  q0_sub_p0 = q0_m - p0_m; \
  filt_sign = __msa_clti_s_b(filt, 0); \
  \
  /* right half, widened to 16 bit: filt = sat8(filt + 3 * (q0 - p0)) */ \
  cnst3h = __msa_ldi_h(3); \
  q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
  q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
  filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
  filt_r += q0_sub_p0_r; \
  filt_r = __msa_sat_s_h(filt_r, 7); \
  \
  /* combine left and right part */ \
  filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \
  \
  filt = filt & (v16i8)mask_in; \
  /* filt1 = (filt + 4) >> 3 adjusts q0, filt2 = (filt + 3) >> 3 adjusts p0 */ \
  cnst4b = __msa_ldi_b(4); \
  filt1 = __msa_adds_s_b(filt, cnst4b); \
  filt1 >>= 3; \
  \
  cnst3b = __msa_ldi_b(3); \
  filt2 = __msa_adds_s_b(filt, cnst3b); \
  filt2 >>= 3; \
  \
  q0_m = __msa_subs_s_b(q0_m, filt1); \
  q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
  p0_m = __msa_adds_s_b(p0_m, filt2); \
  p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
  \
  /* outer taps: p1/q1 move by (filt1 + 1) >> 1, only where hev was 0 */ \
  filt = __msa_srari_b(filt1, 1); \
  hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
  filt = filt & (v16i8)hev_in; \
  \
  q1_m = __msa_subs_s_b(q1_m, filt); \
  q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
  p1_m = __msa_adds_s_b(p1_m, filt); \
  p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
}

/* VP9 4-tap loop filter, full-width ("4-wide") variant.
 *
 * Same computation as VP9_LPF_FILTER4_8W, but the 16-bit intermediate
 * filter value is computed for BOTH the right (ilvr) and left (ilvl)
 * halves, so all 16 bytes of the vectors are filtered independently.
 *
 * NOTE: hev_in is CLOBBERED (inverted in place), as in the 8W variant.
 */
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
                           p1_out, p0_out, q0_out, q1_out) { \
  v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
  v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
  v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
  \
  /* move pixels into signed range: x ^ 0x80 == x - 128 */ \
  p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
  p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
  q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
  q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
  \
  /* filt = clip(p1 - q1), kept only where hev is set */ \
  filt = __msa_subs_s_b(p1_m, q1_m); \
  \
  filt = filt & (v16i8)hev_in; \
  \
  q0_sub_p0 = q0_m - p0_m; \
  filt_sign = __msa_clti_s_b(filt, 0); \
  \
  /* right half, widened to 16 bit: filt = sat8(filt + 3 * (q0 - p0)) */ \
  cnst3h = __msa_ldi_h(3); \
  q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
  q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
  filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
  filt_r += q0_sub_p0_r; \
  filt_r = __msa_sat_s_h(filt_r, 7); \
  \
  /* left half, same computation on the high 8 bytes */ \
  q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \
  q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \
  filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
  filt_l += q0_sub_p0_l; \
  filt_l = __msa_sat_s_h(filt_l, 7); \
  \
  /* combine left and right halves back to 16 bytes */ \
  filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
  filt = filt & (v16i8)mask_in; \
  \
  /* filt1 = (filt + 4) >> 3 adjusts q0, filt2 = (filt + 3) >> 3 adjusts p0 */ \
  cnst4b = __msa_ldi_b(4); \
  filt1 = __msa_adds_s_b(filt, cnst4b); \
  filt1 >>= 3; \
  \
  cnst3b = __msa_ldi_b(3); \
  filt2 = __msa_adds_s_b(filt, cnst3b); \
  filt2 >>= 3; \
  \
  q0_m = __msa_subs_s_b(q0_m, filt1); \
  q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
  p0_m = __msa_adds_s_b(p0_m, filt2); \
  p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
  \
  /* outer taps: p1/q1 move by (filt1 + 1) >> 1, only where hev was 0 */ \
  filt = __msa_srari_b(filt1, 1); \
  hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
  filt = filt & (v16i8)hev_in; \
  \
  q1_m = __msa_subs_s_b(q1_m, filt); \
  q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
  p1_m = __msa_adds_s_b(p1_m, filt); \
  p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
}

/* Per-byte "flat" selector for the 8-tap filter path.
 *
 * flat_out enters holding a partial maximum (seeded by LPF_MASK_HEV with
 * max(|p1-p0|, |q1-q0|)) and is folded with |p2-p0|, |q2-q0|, |p3-p0| and
 * |q3-q0|.  A byte is "flat" (0xff) when that maximum is <= 1.
 *
 * NOTE(review): this macro uses a variable literally named `mask` in its
 * final AND — `mask` is NOT a parameter and must be in scope at every call
 * site (non-hygienic macro capture); confirm callers define it.
 */
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) { \
  v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
  v16u8 zero_in = { 0 }; \
  \
  tmp = __msa_ori_b(zero_in, 1); \
  p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
  q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
  p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
  q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
  \
  p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
  flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \
  p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
  flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
  \
  /* flat = (max <= 1): compare then invert, then gate with mask */ \
  flat_out = (tmp < (v16u8)flat_out); \
  flat_out = __msa_xori_b(flat_out, 0xff); \
  flat_out = flat_out & (mask); \
}

/* Per-byte "flat2" selector for the 16-tap (wide) filter path.
 *
 * flat2_out is 0xff where max(|p4-p0|..|p7-p0|, |q4-q0|..|q7-q0|) <= 1,
 * ANDed with flat_in so it is a strict subset of the flat selection.
 */
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, \
                  q5_in, q6_in, q7_in, flat_in, flat2_out) { \
  v16u8 tmp, zero_in = { 0 }; \
  v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
  v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
  \
  tmp = __msa_ori_b(zero_in, 1); \
  p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \
  q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \
  p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \
  q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \
  p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \
  q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \
  p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \
  q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \
  \
  p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \
  flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \
  flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \
  p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \
  flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \
  p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \
  flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \
  \
  /* flat2 = (max <= 1), gated with the narrower flat selection */ \
  flat2_out = (tmp < (v16u8)flat2_out); \
  flat2_out = __msa_xori_b(flat2_out, 0xff); \
  flat2_out = flat2_out & flat_in; \
}

/* 8-tap smoothing filter.
 *
 * Inputs p3..q3 are pixel values widened to 16-bit lanes (v8u16).  Each
 * output is a rounded average of 8 taps: the running sums below realize
 * e.g. p2' = round((p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0) / 8) via
 * srari_h(sum, 3); the other five outputs slide the 8-tap window toward q.
 * Outputs are v8i16 and still need narrowing/packing by the caller.
 */
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, \
                    q0_in, q1_in, q2_in, q3_in, \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out, \
                    q0_filt8_out, q1_filt8_out, q2_filt8_out) { \
  v8u16 tmp0, tmp1, tmp2; \
  \
  tmp2 = p2_in + p1_in + p0_in; \
  tmp0 = p3_in << 1; \
  \
  tmp0 = tmp0 + tmp2 + q0_in; \
  tmp1 = tmp0 + p3_in + p2_in; \
  p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
  \
  tmp1 = tmp0 + p1_in + q1_in; \
  p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
  \
  tmp1 = q2_in + q1_in + q0_in; \
  tmp2 = tmp2 + tmp1; \
  tmp0 = tmp2 + (p0_in); \
  tmp0 = tmp0 + (p3_in); \
  p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \
  \
  tmp0 = q2_in + q3_in; \
  tmp0 = p0_in + tmp1 + tmp0; \
  tmp1 = q3_in + q3_in; \
  tmp1 = tmp1 + tmp0; \
  q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
  \
  tmp0 = tmp2 + q3_in; \
  tmp1 = tmp0 + q0_in; \
  q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
  \
  tmp1 = tmp0 - p2_in; \
  tmp0 = q1_in + q3_in; \
  tmp1 = tmp0 + tmp1; \
  q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
}

/* Computes the three per-byte selectors used by the loop filter:
 *   hev_out  = 0xff where max(|p1-p0|, |q1-q0|) > thresh_in
 *   mask_out = 0xff where the edge should be filtered, i.e.
 *              sat(2*|p0-q0| + |p1-q1|/2) <= b_limit_in AND every
 *              neighbor difference (|p3-p2|..|q3-q2|) <= limit_in
 *   flat_out = max(|p1-p0|, |q1-q0|) — a partial result that VP9_FLAT4
 *              folds further (it is NOT yet a 0/0xff selector here).
 */
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \
                     q0_in, q1_in, q2_in, q3_in, \
                     limit_in, b_limit_in, thresh_in, \
                     hev_out, mask_out, flat_out) { \
  v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
  v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
  \
  /* absolute subtraction of pixel values */ \
  p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
  p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
  p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
  q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
  q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
  q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
  p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
  p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
  \
  /* calculation of hev */ \
  flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
  hev_out = thresh_in < (v16u8)flat_out; \
  \
  /* calculation of mask: 2*|p0-q0| + |p1-q1|/2 vs b_limit */ \
  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
  p1_asub_q1_m >>= 1; \
  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
  \
  mask_out = b_limit_in < p0_asub_q0_m; \
  mask_out = __msa_max_u_b(flat_out, mask_out); \
  p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
  mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
  q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
  mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
  \
  /* mask = !(any_diff > limit): compare then invert */ \
  mask_out = limit_in < (v16u8)mask_out; \
  mask_out = __msa_xori_b(mask_out, 0xff); \
}
#endif  /* VPX_DSP_LOOPFILTER_MSA_H_ */