/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "vp8/encoder/denoising.h"
#include "vp8/common/reconinter.h"
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8_rtcd.h"

#include <emmintrin.h>
#include "vpx_ports/emmintrin_compat.h"

/* NOTE(review): abs() is used below; <stdlib.h> is presumably pulled in
 * transitively through one of the headers above -- confirm. */

/* Compute the sum of all pixel differences of this MB.
 *
 * acc_diff holds 16 per-column adjustment accumulators, each a signed
 * 8-bit value.  Each lane is sign-extended to 16 bits by unpacking the
 * register with itself and arithmetic-shifting right by 8, the two
 * halves are added, _mm_madd_epi16 with a vector of ones produces four
 * 32-bit partial sums, and two shift-and-add steps reduce those to a
 * single total in the low dword.  The absolute value of that total is
 * returned.
 */
static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) {
  const __m128i k_1 = _mm_set1_epi16(1);
  /* Sign-extend the low/high 8 bytes to 16-bit lanes. */
  const __m128i acc_diff_lo = _mm_srai_epi16(
      _mm_unpacklo_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_hi = _mm_srai_epi16(
      _mm_unpackhi_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
  /* Pairwise-sum the eight 16-bit lanes into four 32-bit lanes. */
  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
  /* Horizontal reduction: 4 lanes -> 2 lanes -> 1 lane. */
  const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
                                          _mm_srli_si128(hg_fe_dc_ba, 8));
  const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
                                         _mm_srli_si128(hgfe_dcba, 4));
  unsigned int sum_diff = abs(_mm_cvtsi128_si32(hgfedcba));

  return sum_diff;
}

/* Temporal denoising of a 16x16 luma macroblock (SSE2).
 *
 * For each pixel the motion-compensated running average
 * (mc_running_avg_y) is pulled toward the source signal (sig) by an
 * adjustment whose size depends on the absolute difference between the
 * two (level thresholds 4/8/16, scaled by motion_magnitude and
 * increase_denoising), and the result is written to running_avg_y.
 *
 * Returns FILTER_BLOCK on success, after copying the filtered result
 * back over sig via vp8_copy_mem16x16.  Returns COPY_BLOCK (caller
 * should leave the block undenoised) when the accumulated adjustment
 * exceeds the sum-diff threshold even after the weaker second-pass
 * filter below.
 */
int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
                             int mc_avg_y_stride,
                             unsigned char *running_avg_y, int avg_y_stride,
                             unsigned char *sig, int sig_stride,
                             unsigned int motion_magnitude,
                             int increase_denoising)
{
  unsigned char *running_avg_y_start = running_avg_y;
  unsigned char *sig_start = sig;
  unsigned int sum_diff_thresh;
  int r;
  /* Strengthen the filter by one level when the application asked for
   * more denoising and motion is low. */
  int shift_inc = (increase_denoising &&
                   motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
                  1 : 0;
  /* Per-column accumulator of signed adjustments; see abs_sum_diff_16x1. */
  __m128i acc_diff = _mm_setzero_si128();
  const __m128i k_0 = _mm_setzero_si128();
  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
  const __m128i k_8 = _mm_set1_epi8(8);
  const __m128i k_16 = _mm_set1_epi8(16);
  /* Modify each level's adjustment according to motion_magnitude. */
  const __m128i l3 = _mm_set1_epi8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
      7 + shift_inc : 6);
  /* Difference between level 3 and level 2 is 2. */
  const __m128i l32 = _mm_set1_epi8(2);
  /* Difference between level 2 and level 1 is 1. */
  const __m128i l21 = _mm_set1_epi8(1);

  for (r = 0; r < 16; ++r)
  {
    /* Calculate differences */
    const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
    const __m128i v_mc_running_avg_y = _mm_loadu_si128(
        (__m128i *)(&mc_running_avg_y[0]));
    __m128i v_running_avg_y;
    /* Saturating unsigned subtraction: in each lane at most one of
     * pdiff/ndiff is non-zero, so OR-ing them yields |diff|. */
    const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
    const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
    /* Obtain the sign. FF if diff is negative.  (A lane with diff == 0
     * also gets FF, but its adjustment below is 0, so this is harmless.) */
    const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
    /* Clamp absolute difference to 16 to be used to get mask. Doing this
     * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
    const __m128i clamped_absdiff = _mm_min_epu8(
        _mm_or_si128(pdiff, ndiff), k_16);
    /* Get masks for l2 l1 and l0 adjustments */
    const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
    const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
    const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
    /* Get adjustments for l2, l1, and l0 */
    __m128i adj2 = _mm_and_si128(mask2, l32);
    const __m128i adj1 = _mm_and_si128(mask1, l21);
    /* Level 0 (smallest diffs): the adjustment is the diff itself. */
    const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
    __m128i adj, padj, nadj;

    /* Combine the adjustments and get absolute adjustments.
     * Starting from l3 and subtracting the stacked per-level deltas
     * selects the adjustment for the lane's level; mask0 lanes take
     * adj0 instead. */
    adj2 = _mm_add_epi8(adj2, adj1);
    adj = _mm_sub_epi8(l3, adj2);
    adj = _mm_andnot_si128(mask0, adj);
    adj = _mm_or_si128(adj, adj0);

    /* Restore the sign and get positive and negative adjustments. */
    padj = _mm_andnot_si128(diff_sign, adj);
    nadj = _mm_and_si128(diff_sign, adj);

    /* Calculate filtered value (saturating, stays within [0,255]). */
    v_running_avg_y = _mm_adds_epu8(v_sig, padj);
    v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
    _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

    /* Adjustments <=7, and each element in acc_diff can fit in signed
     * char.
     */
    acc_diff = _mm_adds_epi8(acc_diff, padj);
    acc_diff = _mm_subs_epi8(acc_diff, nadj);

    /* Update pointers for next iteration. */
    sig += sig_stride;
    mc_running_avg_y += mc_avg_y_stride;
    running_avg_y += avg_y_stride;
  }

  {
    /* Compute the sum of all pixel differences of this MB. */
    unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
    sum_diff_thresh = SUM_DIFF_THRESHOLD;
    if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
    if (abs_sum_diff > sum_diff_thresh) {
      // Before returning to copy the block (i.e., apply no denoising),
      // check if we can still apply some (weaker) temporal filtering to
      // this block, that would otherwise not be denoised at all. Simplest
      // is to apply an additional adjustment to running_avg_y to bring it
      // closer to sig. The adjustment is capped by a maximum delta, and
      // chosen such that in most cases the resulting sum_diff will be
      // within the acceptable range given by sum_diff_thresh.

      // The delta is set by the excess of absolute pixel diff over the
      // threshold.
      int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const __m128i k_delta = _mm_set1_epi8(delta);
        /* Rewind the pointers to the top of the macroblock. */
        sig -= sig_stride * 16;
        mc_running_avg_y -= mc_avg_y_stride * 16;
        running_avg_y -= avg_y_stride * 16;
        for (r = 0; r < 16; ++r) {
          __m128i v_running_avg_y =
              _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
          // Calculate differences.
          const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
          const __m128i v_mc_running_avg_y =
              _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
          const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
          const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
          // Obtain the sign. FF if diff is negative.
          const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
          // Clamp absolute difference to delta to get the adjustment.
          const __m128i adj =
              _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
          // Restore the sign and get positive and negative adjustments.
          __m128i padj, nadj;
          padj = _mm_andnot_si128(diff_sign, adj);
          nadj = _mm_and_si128(diff_sign, adj);
          // Calculate filtered value.  Note: the adjustment direction is
          // inverted relative to the first pass -- it pulls running_avg_y
          // back toward sig.
          v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
          v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
          _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

          // Accumulate the adjustments.
          acc_diff = _mm_subs_epi8(acc_diff, padj);
          acc_diff = _mm_adds_epi8(acc_diff, nadj);

          // Update pointers for next iteration.
          sig += sig_stride;
          mc_running_avg_y += mc_avg_y_stride;
          running_avg_y += avg_y_stride;
        }
        abs_sum_diff = abs_sum_diff_16x1(acc_diff);
        if (abs_sum_diff > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }

  /* Denoising accepted: propagate the filtered block into the source. */
  vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
  return FILTER_BLOCK;
}

/* Temporal denoising of an 8x8 chroma block (SSE2).
 *
 * Same level-based scheme as vp8_denoiser_filter_sse2, applied to an
 * 8x8 U or V block; two 8-byte rows are packed into each 128-bit
 * register so the main loop runs 4 iterations of 2 rows each.
 * Additionally skips denoising entirely (returns COPY_BLOCK) when the
 * block's mean is close to mid-gray (128), to avoid disturbing flat
 * color areas.
 *
 * Returns FILTER_BLOCK after copying the filtered result back into sig
 * via vp8_copy_mem8x8, or COPY_BLOCK when the block should be left
 * undenoised.
 */
int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg,
                                int mc_avg_stride,
                                unsigned char *running_avg, int avg_stride,
                                unsigned char *sig, int sig_stride,
                                unsigned int motion_magnitude,
                                int increase_denoising) {
  unsigned char *running_avg_start = running_avg;
  unsigned char *sig_start = sig;
  unsigned int sum_diff_thresh;
  int r;
  int shift_inc = (increase_denoising &&
                   motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0;
  __m128i acc_diff = _mm_setzero_si128();
  const __m128i k_0 = _mm_setzero_si128();
  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
  const __m128i k_8 = _mm_set1_epi8(8);
  const __m128i k_16 = _mm_set1_epi8(16);
  /* Modify each level's adjustment according to motion_magnitude. */
  const __m128i l3 = _mm_set1_epi8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ?
      7 + shift_inc : 6);
  /* Difference between level 3 and level 2 is 2. */
  const __m128i l32 = _mm_set1_epi8(2);
  /* Difference between level 2 and level 1 is 1. */
  const __m128i l21 = _mm_set1_epi8(1);

  {
    const __m128i k_1 = _mm_set1_epi16(1);
    __m128i vec_sum_block = _mm_setzero_si128();

    // Avoid denoising color signal if it is close to average level.
    /* Sum all 64 source pixels: widen each 8-pixel row to 16-bit lanes
     * and accumulate, then reduce horizontally below. */
    for (r = 0; r < 8; ++r) {
      const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0]));
      const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0);
      vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack);
      sig += sig_stride;
    }
    sig -= sig_stride * 8;
    {
      const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1);
      const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
                                              _mm_srli_si128(hg_fe_dc_ba, 8));
      const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
                                             _mm_srli_si128(hgfe_dcba, 4));
      const int sum_block = _mm_cvtsi128_si32(hgfedcba);
      /* 128 * 8 * 8 is the sum of a uniformly mid-gray 8x8 block. */
      if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
        return COPY_BLOCK;
      }
    }
  }

  for (r = 0; r < 4; ++r) {
    /* Calculate differences.  Each load pairs two consecutive 8-byte
     * rows into one xmm register (row r*2 in the low half, row r*2+1 in
     * the high half) via _mm_load_sd/_mm_loadh_pd. */
    const __m128i v_sig_low = _mm_castpd_si128(
        _mm_load_sd((double *)(&sig[0])));
    const __m128i v_sig = _mm_castpd_si128(
        _mm_loadh_pd(_mm_castsi128_pd(v_sig_low),
                     (double *)(&sig[sig_stride])));
    const __m128i v_mc_running_avg_low = _mm_castpd_si128(
        _mm_load_sd((double *)(&mc_running_avg[0])));
    const __m128i v_mc_running_avg = _mm_castpd_si128(
        _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
                     (double *)(&mc_running_avg[mc_avg_stride])));
    /* At most one of pdiff/ndiff is non-zero per lane; OR gives |diff|. */
    const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
    const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
    /* Obtain the sign. FF if diff is negative. */
    const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
    /* Clamp absolute difference to 16 to be used to get mask. Doing this
     * allows us to use _mm_cmpgt_epi8, which operates on signed byte.
     */
    const __m128i clamped_absdiff = _mm_min_epu8(
        _mm_or_si128(pdiff, ndiff), k_16);
    /* Get masks for l2 l1 and l0 adjustments */
    const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
    const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
    const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
    /* Get adjustments for l2, l1, and l0 */
    __m128i adj2 = _mm_and_si128(mask2, l32);
    const __m128i adj1 = _mm_and_si128(mask1, l21);
    const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
    __m128i adj, padj, nadj;
    __m128i v_running_avg;

    /* Combine the adjustments and get absolute adjustments. */
    adj2 = _mm_add_epi8(adj2, adj1);
    adj = _mm_sub_epi8(l3, adj2);
    adj = _mm_andnot_si128(mask0, adj);
    adj = _mm_or_si128(adj, adj0);

    /* Restore the sign and get positive and negative adjustments. */
    padj = _mm_andnot_si128(diff_sign, adj);
    nadj = _mm_and_si128(diff_sign, adj);

    /* Calculate filtered value. */
    v_running_avg = _mm_adds_epu8(v_sig, padj);
    v_running_avg = _mm_subs_epu8(v_running_avg, nadj);

    /* Store the two packed rows back to their separate destinations. */
    _mm_storel_pd((double *)&running_avg[0],
                  _mm_castsi128_pd(v_running_avg));
    _mm_storeh_pd((double *)&running_avg[avg_stride],
                  _mm_castsi128_pd(v_running_avg));

    /* Adjustments <=7, and each element in acc_diff can fit in signed
     * char.
     */
    acc_diff = _mm_adds_epi8(acc_diff, padj);
    acc_diff = _mm_subs_epi8(acc_diff, nadj);

    /* Update pointers for next iteration (two rows per pass). */
    sig += sig_stride * 2;
    mc_running_avg += mc_avg_stride * 2;
    running_avg += avg_stride * 2;
  }

  {
    unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
    sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
    if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
    if (abs_sum_diff > sum_diff_thresh) {
      // Before returning to copy the block (i.e., apply no denoising),
      // check if we can still apply some (weaker) temporal filtering to
      // this block, that would otherwise not be denoised at all. Simplest
      // is to apply an additional adjustment to running_avg_y to bring it
      // closer to sig. The adjustment is capped by a maximum delta, and
      // chosen such that in most cases the resulting sum_diff will be
      // within the acceptable range given by sum_diff_thresh.

      // The delta is set by the excess of absolute pixel diff over the
      // threshold.
      int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const __m128i k_delta = _mm_set1_epi8(delta);
        /* Rewind the pointers to the top of the 8x8 block. */
        sig -= sig_stride * 8;
        mc_running_avg -= mc_avg_stride * 8;
        running_avg -= avg_stride * 8;
        for (r = 0; r < 4; ++r) {
          // Calculate differences.
          const __m128i v_sig_low = _mm_castpd_si128(
              _mm_load_sd((double *)(&sig[0])));
          const __m128i v_sig = _mm_castpd_si128(
              _mm_loadh_pd(_mm_castsi128_pd(v_sig_low),
                           (double *)(&sig[sig_stride])));
          const __m128i v_mc_running_avg_low = _mm_castpd_si128(
              _mm_load_sd((double *)(&mc_running_avg[0])));
          const __m128i v_mc_running_avg = _mm_castpd_si128(
              _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
                           (double *)(&mc_running_avg[mc_avg_stride])));
          const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
          const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
          // Obtain the sign. FF if diff is negative.
          const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
          // Clamp absolute difference to delta to get the adjustment.
          const __m128i adj =
              _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
          // Restore the sign and get positive and negative adjustments.
          __m128i padj, nadj;
          const __m128i v_running_avg_low = _mm_castpd_si128(
              _mm_load_sd((double *)(&running_avg[0])));
          __m128i v_running_avg = _mm_castpd_si128(
              _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low),
                           (double *)(&running_avg[avg_stride])));
          padj = _mm_andnot_si128(diff_sign, adj);
          nadj = _mm_and_si128(diff_sign, adj);
          // Calculate filtered value (pull running_avg back toward sig;
          // direction is inverted relative to the first pass).
          v_running_avg = _mm_subs_epu8(v_running_avg, padj);
          v_running_avg = _mm_adds_epu8(v_running_avg, nadj);

          _mm_storel_pd((double *)&running_avg[0],
                        _mm_castsi128_pd(v_running_avg));
          _mm_storeh_pd((double *)&running_avg[avg_stride],
                        _mm_castsi128_pd(v_running_avg));

          // Accumulate the adjustments.
          acc_diff = _mm_subs_epi8(acc_diff, padj);
          acc_diff = _mm_adds_epi8(acc_diff, nadj);

          // Update pointers for next iteration.
          sig += sig_stride * 2;
          mc_running_avg += mc_avg_stride * 2;
          running_avg += avg_stride * 2;
        }
        abs_sum_diff = abs_sum_diff_16x1(acc_diff);
        if (abs_sum_diff > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }

  /* Denoising accepted: propagate the filtered block into the source. */
  vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride);
  return FILTER_BLOCK;
}