11b362b15af34006e6a11974088a46d42b903418eJohann/*
21b362b15af34006e6a11974088a46d42b903418eJohann *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
31b362b15af34006e6a11974088a46d42b903418eJohann *
41b362b15af34006e6a11974088a46d42b903418eJohann *  Use of this source code is governed by a BSD-style license
51b362b15af34006e6a11974088a46d42b903418eJohann *  that can be found in the LICENSE file in the root of the source
61b362b15af34006e6a11974088a46d42b903418eJohann *  tree. An additional intellectual property rights grant can be found
71b362b15af34006e6a11974088a46d42b903418eJohann *  in the file PATENTS.  All contributing project authors may
81b362b15af34006e6a11974088a46d42b903418eJohann *  be found in the AUTHORS file in the root of the source tree.
91b362b15af34006e6a11974088a46d42b903418eJohann */
101b362b15af34006e6a11974088a46d42b903418eJohann
111b362b15af34006e6a11974088a46d42b903418eJohann#include "vp8/encoder/denoising.h"
121b362b15af34006e6a11974088a46d42b903418eJohann#include "vp8/common/reconinter.h"
131b362b15af34006e6a11974088a46d42b903418eJohann#include "vpx/vpx_integer.h"
141b362b15af34006e6a11974088a46d42b903418eJohann#include "vpx_mem/vpx_mem.h"
15ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp8_rtcd.h"
161b362b15af34006e6a11974088a46d42b903418eJohann
171b362b15af34006e6a11974088a46d42b903418eJohann#include <emmintrin.h>
18ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vpx_ports/emmintrin_compat.h"
191b362b15af34006e6a11974088a46d42b903418eJohann
/* Overlay that exposes the sixteen byte lanes of an SSE2 register as signed
 * chars, so the lanes can be read individually from scalar code.
 */
union sum_union
{
    __m128i v;          /* the full 128-bit vector */
    signed char e[16];  /* the same 16 bytes, viewed one signed lane at a time */
};
241b362b15af34006e6a11974088a46d42b903418eJohann
251b362b15af34006e6a11974088a46d42b903418eJohannint vp8_denoiser_filter_sse2(YV12_BUFFER_CONFIG *mc_running_avg,
261b362b15af34006e6a11974088a46d42b903418eJohann                             YV12_BUFFER_CONFIG *running_avg,
271b362b15af34006e6a11974088a46d42b903418eJohann                             MACROBLOCK *signal, unsigned int motion_magnitude,
281b362b15af34006e6a11974088a46d42b903418eJohann                             int y_offset, int uv_offset)
291b362b15af34006e6a11974088a46d42b903418eJohann{
301b362b15af34006e6a11974088a46d42b903418eJohann    unsigned char *sig = signal->thismb;
311b362b15af34006e6a11974088a46d42b903418eJohann    int sig_stride = 16;
321b362b15af34006e6a11974088a46d42b903418eJohann    unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
331b362b15af34006e6a11974088a46d42b903418eJohann    int mc_avg_y_stride = mc_running_avg->y_stride;
341b362b15af34006e6a11974088a46d42b903418eJohann    unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
351b362b15af34006e6a11974088a46d42b903418eJohann    int avg_y_stride = running_avg->y_stride;
361b362b15af34006e6a11974088a46d42b903418eJohann    int r;
371b362b15af34006e6a11974088a46d42b903418eJohann    __m128i acc_diff = _mm_setzero_si128();
381b362b15af34006e6a11974088a46d42b903418eJohann    const __m128i k_0 = _mm_setzero_si128();
391b362b15af34006e6a11974088a46d42b903418eJohann    const __m128i k_4 = _mm_set1_epi8(4);
401b362b15af34006e6a11974088a46d42b903418eJohann    const __m128i k_8 = _mm_set1_epi8(8);
411b362b15af34006e6a11974088a46d42b903418eJohann    const __m128i k_16 = _mm_set1_epi8(16);
421b362b15af34006e6a11974088a46d42b903418eJohann    /* Modify each level's adjustment according to motion_magnitude. */
431b362b15af34006e6a11974088a46d42b903418eJohann    const __m128i l3 = _mm_set1_epi8(
441b362b15af34006e6a11974088a46d42b903418eJohann                      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 : 6);
451b362b15af34006e6a11974088a46d42b903418eJohann    /* Difference between level 3 and level 2 is 2. */
461b362b15af34006e6a11974088a46d42b903418eJohann    const __m128i l32 = _mm_set1_epi8(2);
471b362b15af34006e6a11974088a46d42b903418eJohann    /* Difference between level 2 and level 1 is 1. */
481b362b15af34006e6a11974088a46d42b903418eJohann    const __m128i l21 = _mm_set1_epi8(1);
491b362b15af34006e6a11974088a46d42b903418eJohann
501b362b15af34006e6a11974088a46d42b903418eJohann    for (r = 0; r < 16; ++r)
511b362b15af34006e6a11974088a46d42b903418eJohann    {
521b362b15af34006e6a11974088a46d42b903418eJohann        /* Calculate differences */
531b362b15af34006e6a11974088a46d42b903418eJohann        const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
541b362b15af34006e6a11974088a46d42b903418eJohann        const __m128i v_mc_running_avg_y = _mm_loadu_si128(
551b362b15af34006e6a11974088a46d42b903418eJohann                                           (__m128i *)(&mc_running_avg_y[0]));
561b362b15af34006e6a11974088a46d42b903418eJohann        __m128i v_running_avg_y;
571b362b15af34006e6a11974088a46d42b903418eJohann        const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
581b362b15af34006e6a11974088a46d42b903418eJohann        const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
591b362b15af34006e6a11974088a46d42b903418eJohann        /* Obtain the sign. FF if diff is negative. */
601b362b15af34006e6a11974088a46d42b903418eJohann        const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
611b362b15af34006e6a11974088a46d42b903418eJohann        /* Clamp absolute difference to 16 to be used to get mask. Doing this
621b362b15af34006e6a11974088a46d42b903418eJohann         * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
631b362b15af34006e6a11974088a46d42b903418eJohann        const __m128i clamped_absdiff = _mm_min_epu8(
641b362b15af34006e6a11974088a46d42b903418eJohann                                        _mm_or_si128(pdiff, ndiff), k_16);
651b362b15af34006e6a11974088a46d42b903418eJohann        /* Get masks for l2 l1 and l0 adjustments */
661b362b15af34006e6a11974088a46d42b903418eJohann        const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
671b362b15af34006e6a11974088a46d42b903418eJohann        const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
681b362b15af34006e6a11974088a46d42b903418eJohann        const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
691b362b15af34006e6a11974088a46d42b903418eJohann        /* Get adjustments for l2, l1, and l0 */
701b362b15af34006e6a11974088a46d42b903418eJohann        __m128i adj2 = _mm_and_si128(mask2, l32);
711b362b15af34006e6a11974088a46d42b903418eJohann        const __m128i adj1 = _mm_and_si128(mask1, l21);
721b362b15af34006e6a11974088a46d42b903418eJohann        const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
731b362b15af34006e6a11974088a46d42b903418eJohann        __m128i adj,  padj, nadj;
741b362b15af34006e6a11974088a46d42b903418eJohann
751b362b15af34006e6a11974088a46d42b903418eJohann        /* Combine the adjustments and get absolute adjustments. */
761b362b15af34006e6a11974088a46d42b903418eJohann        adj2 = _mm_add_epi8(adj2, adj1);
771b362b15af34006e6a11974088a46d42b903418eJohann        adj = _mm_sub_epi8(l3, adj2);
781b362b15af34006e6a11974088a46d42b903418eJohann        adj = _mm_andnot_si128(mask0, adj);
791b362b15af34006e6a11974088a46d42b903418eJohann        adj = _mm_or_si128(adj, adj0);
801b362b15af34006e6a11974088a46d42b903418eJohann
811b362b15af34006e6a11974088a46d42b903418eJohann        /* Restore the sign and get positive and negative adjustments. */
821b362b15af34006e6a11974088a46d42b903418eJohann        padj = _mm_andnot_si128(diff_sign, adj);
831b362b15af34006e6a11974088a46d42b903418eJohann        nadj = _mm_and_si128(diff_sign, adj);
841b362b15af34006e6a11974088a46d42b903418eJohann
851b362b15af34006e6a11974088a46d42b903418eJohann        /* Calculate filtered value. */
861b362b15af34006e6a11974088a46d42b903418eJohann        v_running_avg_y = _mm_adds_epu8(v_sig, padj);
871b362b15af34006e6a11974088a46d42b903418eJohann        v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
881b362b15af34006e6a11974088a46d42b903418eJohann        _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
891b362b15af34006e6a11974088a46d42b903418eJohann
901b362b15af34006e6a11974088a46d42b903418eJohann        /* Adjustments <=7, and each element in acc_diff can fit in signed
911b362b15af34006e6a11974088a46d42b903418eJohann         * char.
921b362b15af34006e6a11974088a46d42b903418eJohann         */
931b362b15af34006e6a11974088a46d42b903418eJohann        acc_diff = _mm_adds_epi8(acc_diff, padj);
941b362b15af34006e6a11974088a46d42b903418eJohann        acc_diff = _mm_subs_epi8(acc_diff, nadj);
951b362b15af34006e6a11974088a46d42b903418eJohann
961b362b15af34006e6a11974088a46d42b903418eJohann        /* Update pointers for next iteration. */
971b362b15af34006e6a11974088a46d42b903418eJohann        sig += sig_stride;
981b362b15af34006e6a11974088a46d42b903418eJohann        mc_running_avg_y += mc_avg_y_stride;
991b362b15af34006e6a11974088a46d42b903418eJohann        running_avg_y += avg_y_stride;
1001b362b15af34006e6a11974088a46d42b903418eJohann    }
1011b362b15af34006e6a11974088a46d42b903418eJohann
1021b362b15af34006e6a11974088a46d42b903418eJohann    {
1031b362b15af34006e6a11974088a46d42b903418eJohann        /* Compute the sum of all pixel differences of this MB. */
1041b362b15af34006e6a11974088a46d42b903418eJohann        union sum_union s;
1051b362b15af34006e6a11974088a46d42b903418eJohann        int sum_diff = 0;
1061b362b15af34006e6a11974088a46d42b903418eJohann        s.v = acc_diff;
1071b362b15af34006e6a11974088a46d42b903418eJohann        sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
1081b362b15af34006e6a11974088a46d42b903418eJohann                 + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
1091b362b15af34006e6a11974088a46d42b903418eJohann                 + s.e[12] + s.e[13] + s.e[14] + s.e[15];
1101b362b15af34006e6a11974088a46d42b903418eJohann
1111b362b15af34006e6a11974088a46d42b903418eJohann        if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
1121b362b15af34006e6a11974088a46d42b903418eJohann        {
1131b362b15af34006e6a11974088a46d42b903418eJohann            return COPY_BLOCK;
1141b362b15af34006e6a11974088a46d42b903418eJohann        }
1151b362b15af34006e6a11974088a46d42b903418eJohann    }
1161b362b15af34006e6a11974088a46d42b903418eJohann
1171b362b15af34006e6a11974088a46d42b903418eJohann    vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
1181b362b15af34006e6a11974088a46d42b903418eJohann                      signal->thismb, sig_stride);
1191b362b15af34006e6a11974088a46d42b903418eJohann    return FILTER_BLOCK;
1201b362b15af34006e6a11974088a46d42b903418eJohann}
121