/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>
#include "./vp8_rtcd.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"
#include "vp8/encoder/denoising.h"

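/* Denoise a 16x16 luma block: blend the source block toward its
 * motion-compensated running average. Returns FILTER_BLOCK when the
 * filtered output is used, COPY_BLOCK otherwise. */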
int32_t vp8_denoiser_filter_msa(uint8_t *mc_running_avg_y_ptr,
                                int32_t mc_avg_y_stride,
                                uint8_t *running_avg_y_ptr,
                                int32_t avg_y_stride, uint8_t *sig_ptr,
                                int32_t sig_stride, uint32_t motion_magnitude,
                                int32_t increase_denoising) {
  uint8_t *running_avg_y_start = running_avg_y_ptr;
  uint8_t *sig_start = sig_ptr;
  int32_t cnt = 0;
  int32_t sum_diff = 0;
  int32_t shift_inc1 = 3;
  int32_t delta = 0;
  int32_t sum_diff_thresh;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
  v16u8 mc_running_avg_y0, running_avg_y, sig0;
  v16u8 mc_running_avg_y1, running_avg_y1, sig1;
  v16u8 coeff0, coeff1;
  v8i16 diff0, diff1, abs_diff0, abs_diff1, abs_diff_neg0, abs_diff_neg1;
  v8i16 adjust0, adjust1, adjust2, adjust3;
  v8i16 shift_inc1_vec = { 0 };
  v8i16 col_sum0 = { 0 };
  v8i16 col_sum1 = { 0 };
  v8i16 col_sum2 = { 0 };
  v8i16 col_sum3 = { 0 };
  v8i16 temp0_h, temp1_h, temp2_h, temp3_h, cmp, delta_vec;
  v4i32 temp0_w;
  v2i64 temp0_d, temp1_d;
  v8i16 zero = { 0 };
  v8i16 one = __msa_ldi_h(1);
  v8i16 four = __msa_ldi_h(4);
  v8i16 val_127 = __msa_ldi_h(127);
  v8i16 adj_val = { 6, 4, 3, 0, -6, -4, -3, 0 };

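  /* For low motion, raise the adjustment magnitudes by 1 (by 2 when
   * increase_denoising is set) and mirror them into negative values for
   * the diff <= 0 lanes; entries 3 and 7 are then cleared so the
   * smallest diffs take no table adjustment. */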
  if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
    adj_val = __msa_add_a_h(adj_val, one);
    if (increase_denoising) {
      adj_val = __msa_add_a_h(adj_val, one);
      shift_inc1 = 4;
    }

    temp0_h = zero - adj_val;
    adj_val = (v8i16)__msa_ilvev_d((v2i64)temp0_h, (v2i64)adj_val);
  }

  adj_val = __msa_insert_h(adj_val, 3, cnt);
  adj_val = __msa_insert_h(adj_val, 7, cnt);
  shift_inc1_vec = __msa_fill_h(shift_inc1);

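  /* Main filter: process the 16x16 block two rows per iteration. The pixel
   * diff (mc_running_avg - sig) is bucketed by magnitude into an index into
   * adj_val (offset by 4 when diff <= 0); zero-adjustment pixels take the
   * motion-compensated average directly, and column sums of the applied
   * adjustments are accumulated for the sum_diff check. */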
  for (cnt = 8; cnt--;) {
    v8i16 mask0 = { 0 };
    v8i16 mask1 = { 0 };

    mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
    sig0 = LD_UB(sig_ptr);
    sig_ptr += sig_stride;
    mc_running_avg_y_ptr += mc_avg_y_stride;

    mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
    sig1 = LD_UB(sig_ptr);

    ILVRL_B2_UB(mc_running_avg_y0, sig0, coeff0, coeff1);
    HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
    abs_diff0 = __msa_add_a_h(diff0, zero);
    abs_diff1 = __msa_add_a_h(diff1, zero);
    cmp = __msa_clei_s_h(abs_diff0, 15);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = __msa_clei_s_h(abs_diff0, 7);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = abs_diff0 < shift_inc1_vec;
    cmp = cmp & one;
    mask0 += cmp;
    cmp = __msa_clei_s_h(abs_diff1, 15);
    cmp = cmp & one;
    mask1 += cmp;
    cmp = __msa_clei_s_h(abs_diff1, 7);
    cmp = cmp & one;
    mask1 += cmp;
    cmp = abs_diff1 < shift_inc1_vec;
    cmp = cmp & one;
    mask1 += cmp;
    temp0_h = __msa_clei_s_h(diff0, 0);
    temp0_h = temp0_h & four;
    mask0 += temp0_h;
    temp1_h = __msa_clei_s_h(diff1, 0);
    temp1_h = temp1_h & four;
    mask1 += temp1_h;
    VSHF_H2_SH(adj_val, adj_val, adj_val, adj_val, mask0, mask1, adjust0,
               adjust1);
    temp2_h = __msa_ceqi_h(adjust0, 0);
    temp3_h = __msa_ceqi_h(adjust1, 0);
    adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h);
    adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)diff1, (v16u8)temp3_h);
    ADD2(col_sum0, adjust0, col_sum1, adjust1, col_sum0, col_sum1);
    UNPCK_UB_SH(sig0, temp0_h, temp1_h);
    ADD2(temp0_h, adjust0, temp1_h, adjust1, temp0_h, temp1_h);
    MAXI_SH2_SH(temp0_h, temp1_h, 0);
    SAT_UH2_SH(temp0_h, temp1_h, 7);
    temp2_h = (v8i16)__msa_pckev_b((v16i8)temp3_h, (v16i8)temp2_h);
    running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp1_h, (v16i8)temp0_h);
    running_avg_y =
        __msa_bmnz_v(running_avg_y, mc_running_avg_y0, (v16u8)temp2_h);
    ST_UB(running_avg_y, running_avg_y_ptr);
    running_avg_y_ptr += avg_y_stride;

    mask0 = zero;
    mask1 = zero;
    ILVRL_B2_UB(mc_running_avg_y1, sig1, coeff0, coeff1);
    HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
    abs_diff0 = __msa_add_a_h(diff0, zero);
    abs_diff1 = __msa_add_a_h(diff1, zero);
    cmp = __msa_clei_s_h(abs_diff0, 15);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = __msa_clei_s_h(abs_diff0, 7);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = abs_diff0 < shift_inc1_vec;
    cmp = cmp & one;
    mask0 += cmp;
    cmp = __msa_clei_s_h(abs_diff1, 15);
    cmp = cmp & one;
    mask1 += cmp;
    cmp = __msa_clei_s_h(abs_diff1, 7);
    cmp = cmp & one;
    mask1 += cmp;
    cmp = abs_diff1 < shift_inc1_vec;
    cmp = cmp & one;
    mask1 += cmp;
    temp0_h = __msa_clei_s_h(diff0, 0);
    temp0_h = temp0_h & four;
    mask0 += temp0_h;
    temp1_h = __msa_clei_s_h(diff1, 0);
    temp1_h = temp1_h & four;
    mask1 += temp1_h;
    VSHF_H2_SH(adj_val, adj_val, adj_val, adj_val, mask0, mask1, adjust0,
               adjust1);
    temp2_h = __msa_ceqi_h(adjust0, 0);
    temp3_h = __msa_ceqi_h(adjust1, 0);
    adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h);
    adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)diff1, (v16u8)temp3_h);
    ADD2(col_sum0, adjust0, col_sum1, adjust1, col_sum0, col_sum1);
    UNPCK_UB_SH(sig1, temp0_h, temp1_h);
    ADD2(temp0_h, adjust0, temp1_h, adjust1, temp0_h, temp1_h);
    MAXI_SH2_SH(temp0_h, temp1_h, 0);
    SAT_UH2_SH(temp0_h, temp1_h, 7);
    temp2_h = (v8i16)__msa_pckev_b((v16i8)temp3_h, (v16i8)temp2_h);
    running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp1_h, (v16i8)temp0_h);
    running_avg_y =
        __msa_bmnz_v(running_avg_y, mc_running_avg_y1, (v16u8)temp2_h);
    ST_UB(running_avg_y, running_avg_y_ptr);
    sig_ptr += sig_stride;
    mc_running_avg_y_ptr += mc_avg_y_stride;
    running_avg_y_ptr += avg_y_stride;
  }

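  /* Cap the column sums at 127, reduce them to a single sum_diff and
   * rewind the pointers to the top of the block. */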
  col_sum0 = __msa_min_s_h(col_sum0, val_127);
  col_sum1 = __msa_min_s_h(col_sum1, val_127);
  temp0_h = col_sum0 + col_sum1;
  temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
  temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
  temp1_d = __msa_splati_d(temp0_d, 1);
  temp0_d += temp1_d;
  sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
  sig_ptr -= sig_stride * 16;
  mc_running_avg_y_ptr -= mc_avg_y_stride * 16;
  running_avg_y_ptr -= avg_y_stride * 16;

  sum_diff_thresh = SUM_DIFF_THRESHOLD;
  if (increase_denoising) {
    sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
  }

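  /* If the accumulated difference is too large, try a second, weaker
   * adjustment capped at delta (only for delta < 4); give up and copy the
   * block if the sum is still over the threshold. */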
  if (abs(sum_diff) > sum_diff_thresh) {
    delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
    delta_vec = __msa_fill_h(delta);
    if (delta < 4) {
      for (cnt = 8; cnt--;) {
        running_avg_y = LD_UB(running_avg_y_ptr);
        mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
        sig0 = LD_UB(sig_ptr);
        sig_ptr += sig_stride;
        mc_running_avg_y_ptr += mc_avg_y_stride;
        running_avg_y_ptr += avg_y_stride;
        mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
        sig1 = LD_UB(sig_ptr);
        running_avg_y1 = LD_UB(running_avg_y_ptr);
        ILVRL_B2_UB(mc_running_avg_y0, sig0, coeff0, coeff1);
        HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
        abs_diff0 = __msa_add_a_h(diff0, zero);
        abs_diff1 = __msa_add_a_h(diff1, zero);
        temp0_h = abs_diff0 < delta_vec;
        temp1_h = abs_diff1 < delta_vec;
        abs_diff0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)delta_vec,
                                       (v16u8)temp0_h);
        abs_diff1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1, (v16u8)delta_vec,
                                       (v16u8)temp1_h);
        SUB2(zero, abs_diff0, zero, abs_diff1, abs_diff_neg0, abs_diff_neg1);
        abs_diff_neg0 = zero - abs_diff0;
        abs_diff_neg1 = zero - abs_diff1;
        temp0_h = __msa_clei_s_h(diff0, 0);
        temp1_h = __msa_clei_s_h(diff1, 0);
        adjust0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0,
                                      (v16u8)temp0_h);
        adjust1 = (v8i16)__msa_bmnz_v((v16u8)abs_diff1, (v16u8)abs_diff_neg1,
                                      (v16u8)temp1_h);
        ILVRL_B2_SH(zero, running_avg_y, temp2_h, temp3_h);
        ADD2(temp2_h, adjust0, temp3_h, adjust1, adjust2, adjust3);
        MAXI_SH2_SH(adjust2, adjust3, 0);
        SAT_UH2_SH(adjust2, adjust3, 7);
        temp0_h = __msa_ceqi_h(diff0, 0);
        temp1_h = __msa_ceqi_h(diff1, 0);
        adjust2 =
            (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h);
        adjust3 =
            (v8i16)__msa_bmnz_v((v16u8)adjust3, (v16u8)temp3_h, (v16u8)temp1_h);
        adjust0 =
            (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h);
        adjust1 =
            (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)zero, (v16u8)temp1_h);
        ADD2(col_sum2, adjust0, col_sum3, adjust1, col_sum2, col_sum3);
        running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust3, (v16i8)adjust2);
        ST_UB(running_avg_y, running_avg_y_ptr - avg_y_stride);
        ILVRL_B2_UB(mc_running_avg_y1, sig1, coeff0, coeff1);
        HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
        abs_diff0 = __msa_add_a_h(diff0, zero);
        abs_diff1 = __msa_add_a_h(diff1, zero);
        temp0_h = abs_diff0 < delta_vec;
        temp1_h = abs_diff1 < delta_vec;
        abs_diff0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)delta_vec,
                                       (v16u8)temp0_h);
        abs_diff1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1, (v16u8)delta_vec,
                                       (v16u8)temp1_h);
        SUB2(zero, abs_diff0, zero, abs_diff1, abs_diff_neg0, abs_diff_neg1);
        temp0_h = __msa_clei_s_h(diff0, 0);
        temp1_h = __msa_clei_s_h(diff1, 0);
        adjust0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0,
                                      (v16u8)temp0_h);
        adjust1 = (v8i16)__msa_bmnz_v((v16u8)abs_diff1, (v16u8)abs_diff_neg1,
                                      (v16u8)temp1_h);
        ILVRL_B2_SH(zero, running_avg_y1, temp2_h, temp3_h);
        ADD2(temp2_h, adjust0, temp3_h, adjust1, adjust2, adjust3);
        MAXI_SH2_SH(adjust2, adjust3, 0);
        SAT_UH2_SH(adjust2, adjust3, 7);
        temp0_h = __msa_ceqi_h(diff0, 0);
        temp1_h = __msa_ceqi_h(diff1, 0);
        adjust2 =
            (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h);
        adjust3 =
            (v8i16)__msa_bmnz_v((v16u8)adjust3, (v16u8)temp3_h, (v16u8)temp1_h);
        adjust0 =
            (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h);
        adjust1 =
            (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)zero, (v16u8)temp1_h);
        ADD2(col_sum2, adjust0, col_sum3, adjust1, col_sum2, col_sum3);
        running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust3, (v16i8)adjust2);
        ST_UB(running_avg_y, running_avg_y_ptr);
        running_avg_y_ptr += avg_y_stride;
      }

      col_sum2 = __msa_min_s_h(col_sum2, val_127);
      col_sum3 = __msa_min_s_h(col_sum3, val_127);
      temp0_h = col_sum2 + col_sum3;
      temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
      temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
      temp1_d = __msa_splati_d(temp0_d, 1);
      temp0_d += (v2i64)temp1_d;
      sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
      if (abs(sum_diff) > SUM_DIFF_THRESHOLD) {
        return COPY_BLOCK;
      }
    } else {
      return COPY_BLOCK;
    }
  }

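  /* Bulk copy of the 16x16 block before returning FILTER_BLOCK. */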
  LD_UB8(sig_start, sig_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  sig_start += (8 * sig_stride);
  LD_UB8(sig_start, sig_stride, src8, src9, src10, src11, src12, src13, src14,
         src15);

  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, running_avg_y_start,
         avg_y_stride);
  running_avg_y_start += (8 * avg_y_stride);
  ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
         running_avg_y_start, avg_y_stride);

  return FILTER_BLOCK;
}

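/* Denoise an 8x8 chroma (U or V) block against its motion-compensated
 * running average. Returns FILTER_BLOCK or COPY_BLOCK as above. */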
int32_t vp8_denoiser_filter_uv_msa(
    uint8_t *mc_running_avg_y_ptr, int32_t mc_avg_y_stride,
    uint8_t *running_avg_y_ptr, int32_t avg_y_stride, uint8_t *sig_ptr,
    int32_t sig_stride, uint32_t motion_magnitude, int32_t increase_denoising) {
  uint8_t *running_avg_y_start = running_avg_y_ptr;
  uint8_t *sig_start = sig_ptr;
  int32_t cnt = 0;
  int32_t sum_diff = 0;
  int32_t shift_inc1 = 3;
  int32_t delta = 0;
  int32_t sum_block = 0;
  int32_t sum_diff_thresh;
  int64_t dst0, dst1, src0, src1, src2, src3;
  v16u8 mc_running_avg_y0, running_avg_y, sig0;
  v16u8 mc_running_avg_y1, running_avg_y1, sig1;
  v16u8 sig2, sig3, sig4, sig5, sig6, sig7;
  v16u8 coeff0;
  v8i16 diff0, abs_diff0, abs_diff_neg0;
  v8i16 adjust0, adjust2;
  v8i16 shift_inc1_vec = { 0 };
  v8i16 col_sum0 = { 0 };
  v8i16 temp0_h, temp2_h, cmp, delta_vec;
  v4i32 temp0_w;
  v2i64 temp0_d, temp1_d;
  v16i8 zero = { 0 };
  v8i16 one = __msa_ldi_h(1);
  v8i16 four = __msa_ldi_h(4);
  v8i16 adj_val = { 6, 4, 3, 0, -6, -4, -3, 0 };

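  /* Sum the 8x8 source block and skip denoising when the chroma block is
   * close to the average (128) level. */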
  sig0 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig0);
  sig1 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig1);
  sig2 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig2);
  sig3 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig3);
  sig4 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig4);
  sig5 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig5);
  sig6 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig6);
  sig7 = LD_UB(sig_ptr);
  sig_ptr += sig_stride;
  temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig7);
  temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
  temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
  temp1_d = __msa_splati_d(temp0_d, 1);
  temp0_d += temp1_d;
  sum_block = __msa_copy_s_w((v4i32)temp0_d, 0);
  sig_ptr -= sig_stride * 8;

  if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
    return COPY_BLOCK;
  }

  if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
    adj_val = __msa_add_a_h(adj_val, one);

    if (increase_denoising) {
      adj_val = __msa_add_a_h(adj_val, one);
      shift_inc1 = 4;
    }

    temp0_h = (v8i16)zero - adj_val;
    adj_val = (v8i16)__msa_ilvev_d((v2i64)temp0_h, (v2i64)adj_val);
  }

  adj_val = __msa_insert_h(adj_val, 3, cnt);
  adj_val = __msa_insert_h(adj_val, 7, cnt);
  shift_inc1_vec = __msa_fill_h(shift_inc1);
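  /* Main chroma filter: 8 pixels per row, two rows per iteration, using the
   * same adjustment lookup as the luma path. */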
  for (cnt = 4; cnt--;) {
    v8i16 mask0 = { 0 };
    mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
    sig0 = LD_UB(sig_ptr);
    sig_ptr += sig_stride;
    mc_running_avg_y_ptr += mc_avg_y_stride;
    mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
    sig1 = LD_UB(sig_ptr);
    coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y0, (v16i8)sig0);
    diff0 = __msa_hsub_u_h(coeff0, coeff0);
    abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
    cmp = __msa_clei_s_h(abs_diff0, 15);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = __msa_clei_s_h(abs_diff0, 7);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = abs_diff0 < shift_inc1_vec;
    cmp = cmp & one;
    mask0 += cmp;
    temp0_h = __msa_clei_s_h(diff0, 0);
    temp0_h = temp0_h & four;
    mask0 += temp0_h;
    adjust0 = __msa_vshf_h(mask0, adj_val, adj_val);
    temp2_h = __msa_ceqi_h(adjust0, 0);
    adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h);
    col_sum0 += adjust0;
    temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig0);
    temp0_h += adjust0;
    temp0_h = __msa_maxi_s_h(temp0_h, 0);
    temp0_h = (v8i16)__msa_sat_u_h((v8u16)temp0_h, 7);
    temp2_h = (v8i16)__msa_pckev_b((v16i8)temp2_h, (v16i8)temp2_h);
    running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp0_h, (v16i8)temp0_h);
    running_avg_y =
        __msa_bmnz_v(running_avg_y, mc_running_avg_y0, (v16u8)temp2_h);
    dst0 = __msa_copy_s_d((v2i64)running_avg_y, 0);
    SD(dst0, running_avg_y_ptr);
    running_avg_y_ptr += avg_y_stride;

    mask0 = __msa_ldi_h(0);
    coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y1, (v16i8)sig1);
    diff0 = __msa_hsub_u_h(coeff0, coeff0);
    abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
    cmp = __msa_clei_s_h(abs_diff0, 15);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = __msa_clei_s_h(abs_diff0, 7);
    cmp = cmp & one;
    mask0 += cmp;
    cmp = abs_diff0 < shift_inc1_vec;
    cmp = cmp & one;
    mask0 += cmp;
    temp0_h = __msa_clei_s_h(diff0, 0);
    temp0_h = temp0_h & four;
    mask0 += temp0_h;
    adjust0 = __msa_vshf_h(mask0, adj_val, adj_val);
    temp2_h = __msa_ceqi_h(adjust0, 0);
    adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h);
    col_sum0 += adjust0;
    temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig1);
    temp0_h += adjust0;
    temp0_h = __msa_maxi_s_h(temp0_h, 0);
    temp0_h = (v8i16)__msa_sat_u_h((v8u16)temp0_h, 7);

    temp2_h = (v8i16)__msa_pckev_b((v16i8)temp2_h, (v16i8)temp2_h);
    running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp0_h, (v16i8)temp0_h);
    running_avg_y =
        __msa_bmnz_v(running_avg_y, mc_running_avg_y1, (v16u8)temp2_h);
    dst1 = __msa_copy_s_d((v2i64)running_avg_y, 0);
    SD(dst1, running_avg_y_ptr);

    sig_ptr += sig_stride;
    mc_running_avg_y_ptr += mc_avg_y_stride;
    running_avg_y_ptr += avg_y_stride;
  }

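  /* Reduce the column sums to sum_diff and rewind the pointers to the top
   * of the block. */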
  temp0_h = col_sum0;
  temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
  temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
  temp1_d = __msa_splati_d(temp0_d, 1);
  temp0_d += temp1_d;
  sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
  sig_ptr -= sig_stride * 8;
  mc_running_avg_y_ptr -= mc_avg_y_stride * 8;
  running_avg_y_ptr -= avg_y_stride * 8;
  sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;

  if (increase_denoising) {
    sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
  }

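  /* If the accumulated difference is too large, apply the same capped
   * second-stage adjustment as the luma path; copy the block if the sum
   * still exceeds the threshold. */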
  if (abs(sum_diff) > sum_diff_thresh) {
    delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
    delta_vec = __msa_fill_h(delta);
    if (delta < 4) {
      for (cnt = 4; cnt--;) {
        running_avg_y = LD_UB(running_avg_y_ptr);
        mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
        sig0 = LD_UB(sig_ptr);
        /* Update pointers for next iteration. */
        sig_ptr += sig_stride;
        mc_running_avg_y_ptr += mc_avg_y_stride;
        running_avg_y_ptr += avg_y_stride;

        mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
        sig1 = LD_UB(sig_ptr);
        running_avg_y1 = LD_UB(running_avg_y_ptr);

        coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y0, (v16i8)sig0);
        diff0 = __msa_hsub_u_h(coeff0, coeff0);
        abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
        temp0_h = delta_vec < abs_diff0;
        abs_diff0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, (v16u8)delta_vec,
                                        (v16u8)temp0_h);
        abs_diff_neg0 = (v8i16)zero - abs_diff0;
        temp0_h = __msa_clei_s_h(diff0, 0);
        adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0,
                                     (v16u8)temp0_h);
        temp2_h = (v8i16)__msa_ilvr_b(zero, (v16i8)running_avg_y);
        adjust2 = temp2_h + adjust0;
        adjust2 = __msa_maxi_s_h(adjust2, 0);
        adjust2 = (v8i16)__msa_sat_u_h((v8u16)adjust2, 7);
        temp0_h = __msa_ceqi_h(diff0, 0);
        adjust2 =
            (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h);
        adjust0 =
            (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h);
        col_sum0 += adjust0;
        running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust2, (v16i8)adjust2);
        dst0 = __msa_copy_s_d((v2i64)running_avg_y, 0);
        SD(dst0, running_avg_y_ptr - avg_y_stride);

        coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y1, (v16i8)sig1);
        diff0 = __msa_hsub_u_h(coeff0, coeff0);
        abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
        temp0_h = delta_vec < abs_diff0;
        abs_diff0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, (v16u8)delta_vec,
                                        (v16u8)temp0_h);
        abs_diff_neg0 = (v8i16)zero - abs_diff0;
        temp0_h = __msa_clei_s_h(diff0, 0);
        adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0,
                                     (v16u8)temp0_h);
        temp2_h = (v8i16)__msa_ilvr_b(zero, (v16i8)running_avg_y1);
        adjust2 = temp2_h + adjust0;
        adjust2 = __msa_maxi_s_h(adjust2, 0);
        adjust2 = (v8i16)__msa_sat_u_h((v8u16)adjust2, 7);
        temp0_h = __msa_ceqi_h(diff0, 0);
        adjust2 =
            (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h);
        adjust0 =
            (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h);
        col_sum0 += adjust0;
        running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust2, (v16i8)adjust2);
        dst1 = __msa_copy_s_d((v2i64)running_avg_y, 0);
        SD(dst1, running_avg_y_ptr);
        running_avg_y_ptr += avg_y_stride;
      }

      temp0_h = col_sum0;
      temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
      temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
      temp1_d = __msa_splati_d(temp0_d, 1);
      temp0_d += temp1_d;
      sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);

      if (abs(sum_diff) > sum_diff_thresh) {
        return COPY_BLOCK;
      }
    } else {
      return COPY_BLOCK;
    }
  }

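  /* Copy the 8x8 block eight bytes per row before returning FILTER_BLOCK. */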
  LD4(sig_start, sig_stride, src0, src1, src2, src3);
  sig_start += (4 * sig_stride);
  SD4(src0, src1, src2, src3, running_avg_y_start, avg_y_stride);
  running_avg_y_start += (4 * avg_y_stride);

  LD4(sig_start, sig_stride, src0, src1, src2, src3);
  SD4(src0, src1, src2, src3, running_avg_y_start, avg_y_stride);

  return FILTER_BLOCK;
}