/* denoising_msa.c revision da49e34c1fb5e99681f4ad99c21d9cfd83eddb96 */
1537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin/* 2537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin * 4537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin * Use of this source code is governed by a BSD-style license 5537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin * that can be found in the LICENSE file in the root of the source 6537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin * tree. An additional intellectual property rights grant can be found 7537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin * in the file PATENTS. All contributing project authors may 8537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin * be found in the AUTHORS file in the root of the source tree. 9537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin */ 10537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin 11537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin#include <stdlib.h> 12537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin#include "./vp8_rtcd.h" 13537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin#include "vp8/common/mips/msa/vp8_macros_msa.h" 14537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin#include "vp8/encoder/denoising.h" 15537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin 16537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levinint32_t vp8_denoiser_filter_msa(uint8_t *mc_running_avg_y_ptr, 17537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin int32_t mc_avg_y_stride, 18537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin uint8_t *running_avg_y_ptr, 19537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin int32_t avg_y_stride, 20537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin uint8_t *sig_ptr, int32_t sig_stride, 21537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin uint32_t motion_magnitude, 22537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. 
Levin int32_t increase_denoising) 23537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin{ 24537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin uint8_t *running_avg_y_start = running_avg_y_ptr; 25537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin uint8_t *sig_start = sig_ptr; 26537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin int32_t cnt = 0; 27537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin int32_t sum_diff = 0; 28537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin int32_t shift_inc1 = 3; 29537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin int32_t delta = 0; 30537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin int32_t sum_diff_thresh; 31537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 32537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v16u8 src8, src9, src10, src11, src12, src13, src14, src15; 33537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v16u8 mc_running_avg_y0, running_avg_y, sig0; 34537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v16u8 mc_running_avg_y1, running_avg_y1, sig1; 35537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v16u8 coeff0, coeff1; 36537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v8i16 diff0, diff1, abs_diff0, abs_diff1, abs_diff_neg0, abs_diff_neg1; 37537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v8i16 adjust0, adjust1, adjust2, adjust3; 38537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v8i16 shift_inc1_vec = { 0 }; 39537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v8i16 col_sum0 = { 0 }; 40537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v8i16 col_sum1 = { 0 }; 41537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v8i16 col_sum2 = { 0 }; 42537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v8i16 col_sum3 = { 0 }; 43537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. 
Levin v8i16 temp0_h, temp1_h, temp2_h, temp3_h, cmp, delta_vec; 44537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v4i32 temp0_w; 45537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v2i64 temp0_d, temp1_d; 46537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v8i16 zero = { 0 }; 47537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v8i16 one = __msa_ldi_h(1); 48537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v8i16 four = __msa_ldi_h(4); 49537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v8i16 val_127 = __msa_ldi_h(127); 50537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v8i16 adj_val = { 6, 4, 3, 0, -6, -4, -3, 0 }; 51537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin 52537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) 53537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin { 54537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin adj_val = __msa_add_a_h(adj_val, one); 55537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin if (increase_denoising) 56537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin { 57537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin adj_val = __msa_add_a_h(adj_val, one); 58537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin shift_inc1 = 4; 59537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin } 60537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin 61537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp0_h = zero - adj_val; 62537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin adj_val = (v8i16)__msa_ilvev_d((v2i64)temp0_h, (v2i64)adj_val); 63537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin } 64537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin 65537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin adj_val = __msa_insert_h(adj_val, 3, cnt); 66537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. 
Levin adj_val = __msa_insert_h(adj_val, 7, cnt); 67537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin shift_inc1_vec = __msa_fill_h(shift_inc1); 68537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin 69537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin for (cnt = 8; cnt--;) 70537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin { 71537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v8i16 mask0 = { 0 }; 72537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin v8i16 mask1 = { 0 }; 73537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin 74537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr); 75537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin sig0 = LD_UB(sig_ptr); 76537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin sig_ptr += sig_stride; 77537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mc_running_avg_y_ptr += mc_avg_y_stride; 78537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin 79537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr); 80537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin sig1 = LD_UB(sig_ptr); 81537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin 82537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin ILVRL_B2_UB(mc_running_avg_y0, sig0, coeff0, coeff1); 83537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin HSUB_UB2_SH(coeff0, coeff1, diff0, diff1); 84537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin abs_diff0 = __msa_add_a_h(diff0, zero); 85537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin abs_diff1 = __msa_add_a_h(diff1, zero); 86537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = __msa_clei_s_h(abs_diff0, 15); 87537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = cmp & one; 88537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask0 += cmp; 89537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. 
Levin cmp = __msa_clei_s_h(abs_diff0, 7); 90537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = cmp & one; 91537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask0 += cmp; 92537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = abs_diff0 < shift_inc1_vec; 93537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = cmp & one; 94537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask0 += cmp; 95537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = __msa_clei_s_h(abs_diff1, 15); 96537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = cmp & one; 97537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask1 += cmp; 98537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = __msa_clei_s_h(abs_diff1, 7); 99537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = cmp & one; 100537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask1 += cmp; 101537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = abs_diff1 < shift_inc1_vec; 102537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = cmp & one; 103537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask1 += cmp; 104537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp0_h = __msa_clei_s_h(diff0, 0); 105537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp0_h = temp0_h & four; 106537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask0 += temp0_h; 107537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp1_h = __msa_clei_s_h(diff1, 0); 108537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp1_h = temp1_h & four; 109537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask1 += temp1_h; 110537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin VSHF_H2_SH(adj_val, adj_val, adj_val, adj_val, mask0, mask1, adjust0, 111537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin adjust1); 112537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. 
Levin temp2_h = __msa_ceqi_h(adjust0, 0); 113537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp3_h = __msa_ceqi_h(adjust1, 0); 114537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, 115537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin (v16u8)temp2_h); 116537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)diff1, 117537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin (v16u8)temp3_h); 118537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin ADD2(col_sum0, adjust0, col_sum1, adjust1, col_sum0, col_sum1); 119537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin UNPCK_UB_SH(sig0, temp0_h, temp1_h); 120537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin ADD2(temp0_h, adjust0, temp1_h, adjust1, temp0_h, temp1_h); 121537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin MAXI_SH2_SH(temp0_h, temp1_h, 0); 122537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin SAT_UH2_SH(temp0_h, temp1_h, 7); 123537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp2_h = (v8i16)__msa_pckev_b((v16i8)temp3_h, (v16i8)temp2_h); 124537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp1_h, (v16i8)temp0_h); 125537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin running_avg_y = __msa_bmnz_v(running_avg_y, mc_running_avg_y0, 126537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin (v16u8)temp2_h); 127537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin ST_UB(running_avg_y, running_avg_y_ptr); 128537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin running_avg_y_ptr += avg_y_stride; 129537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin 130537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask0 = zero; 131537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask1 = zero; 132537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. 
Levin ILVRL_B2_UB(mc_running_avg_y1, sig1, coeff0, coeff1); 133537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin HSUB_UB2_SH(coeff0, coeff1, diff0, diff1); 134537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin abs_diff0 = __msa_add_a_h(diff0, zero); 135537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin abs_diff1 = __msa_add_a_h(diff1, zero); 136537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = __msa_clei_s_h(abs_diff0, 15); 137537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = cmp & one; 138537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask0 += cmp; 139537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = __msa_clei_s_h(abs_diff0, 7); 140537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = cmp & one; 141537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask0 += cmp; 142537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = abs_diff0 < shift_inc1_vec; 143537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = cmp & one; 144537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask0 += cmp; 145537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = __msa_clei_s_h(abs_diff1, 15); 146537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = cmp & one; 147537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask1 += cmp; 148537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = __msa_clei_s_h(abs_diff1, 7); 149537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = cmp & one; 150537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask1 += cmp; 151537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = abs_diff1 < shift_inc1_vec; 152537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin cmp = cmp & one; 153537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask1 += cmp; 154537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp0_h = __msa_clei_s_h(diff0, 0); 155537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. 
Levin temp0_h = temp0_h & four; 156537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask0 += temp0_h; 157537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp1_h = __msa_clei_s_h(diff1, 0); 158537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp1_h = temp1_h & four; 159537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mask1 += temp1_h; 160537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin VSHF_H2_SH(adj_val, adj_val, adj_val, adj_val, mask0, mask1, adjust0, 161537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin adjust1); 162537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp2_h = __msa_ceqi_h(adjust0, 0); 163537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp3_h = __msa_ceqi_h(adjust1, 0); 164537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, 165537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin (v16u8)temp2_h); 166537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)diff1, 167537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin (v16u8)temp3_h); 168537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin ADD2(col_sum0, adjust0, col_sum1, adjust1, col_sum0, col_sum1); 169537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin UNPCK_UB_SH(sig1, temp0_h, temp1_h); 170537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin ADD2(temp0_h, adjust0, temp1_h, adjust1, temp0_h, temp1_h); 171537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin MAXI_SH2_SH(temp0_h, temp1_h, 0); 172537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin SAT_UH2_SH(temp0_h, temp1_h, 7); 173537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp2_h = (v8i16)__msa_pckev_b((v16i8)temp3_h, (v16i8)temp2_h); 174537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp1_h, (v16i8)temp0_h); 175537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. 
Levin running_avg_y = __msa_bmnz_v(running_avg_y, mc_running_avg_y1, 176537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin (v16u8)temp2_h); 177537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin ST_UB(running_avg_y, running_avg_y_ptr); 178537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin sig_ptr += sig_stride; 179537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin mc_running_avg_y_ptr += mc_avg_y_stride; 180537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin running_avg_y_ptr += avg_y_stride; 181537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin } 182537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin 183537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin col_sum0 = __msa_min_s_h(col_sum0, val_127); 184537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin col_sum1 = __msa_min_s_h(col_sum1, val_127); 185537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp0_h = col_sum0 + col_sum1; 186537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp0_w = __msa_hadd_s_w(temp0_h, temp0_h); 187537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp0_d = __msa_hadd_s_d(temp0_w, temp0_w); 188537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp1_d = __msa_splati_d(temp0_d, 1); 189537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin temp0_d += temp1_d; 190537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0); 191537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. Levin sig_ptr -= sig_stride * 16; 192a528eb5b493d0c722e5a8744bd4be52aa32c9eddDmitry V. Levin mc_running_avg_y_ptr -= mc_avg_y_stride * 16; 193a528eb5b493d0c722e5a8744bd4be52aa32c9eddDmitry V. Levin running_avg_y_ptr -= avg_y_stride * 16; 194a528eb5b493d0c722e5a8744bd4be52aa32c9eddDmitry V. Levin 195537c964fe9ad96b5b4c779af7a53a2b0850ade4eDmitry V. 
Levin if (increase_denoising) 196 { 197 sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; 198 } 199 200 if (abs(sum_diff) > sum_diff_thresh) 201 { 202 delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1; 203 delta_vec = __msa_fill_h(delta); 204 if (delta < 4) 205 { 206 for (cnt = 8; cnt--;) 207 { 208 running_avg_y = LD_UB(running_avg_y_ptr); 209 mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr); 210 sig0 = LD_UB(sig_ptr); 211 sig_ptr += sig_stride; 212 mc_running_avg_y_ptr += mc_avg_y_stride; 213 running_avg_y_ptr += avg_y_stride; 214 mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr); 215 sig1 = LD_UB(sig_ptr); 216 running_avg_y1 = LD_UB(running_avg_y_ptr); 217 ILVRL_B2_UB(mc_running_avg_y0, sig0, coeff0, coeff1); 218 HSUB_UB2_SH(coeff0, coeff1, diff0, diff1); 219 abs_diff0 = __msa_add_a_h(diff0, zero); 220 abs_diff1 = __msa_add_a_h(diff1, zero); 221 temp0_h = abs_diff0 < delta_vec; 222 temp1_h = abs_diff1 < delta_vec; 223 abs_diff0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, 224 (v16u8)delta_vec, 225 (v16u8)temp0_h); 226 abs_diff1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1, 227 (v16u8)delta_vec, 228 (v16u8)temp1_h); 229 SUB2(zero, abs_diff0, zero, abs_diff1, abs_diff_neg0, 230 abs_diff_neg1); 231 abs_diff_neg0 = zero - abs_diff0; 232 abs_diff_neg1 = zero - abs_diff1; 233 temp0_h = __msa_clei_s_h(diff0, 0); 234 temp1_h = __msa_clei_s_h(diff1, 0); 235 adjust0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, 236 (v16u8)abs_diff_neg0, 237 (v16u8)temp0_h); 238 adjust1 = (v8i16)__msa_bmnz_v((v16u8)abs_diff1, 239 (v16u8)abs_diff_neg1, 240 (v16u8)temp1_h); 241 ILVRL_B2_SH(zero, running_avg_y, temp2_h, temp3_h); 242 ADD2(temp2_h, adjust0, temp3_h, adjust1, adjust2, adjust3); 243 MAXI_SH2_SH(adjust2, adjust3, 0); 244 SAT_UH2_SH(adjust2, adjust3, 7); 245 temp0_h = __msa_ceqi_h(diff0, 0); 246 temp1_h = __msa_ceqi_h(diff1, 0); 247 adjust2 = (v8i16)__msa_bmz_v((v16u8)adjust2, (v16u8)temp2_h, 248 (v16u8)temp0_h); 249 adjust3 = (v8i16)__msa_bmz_v((v16u8)adjust3, (v16u8)temp3_h, 250 (v16u8)temp1_h); 251 
adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, 252 (v16u8)temp0_h); 253 adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)zero, 254 (v16u8)temp1_h); 255 ADD2(col_sum2, adjust0, col_sum3, adjust1, col_sum2, col_sum3); 256 running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust3, 257 (v16i8)adjust2); 258 ST_UB(running_avg_y, running_avg_y_ptr - avg_y_stride); 259 ILVRL_B2_UB(mc_running_avg_y1, sig1, coeff0, coeff1); 260 HSUB_UB2_SH(coeff0, coeff1, diff0, diff1); 261 abs_diff0 = __msa_add_a_h(diff0, zero); 262 abs_diff1 = __msa_add_a_h(diff1, zero); 263 temp0_h = abs_diff0 < delta_vec; 264 temp1_h = abs_diff1 < delta_vec; 265 abs_diff0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, 266 (v16u8)delta_vec, 267 (v16u8)temp0_h); 268 abs_diff1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1, 269 (v16u8)delta_vec, 270 (v16u8)temp1_h); 271 SUB2(zero, abs_diff0, zero, abs_diff1, abs_diff_neg0, 272 abs_diff_neg1); 273 temp0_h = __msa_clei_s_h(diff0, 0); 274 temp1_h = __msa_clei_s_h(diff1, 0); 275 adjust0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, 276 (v16u8)abs_diff_neg0, 277 (v16u8)temp0_h); 278 adjust1 = (v8i16)__msa_bmnz_v((v16u8)abs_diff1, 279 (v16u8)abs_diff_neg1, 280 (v16u8)temp1_h); 281 ILVRL_H2_SH(zero, running_avg_y1, temp2_h, temp3_h); 282 ADD2(temp2_h, adjust0, temp3_h, adjust1, adjust2, adjust3); 283 MAXI_SH2_SH(adjust2, adjust3, 0); 284 SAT_UH2_SH(adjust2, adjust3, 7); 285 temp0_h = __msa_ceqi_h(diff0, 0); 286 temp1_h = __msa_ceqi_h(diff1, 0); 287 adjust2 = (v8i16)__msa_bmz_v((v16u8)adjust2, (v16u8)temp2_h, 288 (v16u8)temp0_h); 289 adjust3 = (v8i16)__msa_bmz_v((v16u8)adjust3, (v16u8)temp3_h, 290 (v16u8)temp1_h); 291 adjust0 = (v8i16)__msa_bmz_v((v16u8)adjust0, (v16u8)zero, 292 (v16u8)temp0_h); 293 adjust1 = (v8i16)__msa_bmz_v((v16u8)adjust1, (v16u8)zero, 294 (v16u8)temp1_h); 295 ADD2(col_sum2, adjust0, col_sum3, adjust1, col_sum2, col_sum3); 296 running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust3, 297 (v16i8)adjust2); 298 ST_UB(running_avg_y, running_avg_y_ptr); 299 
running_avg_y_ptr += avg_y_stride; 300 } 301 302 col_sum2 = __msa_min_s_h(col_sum2, val_127); 303 col_sum3 = __msa_min_s_h(col_sum3, val_127); 304 temp0_h = col_sum2 + col_sum3; 305 temp0_w = __msa_hadd_s_w(temp0_h, temp0_h); 306 temp0_d = __msa_hadd_s_d(temp0_w, temp0_w); 307 temp1_d = __msa_splati_d(temp0_d, 1); 308 temp0_d += (v2i64)temp1_d; 309 sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0); 310 if (abs(sum_diff) > SUM_DIFF_THRESHOLD) 311 { 312 return COPY_BLOCK; 313 } 314 } 315 else 316 { 317 return COPY_BLOCK; 318 } 319 } 320 321 LD_UB8(sig_start, sig_stride, src0, src1, src2, src3, src4, src5, src6, 322 src7); 323 sig_start += (8 * sig_stride); 324 LD_UB8(sig_start, sig_stride, src8, src9, src10, src11, src12, src13, 325 src14, src15); 326 327 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, running_avg_y_start, 328 avg_y_stride); 329 running_avg_y_start += (8 * avg_y_stride); 330 ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, 331 running_avg_y_start, avg_y_stride); 332 333 return FILTER_BLOCK; 334} 335 336int32_t vp8_denoiser_filter_uv_msa(uint8_t *mc_running_avg_y_ptr, 337 int32_t mc_avg_y_stride, 338 uint8_t *running_avg_y_ptr, 339 int32_t avg_y_stride, 340 uint8_t *sig_ptr, 341 int32_t sig_stride, 342 uint32_t motion_magnitude, 343 int32_t increase_denoising) 344{ 345 uint8_t *running_avg_y_start = running_avg_y_ptr; 346 uint8_t *sig_start = sig_ptr; 347 int32_t cnt = 0; 348 int32_t sum_diff = 0; 349 int32_t shift_inc1 = 3; 350 int32_t delta = 0; 351 int32_t sum_block = 0; 352 int32_t sum_diff_thresh; 353 int64_t dst0, dst1, src0, src1, src2, src3; 354 v16u8 mc_running_avg_y0, running_avg_y, sig0; 355 v16u8 mc_running_avg_y1, running_avg_y1, sig1; 356 v16u8 sig2, sig3, sig4, sig5, sig6, sig7; 357 v16u8 coeff0; 358 v8i16 diff0, abs_diff0, abs_diff_neg0; 359 v8i16 adjust0, adjust2; 360 v8i16 shift_inc1_vec = { 0 }; 361 v8i16 col_sum0 = { 0 }; 362 v8i16 temp0_h, temp2_h, cmp, delta_vec; 363 v4i32 temp0_w; 364 v2i64 temp0_d, temp1_d; 365 
v16i8 zero = { 0 }; 366 v8i16 one = __msa_ldi_h(1); 367 v8i16 four = __msa_ldi_h(4); 368 v8i16 adj_val = { 6, 4, 3, 0, -6, -4, -3, 0 }; 369 370 371 sig0 = LD_UB(sig_ptr); 372 sig_ptr += sig_stride; 373 temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig0); 374 sig1 = LD_UB(sig_ptr); 375 sig_ptr += sig_stride; 376 temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig1); 377 sig2 = LD_UB(sig_ptr); 378 sig_ptr += sig_stride; 379 temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig2); 380 sig3 = LD_UB(sig_ptr); 381 sig_ptr += sig_stride; 382 temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig3); 383 sig4 = LD_UB(sig_ptr); 384 sig_ptr += sig_stride; 385 temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig4); 386 sig5 = LD_UB(sig_ptr); 387 sig_ptr += sig_stride; 388 temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig5); 389 sig6 = LD_UB(sig_ptr); 390 sig_ptr += sig_stride; 391 temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig6); 392 sig7 = LD_UB(sig_ptr); 393 sig_ptr += sig_stride; 394 temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig7); 395 temp0_w = __msa_hadd_s_w(temp0_h, temp0_h); 396 temp0_d = __msa_hadd_s_d(temp0_w, temp0_w); 397 temp1_d = __msa_splati_d(temp0_d, 1); 398 temp0_d += temp1_d; 399 sum_block = __msa_copy_s_w((v4i32)temp0_d, 0); 400 sig_ptr -= sig_stride * 8; 401 402 if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) 403 { 404 return COPY_BLOCK; 405 } 406 407 if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) 408 { 409 adj_val = __msa_add_a_h(adj_val, one); 410 411 if (increase_denoising) 412 { 413 adj_val = __msa_add_a_h(adj_val, one); 414 shift_inc1 = 4; 415 } 416 417 temp0_h = (v8i16)zero - adj_val; 418 adj_val = (v8i16)__msa_ilvev_d((v2i64)temp0_h, (v2i64)adj_val); 419 } 420 421 adj_val = __msa_insert_h(adj_val, 3, cnt); 422 adj_val = __msa_insert_h(adj_val, 7, cnt); 423 shift_inc1_vec = __msa_fill_h(shift_inc1); 424 for (cnt = 4; cnt--;) 425 { 426 v8i16 mask0 = { 0 }; 427 mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr); 428 sig0 = LD_UB(sig_ptr); 429 sig_ptr += 
sig_stride; 430 mc_running_avg_y_ptr += mc_avg_y_stride; 431 mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr); 432 sig1 = LD_UB(sig_ptr); 433 coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y0, (v16i8)sig0); 434 diff0 = __msa_hsub_u_h(coeff0, coeff0); 435 abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero); 436 cmp = __msa_clei_s_h(abs_diff0, 15); 437 cmp = cmp & one; 438 mask0 += cmp; 439 cmp = __msa_clei_s_h(abs_diff0, 7); 440 cmp = cmp & one; 441 mask0 += cmp; 442 cmp = abs_diff0 < shift_inc1_vec; 443 cmp = cmp & one; 444 mask0 += cmp; 445 temp0_h = __msa_clei_s_h(diff0, 0); 446 temp0_h = temp0_h & four; 447 mask0 += temp0_h; 448 adjust0 = __msa_vshf_h(mask0, adj_val, adj_val); 449 temp2_h = __msa_ceqi_h(adjust0, 0); 450 adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, 451 (v16u8)temp2_h); 452 col_sum0 += adjust0; 453 temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig0); 454 temp0_h += adjust0; 455 temp0_h = __msa_maxi_s_h(temp0_h, 0); 456 temp0_h = (v8i16)__msa_sat_u_h((v8u16)temp0_h, 7); 457 temp2_h = (v8i16)__msa_pckev_b((v16i8)temp2_h, (v16i8)temp2_h); 458 running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp0_h, (v16i8)temp0_h); 459 running_avg_y = __msa_bmnz_v(running_avg_y, mc_running_avg_y0, 460 (v16u8)temp2_h); 461 dst0 = __msa_copy_s_d((v2i64)running_avg_y, 0); 462 SD(dst0, running_avg_y_ptr); 463 running_avg_y_ptr += avg_y_stride; 464 465 mask0 = __msa_ldi_h(0); 466 coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y1, (v16i8)sig1); 467 diff0 = __msa_hsub_u_h(coeff0, coeff0); 468 abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero); 469 cmp = __msa_clei_s_h(abs_diff0, 15); 470 cmp = cmp & one; 471 mask0 += cmp; 472 cmp = __msa_clei_s_h(abs_diff0, 7); 473 cmp = cmp & one; 474 mask0 += cmp; 475 cmp = abs_diff0 < shift_inc1_vec; 476 cmp = cmp & one; 477 mask0 += cmp; 478 temp0_h = __msa_clei_s_h(diff0, 0); 479 temp0_h = temp0_h & four; 480 mask0 += temp0_h; 481 adjust0 = __msa_vshf_h(mask0, adj_val, adj_val); 482 temp2_h = __msa_ceqi_h(adjust0, 0); 483 adjust0 = 
(v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, 484 (v16u8)temp2_h); 485 col_sum0 += adjust0; 486 temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig1); 487 temp0_h += adjust0; 488 temp0_h = __msa_maxi_s_h(temp0_h, 0); 489 temp0_h = (v8i16)__msa_sat_u_h((v8u16)temp0_h, 7); 490 491 temp2_h = (v8i16)__msa_pckev_b((v16i8)temp2_h, (v16i8)temp2_h); 492 running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp0_h, (v16i8)temp0_h); 493 running_avg_y = __msa_bmnz_v(running_avg_y, mc_running_avg_y1, 494 (v16u8)temp2_h); 495 dst1 = __msa_copy_s_d((v2i64)running_avg_y, 0); 496 SD(dst1, running_avg_y_ptr); 497 498 sig_ptr += sig_stride; 499 mc_running_avg_y_ptr += mc_avg_y_stride; 500 running_avg_y_ptr += avg_y_stride; 501 } 502 503 temp0_h = col_sum0; 504 temp0_w = __msa_hadd_s_w(temp0_h, temp0_h); 505 temp0_d = __msa_hadd_s_d(temp0_w, temp0_w); 506 temp1_d = __msa_splati_d(temp0_d, 1); 507 temp0_d += temp1_d; 508 sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0); 509 sig_ptr -= sig_stride * 8; 510 mc_running_avg_y_ptr -= mc_avg_y_stride * 8; 511 running_avg_y_ptr -= avg_y_stride * 8; 512 sum_diff_thresh = SUM_DIFF_THRESHOLD_UV; 513 514 if (increase_denoising) 515 { 516 sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV; 517 } 518 519 if (abs(sum_diff) > sum_diff_thresh) 520 { 521 delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1; 522 delta_vec = __msa_fill_h(delta); 523 if (delta < 4) 524 { 525 for (cnt = 4; cnt--;) 526 { 527 running_avg_y = LD_UB(running_avg_y_ptr); 528 mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr); 529 sig0 = LD_UB(sig_ptr); 530 /* Update pointers for next iteration. 
*/ 531 sig_ptr += sig_stride; 532 mc_running_avg_y_ptr += mc_avg_y_stride; 533 running_avg_y_ptr += avg_y_stride; 534 535 mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr); 536 sig1 = LD_UB(sig_ptr); 537 running_avg_y1 = LD_UB(running_avg_y_ptr); 538 539 coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y0, 540 (v16i8)sig0); 541 diff0 = __msa_hsub_u_h(coeff0, coeff0); 542 abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero); 543 temp0_h = delta_vec < abs_diff0; 544 abs_diff0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, 545 (v16u8)delta_vec, 546 (v16u8)temp0_h); 547 abs_diff_neg0 = (v8i16)zero - abs_diff0; 548 temp0_h = __msa_clei_s_h(diff0, 0); 549 adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, 550 (v16u8)abs_diff_neg0, 551 (v16u8)temp0_h); 552 temp2_h = (v8i16)__msa_ilvr_b(zero, (v16i8)running_avg_y); 553 adjust2 = temp2_h + adjust0; 554 adjust2 = __msa_maxi_s_h(adjust2, 0); 555 adjust2 = (v8i16)__msa_sat_u_h((v8u16)adjust2, 7); 556 temp0_h = __msa_ceqi_h(diff0, 0); 557 adjust2 = (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h, 558 (v16u8)temp0_h); 559 adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, 560 (v16u8)temp0_h); 561 col_sum0 += adjust0; 562 running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust2, 563 (v16i8)adjust2); 564 dst0 = __msa_copy_s_d((v2i64)running_avg_y, 0); 565 SD(dst0, running_avg_y_ptr - avg_y_stride); 566 567 coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y1, 568 (v16i8)sig1); 569 diff0 = __msa_hsub_u_h(coeff0, coeff0); 570 abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero); 571 temp0_h = delta_vec < abs_diff0; 572 abs_diff0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, 573 (v16u8)delta_vec, 574 (v16u8)temp0_h); 575 abs_diff_neg0 = (v8i16)zero - abs_diff0; 576 temp0_h = __msa_clei_s_h(diff0, 0); 577 adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, 578 (v16u8)abs_diff_neg0, 579 (v16u8)temp0_h); 580 temp2_h = (v8i16)__msa_ilvr_b(zero, (v16i8)running_avg_y1); 581 adjust2 = temp2_h + adjust0; 582 adjust2 = __msa_maxi_s_h(adjust2, 0); 583 adjust2 = 
(v8i16)__msa_sat_u_h((v8u16)adjust2, 7); 584 temp0_h = __msa_ceqi_h(diff0, 0); 585 adjust2 = (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h, 586 (v16u8)temp0_h); 587 adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, 588 (v16u8)temp0_h); 589 col_sum0 += adjust0; 590 running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust2, 591 (v16i8)adjust2); 592 dst1 = __msa_copy_s_d((v2i64)running_avg_y, 0); 593 SD(dst1, running_avg_y_ptr); 594 running_avg_y_ptr += avg_y_stride; 595 } 596 597 temp0_h = col_sum0; 598 temp0_w = __msa_hadd_s_w(temp0_h, temp0_h); 599 temp0_d = __msa_hadd_s_d(temp0_w, temp0_w); 600 temp1_d = __msa_splati_d(temp0_d, 1); 601 temp0_d += temp1_d; 602 sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0); 603 604 if (abs(sum_diff) > sum_diff_thresh) 605 { 606 return COPY_BLOCK; 607 } 608 } 609 else 610 { 611 return COPY_BLOCK; 612 } 613 } 614 615 LD4(sig_start, sig_stride, src0, src1, src2, src3); 616 sig_start += (4 * sig_stride); 617 SD4(src0, src1, src2, src3, running_avg_y_start, avg_y_stride); 618 running_avg_y_start += (4 * avg_y_stride); 619 620 LD4(sig_start, sig_stride, src0, src1, src2, src3); 621 SD4(src0, src1, src2, src3, running_avg_y_start, avg_y_stride); 622 623 return FILTER_BLOCK; 624} 625