1da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* 2da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * 4da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 5da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 6da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 7da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 8da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 9da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian */ 10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h" 12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_ports/mem.h" 13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/mips/macros_msa.h" 14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/variance.h" 15da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 16da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic const uint8_t bilinear_filters_msa[8][2] = { 177bc9febe8749e98a3812a0dc4380ceae75c29450Johann { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, 187bc9febe8749e98a3812a0dc4380ceae75c29450Johann { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, 19da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}; 20da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 217bc9febe8749e98a3812a0dc4380ceae75c29450Johann#define CALC_MSE_AVG_B(src, ref, var, sub) \ 227bc9febe8749e98a3812a0dc4380ceae75c29450Johann { \ 237bc9febe8749e98a3812a0dc4380ceae75c29450Johann v16u8 src_l0_m, src_l1_m; \ 247bc9febe8749e98a3812a0dc4380ceae75c29450Johann v8i16 res_l0_m, res_l1_m; \ 257bc9febe8749e98a3812a0dc4380ceae75c29450Johann \ 267bc9febe8749e98a3812a0dc4380ceae75c29450Johann ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ 277bc9febe8749e98a3812a0dc4380ceae75c29450Johann HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ 287bc9febe8749e98a3812a0dc4380ceae75c29450Johann DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ 297bc9febe8749e98a3812a0dc4380ceae75c29450Johann \ 307bc9febe8749e98a3812a0dc4380ceae75c29450Johann sub += res_l0_m + res_l1_m; \ 317bc9febe8749e98a3812a0dc4380ceae75c29450Johann } 32da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 337bc9febe8749e98a3812a0dc4380ceae75c29450Johann#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) 34da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 35da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define VARIANCE_LARGE_WxH(sse, diff, shift) \ 36da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sse - (((int64_t)diff * diff) >> shift) 37da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr, 39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t src_stride, 40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const uint8_t *ref_ptr, 41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t ref_stride, 427bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *sec_pred, int32_t height, 43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t *diff) { 44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t ht_cnt; 45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t src0, src1, src2, src3; 46da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t ref0, ref1, ref2, ref3; 47da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 pred, src = { 0 }; 48da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 ref = { 0 }; 49da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 50da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 51da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (ht_cnt = (height >> 2); ht_cnt--;) { 53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian pred = LD_UB(sec_pred); 54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LW4(src_ptr, src_stride, src0, src1, src2, src3); 56da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += (4 * src_stride); 57da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); 58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += (4 * ref_stride); 59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian INSERT_W4_UB(src0, src1, src2, src3, src); 61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); 62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 63da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src = __msa_aver_u_b(src, pred); 64da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src, ref, var, avg); 65da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 66da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 71da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 72da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 73da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr, 74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t src_stride, 75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const uint8_t *ref_ptr, 76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t ref_stride, 777bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *sec_pred, int32_t height, 78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t *diff) { 79da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t ht_cnt; 80da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3; 81da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 ref0, ref1, ref2, ref3; 82da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 pred0, pred1; 83da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (ht_cnt = (height >> 2); ht_cnt--;) { 87da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(sec_pred, 16, pred0, pred1); 88da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 32; 89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); 90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += (4 * src_stride); 91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); 92da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += (4 * ref_stride); 93da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 947bc9febe8749e98a3812a0dc4380ceae75c29450Johann PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, 957bc9febe8749e98a3812a0dc4380ceae75c29450Johann ref0, ref1); 96da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); 97da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg); 98da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg); 99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr, 108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t src_stride, 109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const uint8_t *ref_ptr, 110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t ref_stride, 111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const uint8_t *sec_pred, 1127bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t ht_cnt; 114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src, ref, pred; 115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (ht_cnt = (height >> 2); ht_cnt--;) { 119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian pred = LD_UB(sec_pred); 120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src = LD_UB(src_ptr); 122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref = LD_UB(ref_ptr); 124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src = __msa_aver_u_b(src, pred); 126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src, ref, var, avg); 127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian pred = LD_UB(sec_pred); 129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src = LD_UB(src_ptr); 131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref = LD_UB(ref_ptr); 133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src = __msa_aver_u_b(src, pred); 135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src, ref, var, avg); 136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian pred = LD_UB(sec_pred); 138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src = LD_UB(src_ptr); 140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref = LD_UB(ref_ptr); 142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src = __msa_aver_u_b(src, pred); 144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src, ref, var, avg); 145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian pred = LD_UB(sec_pred); 147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src = LD_UB(src_ptr); 149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref = LD_UB(ref_ptr); 151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src = __msa_aver_u_b(src, pred); 153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src, ref, var, avg); 154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr, 163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t src_stride, 164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const uint8_t *ref_ptr, 165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t ref_stride, 166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const uint8_t *sec_pred, 1677bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t ht_cnt; 169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, ref0, ref1, pred0, pred1; 170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (ht_cnt = (height >> 2); ht_cnt--;) { 174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(sec_pred, 16, pred0, pred1); 175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 32; 176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(src_ptr, 16, src0, src1); 177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(ref_ptr, 16, ref0, ref1); 179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); 181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg); 182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg); 183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(sec_pred, 16, pred0, pred1); 185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 32; 186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(src_ptr, 16, src0, src1); 187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(ref_ptr, 16, ref0, ref1); 189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); 191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg); 192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg); 193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(sec_pred, 16, pred0, pred1); 195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 32; 196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(src_ptr, 16, src0, src1); 197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(ref_ptr, 16, ref0, ref1); 199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); 201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg); 202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg); 203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(sec_pred, 16, pred0, pred1); 205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 32; 206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(src_ptr, 16, src0, src1); 207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(ref_ptr, 16, ref0, ref1); 209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); 211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg); 212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg); 213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr, 222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t src_stride, 223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const uint8_t *ref_ptr, 224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t ref_stride, 2257bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *sec_pred, int32_t *diff) { 226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t ht_cnt; 227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, ref0, ref1, pred0, pred1; 228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg0 = { 0 }; 229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg1 = { 0 }; 230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (ht_cnt = 16; ht_cnt--;) { 233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(sec_pred, 16, pred0, pred1); 234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 32; 235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(src_ptr, 16, src0, src1); 236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(ref_ptr, 16, ref0, ref1); 238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); 240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg0); 241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg1); 242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(sec_pred, 16, pred0, pred1); 244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 32; 245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(src_ptr, 16, src0, src1); 246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(ref_ptr, 16, ref0, ref1); 248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); 250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg0); 251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg1); 252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(sec_pred, 16, pred0, pred1); 254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 32; 255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(src_ptr, 16, src0, src1); 256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(ref_ptr, 16, ref0, ref1); 258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); 260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg0); 261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg1); 262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(sec_pred, 16, pred0, pred1); 264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 32; 265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(src_ptr, 16, src0, src1); 266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(ref_ptr, 16, ref0, ref1); 268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); 270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg0); 271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg1); 272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg0, avg0); 275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec += __msa_hadd_s_w(avg1, avg1); 276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, 282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t src_stride, 283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const uint8_t *ref_ptr, 284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t ref_stride, 2857bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *sec_pred, int32_t *diff) { 286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t ht_cnt; 287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3; 288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 ref0, ref1, ref2, ref3; 289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 pred0, pred1, pred2, pred3; 290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg0 = { 0 }; 291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg1 = { 0 }; 292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (ht_cnt = 16; ht_cnt--;) { 295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); 296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 64; 297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src_ptr, 16, src0, src1, src2, src3); 298da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 299da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); 300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 3017bc9febe8749e98a3812a0dc4380ceae75c29450Johann AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, 3027bc9febe8749e98a3812a0dc4380ceae75c29450Johann src2, src3); 303da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg0); 304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src2, ref2, var, avg0); 305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg1); 306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src3, ref3, var, avg1); 307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 308da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); 309da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 64; 310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src_ptr, 16, src0, src1, src2, src3); 311da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); 313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 3147bc9febe8749e98a3812a0dc4380ceae75c29450Johann AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, 3157bc9febe8749e98a3812a0dc4380ceae75c29450Johann src2, src3); 316da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg0); 317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src2, ref2, var, avg0); 318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg1); 319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src3, ref3, var, avg1); 320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg0, avg0); 323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec += __msa_hadd_s_w(avg1, avg1); 324da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 325da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 326da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 328da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, 331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t src_stride, 332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const uint8_t *ref_ptr, 333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t ref_stride, 3347bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *sec_pred, int32_t *diff) { 335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t ht_cnt; 336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3; 337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 ref0, ref1, ref2, ref3; 338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 pred0, pred1, pred2, pred3; 339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg0 = { 0 }; 340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg1 = { 0 }; 341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg2 = { 0 }; 342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg3 = { 0 }; 343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 345da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (ht_cnt = 32; ht_cnt--;) { 346da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); 347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 64; 348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src_ptr, 16, src0, src1, src2, src3); 349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); 351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 3527bc9febe8749e98a3812a0dc4380ceae75c29450Johann AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, 3537bc9febe8749e98a3812a0dc4380ceae75c29450Johann src2, src3); 354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg0); 355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg1); 356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src2, ref2, var, avg2); 357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src3, ref3, var, avg3); 358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); 360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 64; 361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src_ptr, 16, src0, src1, src2, src3); 362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src_ptr += src_stride; 363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); 364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ref_ptr += ref_stride; 3657bc9febe8749e98a3812a0dc4380ceae75c29450Johann AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, 3667bc9febe8749e98a3812a0dc4380ceae75c29450Johann src2, src3); 367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg0); 368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg1); 369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src2, ref2, var, avg2); 370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src3, ref3, var, avg3); 371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg0, avg0); 374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec += __msa_hadd_s_w(avg1, avg1); 375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec += __msa_hadd_s_w(avg2, avg2); 376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec += __msa_hadd_s_w(avg3, avg3); 377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3827bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_4width_h_msa( 3837bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 3847bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { 385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t ref0, ref1, ref2, ref3; 388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt0, ref = { 0 }; 389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3; 390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 vec0, vec1, vec2, vec3; 392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter); 396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt0 = (v16u8)__msa_fill_h(filtval); 397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src, src_stride, src0, src1, src2, src3); 400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LW4(dst, dst_stride, ref0, ref1, ref2, ref3); 402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); 404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 4067bc9febe8749e98a3812a0dc4380ceae75c29450Johann DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, 4077bc9febe8749e98a3812a0dc4380ceae75c29450Johann vec2, vec3); 408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); 4097bc9febe8749e98a3812a0dc4380ceae75c29450Johann PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, 4107bc9febe8749e98a3812a0dc4380ceae75c29450Johann src2, src3); 411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); 412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); 413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref, var, avg); 414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 4227bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_8width_h_msa( 4237bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 4247bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { 425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt0, out, ref0, ref1, ref2, ref3; 428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3; 429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 vec0, vec1, vec2, vec3; 431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter); 435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt0 = (v16u8)__msa_fill_h(filtval); 436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src, src_stride, src0, src1, src2, src3); 439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); 441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); 444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 4467bc9febe8749e98a3812a0dc4380ceae75c29450Johann DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, 4477bc9febe8749e98a3812a0dc4380ceae75c29450Johann vec2, vec3); 448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); 4497bc9febe8749e98a3812a0dc4380ceae75c29450Johann PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, 4507bc9febe8749e98a3812a0dc4380ceae75c29450Johann src2, src3); 451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); 452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out, ref0, var, avg); 453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); 454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out, ref1, var, avg); 455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 4637bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_16width_h_msa( 4647bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 4657bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { 466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 dst0, dst1, dst2, dst3, filt0; 471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 out0, out1, out2, out3, out4, out5, out6, out7; 473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter); 477da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt0 = (v16u8)__msa_fill_h(filtval); 478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src, src_stride, src0, src2, src4, src6); 481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); 489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); 4907bc9febe8749e98a3812a0dc4380ceae75c29450Johann DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, 4917bc9febe8749e98a3812a0dc4380ceae75c29450Johann out2, out3); 4927bc9febe8749e98a3812a0dc4380ceae75c29450Johann DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, 4937bc9febe8749e98a3812a0dc4380ceae75c29450Johann out6, out7); 494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); 495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); 4967bc9febe8749e98a3812a0dc4380ceae75c29450Johann PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1, 4977bc9febe8749e98a3812a0dc4380ceae75c29450Johann src2, src3); 498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, dst0, var, avg); 499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, dst1, var, avg); 500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src2, dst2, var, avg); 501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src3, dst3, var, avg); 502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 5107bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_32width_h_msa( 5117bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 5127bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { 513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt, sse = 0; 514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t diff0[2]; 515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { 517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, 518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter, height, &diff0[loop_cnt]); 519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = diff0[0] + diff0[1]; 524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 525da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return sse; 526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 5287bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_64width_h_msa( 5297bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 5307bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { 531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt, sse = 0; 532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t diff0[4]; 533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { 535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, 536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter, height, &diff0[loop_cnt]); 537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 539da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; 542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return sse; 544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 545da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 5467bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_4width_v_msa( 5477bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 5487bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { 549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t ref0, ref1, ref2, ref3; 552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3, src4, out; 553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src10_r, src32_r, src21_r, src43_r; 554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 ref = { 0 }; 555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src2110, src4332; 556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt0; 557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 tmp0, tmp1; 560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter); 562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt0 = (v16u8)__msa_fill_h(filtval); 563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = LD_UB(src); 565da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 566da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src, src_stride, src1, src2, src3, src4); 569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 570da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LW4(dst, dst_stride, ref0, ref1, ref2, ref3); 571da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); 5747bc9febe8749e98a3812a0dc4380ceae75c29450Johann ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 5757bc9febe8749e98a3812a0dc4380ceae75c29450Johann src32_r, src43_r); 576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); 578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out, ref, var, avg); 581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = src4; 582da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 586da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 587da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 5907bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_8width_v_msa( 5917bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 5927bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { 593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3, src4; 596da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 ref0, ref1, ref2, ref3; 597da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 vec0, vec1, vec2, vec3; 598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 tmp0, tmp1, tmp2, tmp3; 599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt0; 600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 601da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 602da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 603da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter); 604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt0 = (v16u8)__msa_fill_h(filtval); 605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 606da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = LD_UB(src); 607da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src, src_stride, src1, src2, src3, src4); 611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 612da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); 613da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); 6167bc9febe8749e98a3812a0dc4380ceae75c29450Johann ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, 6177bc9febe8749e98a3812a0dc4380ceae75c29450Johann vec3); 6187bc9febe8749e98a3812a0dc4380ceae75c29450Johann DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, 6197bc9febe8749e98a3812a0dc4380ceae75c29450Johann tmp2, tmp3); 620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); 621da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); 622da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg); 623da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg); 624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = src4; 625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 626da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 627da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 628da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 629da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 632da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 6337bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_16width_v_msa( 6347bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 6357bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { 636da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 637da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 638da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 ref0, ref1, ref2, ref3; 639da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3, src4; 640da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 out0, out1, out2, out3; 641da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 642da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 tmp0, tmp1, tmp2, tmp3; 643da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt0; 644da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 645da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 646da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 647da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter); 648da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt0 = (v16u8)__msa_fill_h(filtval); 649da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 650da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = LD_UB(src); 651da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 652da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 653da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 654da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src, src_stride, src1, src2, src3, src4); 655da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 656da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); 657da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 658da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 659da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); 660da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); 661da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); 662da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 663da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 664da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 665da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); 666da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); 667da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); 668da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 669da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); 670da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 671da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); 672da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 673da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 674da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); 675da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 676da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); 677da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 678da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = src4; 679da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 680da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out0, ref0, var, avg); 681da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out1, ref1, var, avg); 682da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out2, ref2, var, avg); 683da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out3, ref3, var, avg); 684da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 685da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 686da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 687da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 688da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 689da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 690da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 691da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 6927bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_32width_v_msa( 6937bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 6947bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { 695da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt, sse = 0; 696da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t diff0[2]; 697da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 698da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { 699da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, 700da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter, height, &diff0[loop_cnt]); 701da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 702da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 703da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 704da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 705da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = diff0[0] + diff0[1]; 706da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 707da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return sse; 708da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 7107bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_64width_v_msa( 7117bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 7127bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { 713da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt, sse = 0; 714da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t diff0[4]; 715da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 716da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { 717da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, 718da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter, height, &diff0[loop_cnt]); 719da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 720da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 721da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 722da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 723da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; 724da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 725da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return sse; 726da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 727da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 7287bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_4width_hv_msa( 7297bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 7307bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, 7317bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 732da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 733da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 734da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t ref0, ref1, ref2, ref3; 735da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3, src4; 736da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 out, ref = { 0 }; 737da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt_vt, filt_hz, vec0, vec1; 738da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; 739da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4; 740da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 tmp0, tmp1; 741da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 742da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 743da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 744da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter_horiz); 745da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_fill_h(filtval); 746da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter_vert); 747da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_fill_h(filtval); 748da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 749da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = LD_UB(src); 750da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 751da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 752da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 753da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src, src_stride, src1, src2, src3, src4); 754da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 755da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LW4(dst, dst_stride, ref0, ref1, ref2, ref3); 756da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 757da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); 758da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); 759da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); 760da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 761da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); 762da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); 763da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 764da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 765da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 766da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 767da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out, ref, var, avg); 768da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = src4; 769da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 770da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 771da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 772da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 773da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 774da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 775da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 776da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 7777bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_8width_hv_msa( 7787bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 7797bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, 7807bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 781da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 782da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 783da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 ref0, ref1, ref2, ref3; 784da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3, src4; 785da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 out0, out1; 786da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 787da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out0, hz_out1; 788da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 tmp0, tmp1, tmp2, tmp3; 789da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt_vt, filt_hz, vec0; 790da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 791da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 792da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 793da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter_horiz); 794da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_fill_h(filtval); 795da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter_vert); 796da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_fill_h(filtval); 797da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 798da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = LD_UB(src); 799da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 800da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 801da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 802da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 803da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src, src_stride, src1, src2, src3, src4); 804da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 805da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); 806da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 807da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 808da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); 809da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 810da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 811da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = __msa_dotp_u_h(vec0, filt_vt); 812da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); 813da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 814da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = __msa_dotp_u_h(vec0, filt_vt); 815da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 816da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); 817da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 818da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = __msa_dotp_u_h(vec0, filt_vt); 819da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 820da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 821da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = __msa_dotp_u_h(vec0, filt_vt); 822da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 823da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 824da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out0, ref0, var, avg); 825da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out1, ref1, var, avg); 826da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 827da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 828da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 829da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 830da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 831da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 832da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 833da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 8347bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_16width_hv_msa( 8357bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 8367bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, 8377bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 838da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 839da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 840da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 841da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 ref0, ref1, ref2, ref3; 842da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt_hz, filt_vt, vec0, vec1; 843da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 844da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out0, hz_out1, hz_out2, hz_out3; 845da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 tmp0, tmp1; 846da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 847da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 848da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 849da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter_horiz); 850da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_fill_h(filtval); 851da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter_vert); 852da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_fill_h(filtval); 853da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 854da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(src, 8, src0, src1); 855da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 856da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 857da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 858da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 859da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 860da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 861da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src, src_stride, src0, src2, src4, src6); 862da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src + 8, src_stride, src1, src3, src5, src7); 863da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 864da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); 865da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 866da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 867da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 868da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 869da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 870da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 871da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 872da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 873da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 874da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); 875da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); 876da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 877da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 878da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 879da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 880da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 881da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 882da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); 883da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 884da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 885da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 886da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 887da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 888da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); 889da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); 890da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 891da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 892da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 893da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 894da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 895da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg); 896da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg); 897da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src2, ref2, var, avg); 898da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src3, ref3, var, avg); 899da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 900da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 901da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 902da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 903da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 904da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 905da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 906da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 9077bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_32width_hv_msa( 9087bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 9097bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, 9107bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 911da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt, sse = 0; 912da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t diff0[2]; 913da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 914da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { 915da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, 916da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_horiz, filter_vert, height, 917da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian &diff0[loop_cnt]); 918da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 919da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 920da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 921da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 922da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = diff0[0] + diff0[1]; 923da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 924da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return sse; 925da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 926da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 9277bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_64width_hv_msa( 9287bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 9297bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, 9307bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 931da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt, sse = 0; 932da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t diff0[4]; 933da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 934da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { 935da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, 936da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_horiz, filter_vert, height, 937da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian &diff0[loop_cnt]); 938da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 939da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 940da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 941da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 942da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; 943da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 944da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return sse; 945da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 946da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 9477bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_avg_sse_diff_4width_h_msa( 9487bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 9497bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, 9507bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 951da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 952da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 953da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t ref0, ref1, ref2, ref3; 954da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 out, pred, filt0, ref = { 0 }; 955da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3; 956da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 957da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 vec0, vec1, vec2, vec3; 958da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 959da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 960da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 961da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter); 962da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt0 = (v16u8)__msa_fill_h(filtval); 963da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 964da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 965da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src, src_stride, src0, src1, src2, src3); 966da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 967da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian pred = LD_UB(sec_pred); 968da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 969da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LW4(dst, dst_stride, ref0, ref1, ref2, ref3); 970da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 971da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 972da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); 973da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 974da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 9757bc9febe8749e98a3812a0dc4380ceae75c29450Johann DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, 9767bc9febe8749e98a3812a0dc4380ceae75c29450Johann vec2, vec3); 977da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); 9787bc9febe8749e98a3812a0dc4380ceae75c29450Johann PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, 9797bc9febe8749e98a3812a0dc4380ceae75c29450Johann src2, src3); 980da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); 981da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); 982da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = __msa_aver_u_b(out, pred); 983da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out, ref, var, avg); 984da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 985da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 986da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 987da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 988da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 989da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 990da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 991da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 9927bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_avg_sse_diff_8width_h_msa( 9937bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 9947bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, 9957bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 996da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 997da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 998da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 out, pred, filt0; 999da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 ref0, ref1, ref2, ref3; 1000da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3; 1001da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 1002da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 vec0, vec1, vec2, vec3; 1003da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 1004da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 1005da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1006da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter); 1007da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt0 = (v16u8)__msa_fill_h(filtval); 1008da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1009da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 1010da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src, src_stride, src0, src1, src2, src3); 1011da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 1012da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); 1013da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 1014da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1015da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); 1016da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 1017da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 10187bc9febe8749e98a3812a0dc4380ceae75c29450Johann DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, 10197bc9febe8749e98a3812a0dc4380ceae75c29450Johann vec2, vec3); 1020da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); 10217bc9febe8749e98a3812a0dc4380ceae75c29450Johann PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, 10227bc9febe8749e98a3812a0dc4380ceae75c29450Johann src2, src3); 1023da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); 1024da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1025da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian pred = LD_UB(sec_pred); 1026da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 1027da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = __msa_aver_u_b(out, pred); 1028da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out, ref0, var, avg); 1029da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); 1030da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian pred = LD_UB(sec_pred); 1031da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 1032da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = __msa_aver_u_b(out, pred); 1033da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out, ref1, var, avg); 1034da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1035da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1036da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 1037da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 1038da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1039da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 1040da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1041da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 10427bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t subpel_avg_ssediff_16w_h_msa( 10437bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 10447bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, 10457bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff, int32_t width) { 1046da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 1047da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 1048da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 1049da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 1050da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 dst0, dst1, dst2, dst3; 1051da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 tmp0, tmp1, tmp2, tmp3; 1052da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 pred0, pred1, pred2, pred3, filt0; 1053da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1054da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 out0, out1, out2, out3, out4, out5, out6, out7; 1055da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 1056da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 1057da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1058da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter); 1059da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt0 = (v16u8)__msa_fill_h(filtval); 1060da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1061da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 1062da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src, src_stride, src0, src2, src4, src6); 1063da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 1064da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 1065da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 1066da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 1067da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); 1068da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += (4 * width); 1069da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1070da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 1071da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 1072da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); 1073da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); 10747bc9febe8749e98a3812a0dc4380ceae75c29450Johann DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, 10757bc9febe8749e98a3812a0dc4380ceae75c29450Johann out2, out3); 10767bc9febe8749e98a3812a0dc4380ceae75c29450Johann DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, 10777bc9febe8749e98a3812a0dc4380ceae75c29450Johann out6, out7); 1078da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); 1079da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); 10807bc9febe8749e98a3812a0dc4380ceae75c29450Johann PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1, 10817bc9febe8749e98a3812a0dc4380ceae75c29450Johann tmp2, tmp3); 10827bc9febe8749e98a3812a0dc4380ceae75c29450Johann AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1, 10837bc9febe8749e98a3812a0dc4380ceae75c29450Johann tmp2, tmp3); 1084da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1085da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(tmp0, dst0, var, avg); 1086da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(tmp1, dst1, var, avg); 1087da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(tmp2, dst2, var, avg); 1088da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(tmp3, dst3, var, avg); 1089da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1090da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1091da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 1092da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 1093da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1094da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 1095da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1096da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 10977bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_avg_sse_diff_16width_h_msa( 10987bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 10997bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, 11007bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 1101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, 1102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred, filter, height, diff, 16); 1103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 11057bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_avg_sse_diff_32width_h_msa( 11067bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 11077bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, 11087bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 1109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt, sse = 0; 1110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t diff0[2]; 1111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { 11137bc9febe8749e98a3812a0dc4380ceae75c29450Johann sse += 11147bc9febe8749e98a3812a0dc4380ceae75c29450Johann subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, 11157bc9febe8749e98a3812a0dc4380ceae75c29450Johann filter, height, &diff0[loop_cnt], 32); 1116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 1117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 1118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 1119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = diff0[0] + diff0[1]; 1122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return sse; 1124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 11267bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_avg_sse_diff_64width_h_msa( 11277bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 11287bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, 11297bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 1130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt, sse = 0; 1131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t diff0[4]; 1132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { 11347bc9febe8749e98a3812a0dc4380ceae75c29450Johann sse += 11357bc9febe8749e98a3812a0dc4380ceae75c29450Johann subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, 11367bc9febe8749e98a3812a0dc4380ceae75c29450Johann filter, height, &diff0[loop_cnt], 64); 1137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 1138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 1139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 1140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; 1143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return sse; 1145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 11477bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_avg_sse_diff_4width_v_msa( 11487bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 11497bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, 11507bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 1151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 1152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 1153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t ref0, ref1, ref2, ref3; 1154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3, src4; 1155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src10_r, src32_r, src21_r, src43_r; 1156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 out, pred, ref = { 0 }; 1157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src2110, src4332, filt0; 1158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 1159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 1160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 tmp0, tmp1; 1161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter); 1163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt0 = (v16u8)__msa_fill_h(filtval); 1164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = LD_UB(src); 1166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 1167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 1169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src, src_stride, src1, src2, src3, src4); 1170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 1171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian pred = LD_UB(sec_pred); 1172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 1173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LW4(dst, dst_stride, ref0, ref1, ref2, ref3); 1174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 1175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); 11777bc9febe8749e98a3812a0dc4380ceae75c29450Johann ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 11787bc9febe8749e98a3812a0dc4380ceae75c29450Johann src32_r, src43_r); 1179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 1180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); 1181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 1182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 1184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = __msa_aver_u_b(out, pred); 1185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out, ref, var, avg); 1186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = src4; 1187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 1190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 1191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 1193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 11957bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_avg_sse_diff_8width_v_msa( 11967bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 11977bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, 11987bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 1199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 1200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 1201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3, src4; 1202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 ref0, ref1, ref2, ref3; 1203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 pred0, pred1, filt0; 1204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 vec0, vec1, vec2, vec3; 1205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 tmp0, tmp1, tmp2, tmp3; 1206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 1207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 1208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter); 1210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt0 = (v16u8)__msa_fill_h(filtval); 1211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = LD_UB(src); 1213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 1214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 1216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src, src_stride, src1, src2, src3, src4); 1217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 1218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(sec_pred, 16, pred0, pred1); 1219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 32; 1220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); 1221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 1222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); 12237bc9febe8749e98a3812a0dc4380ceae75c29450Johann ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, 12247bc9febe8749e98a3812a0dc4380ceae75c29450Johann vec3); 12257bc9febe8749e98a3812a0dc4380ceae75c29450Johann DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, 12267bc9febe8749e98a3812a0dc4380ceae75c29450Johann tmp2, tmp3); 1227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); 1228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); 1229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); 1230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src0, ref0, var, avg); 1231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(src1, ref1, var, avg); 1232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = src4; 1234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 1237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 1238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 1240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 12427bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t subpel_avg_ssediff_16w_v_msa( 12437bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 12447bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, 12457bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff, int32_t width) { 1246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 1247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 1248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 ref0, ref1, ref2, ref3; 1249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 pred0, pred1, pred2, pred3; 1250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3, src4; 1251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 out0, out1, out2, out3, filt0; 1252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 tmp0, tmp1, tmp2, tmp3; 1254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 1255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 1256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter); 1258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt0 = (v16u8)__msa_fill_h(filtval); 1259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = LD_UB(src); 1261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 1262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 1264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src, src_stride, src1, src2, src3, src4); 1265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 1266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); 1267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += (4 * width); 1268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2); 1270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3); 1271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); 1272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 1273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 1274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6); 1276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7); 1277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); 1278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 1279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); 1280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); 1282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 1283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 1284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); 1286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 1287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); 1288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = src4; 1290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); 1291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 1292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 12937bc9febe8749e98a3812a0dc4380ceae75c29450Johann AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, 12947bc9febe8749e98a3812a0dc4380ceae75c29450Johann out2, out3); 1295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out0, ref0, var, avg); 1297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out1, ref1, var, avg); 1298da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out2, ref2, var, avg); 1299da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out3, ref3, var, avg); 1300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1301da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1302da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 1303da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 1304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 1306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 13087bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_avg_sse_diff_16width_v_msa( 13097bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 13107bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, 13117bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 1312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, 1313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred, filter, height, diff, 16); 1314da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1315da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 13167bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_avg_sse_diff_32width_v_msa( 13177bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 13187bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, 13197bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 1320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt, sse = 0; 1321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t diff0[2]; 1322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { 13247bc9febe8749e98a3812a0dc4380ceae75c29450Johann sse += 13257bc9febe8749e98a3812a0dc4380ceae75c29450Johann subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, 13267bc9febe8749e98a3812a0dc4380ceae75c29450Johann filter, height, &diff0[loop_cnt], 32); 1327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 1328da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 1329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 1330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = diff0[0] + diff0[1]; 1333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1334da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return sse; 1335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 13377bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_avg_sse_diff_64width_v_msa( 13387bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 13397bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, 13407bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t height, int32_t *diff) { 1341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt, sse = 0; 1342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t diff0[4]; 1343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { 13457bc9febe8749e98a3812a0dc4380ceae75c29450Johann sse += 13467bc9febe8749e98a3812a0dc4380ceae75c29450Johann subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, 13477bc9febe8749e98a3812a0dc4380ceae75c29450Johann filter, height, &diff0[loop_cnt], 64); 1348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 1349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 1350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 1351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1353da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; 1354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return sse; 1356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t sub_pixel_avg_sse_diff_4width_hv_msa( 13597bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 13607bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, 13617bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *filter_vert, int32_t height, int32_t *diff) { 1362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 1363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 1364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t ref0, ref1, ref2, ref3; 1365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3, src4; 1366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; 1367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt_hz, filt_vt, vec0, vec1; 1368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 out, pred, ref = { 0 }; 1369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; 1370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 1371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 1372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter_horiz); 1374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_fill_h(filtval); 1375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter_vert); 1376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_fill_h(filtval); 1377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = LD_UB(src); 1379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 1380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 1382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src, src_stride, src1, src2, src3, src4); 1383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 1384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian pred = LD_UB(sec_pred); 1385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 1386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LW4(dst, dst_stride, ref0, ref1, ref2, ref3); 1387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 1388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); 1389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); 1390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); 1391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 1392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); 1393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); 1394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 1395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 1396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 1397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 1398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = __msa_aver_u_b(out, pred); 1399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out, ref, var, avg); 1400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = src4; 1401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 1404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 1405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 1407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t sub_pixel_avg_sse_diff_8width_hv_msa( 14107bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 14117bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, 14127bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *filter_vert, int32_t height, int32_t *diff) { 1413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 1414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 1415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 ref0, ref1, ref2, ref3; 1416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3, src4; 1417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 pred0, pred1, out0, out1; 1418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt_hz, filt_vt, vec0; 1419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 1420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; 1421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 1422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 1423da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter_horiz); 1425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_fill_h(filtval); 1426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter_vert); 1427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_fill_h(filtval); 1428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = LD_UB(src); 1430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 1431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 1432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 1434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src, src_stride, src1, src2, src3, src4); 1435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 1436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(sec_pred, 16, pred0, pred1); 1437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 32; 1438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); 1439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 1440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); 1442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 1443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 1445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = __msa_dotp_u_h(vec0, filt_vt); 1446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); 1447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 1449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = __msa_dotp_u_h(vec0, filt_vt); 1450da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 1451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); 1452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 1454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = __msa_dotp_u_h(vec0, filt_vt); 1455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 1456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 1458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = __msa_dotp_u_h(vec0, filt_vt); 1459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 1461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); 1462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1); 1463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out0, ref0, var, avg); 1465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out1, ref1, var, avg); 1466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 1469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 1470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 1472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 14747bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t subpel_avg_ssediff_16w_hv_msa( 14757bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 14767bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, 14777bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { 1478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int16_t filtval; 1479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 1480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 1481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 ref0, ref1, ref2, ref3; 1482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 pred0, pred1, pred2, pred3; 1483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 out0, out1, out2, out3; 1484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt_hz, filt_vt, vec0, vec1; 1485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; 1486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; 1487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 avg = { 0 }; 1488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v4i32 vec, var = { 0 }; 1489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter_horiz); 1491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_fill_h(filtval); 1492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filtval = LH(filter_vert); 1493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_fill_h(filtval); 1494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB2(src, 8, src0, src1); 1496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 1497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 1499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 1500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 1502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src, src_stride, src0, src2, src4, src6); 1503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(src + 8, src_stride, src1, src3, src5, src7); 1504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 1505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); 1506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += (4 * width); 1507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 1509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 1510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 1511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 1512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 1513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 1514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); 1516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); 1517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 1518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 1519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 1520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 1521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 1523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); 1524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 1525da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 1526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 1527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 1528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); 1530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); 1531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 1532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 1533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 1534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 1535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); 1537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 1538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 15397bc9febe8749e98a3812a0dc4380ceae75c29450Johann AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, 15407bc9febe8749e98a3812a0dc4380ceae75c29450Johann out2, out3); 1541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out0, ref0, var, avg); 1543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out1, ref1, var, avg); 1544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out2, ref2, var, avg); 1545da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian CALC_MSE_AVG_B(out3, ref3, var, avg); 1546da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1547da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec = __msa_hadd_s_w(avg, avg); 1549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = HADD_SW_S32(vec); 1550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return HADD_SW_S32(var); 1552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t sub_pixel_avg_sse_diff_16width_hv_msa( 15557bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 15567bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, 15577bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *filter_vert, int32_t height, int32_t *diff) { 1558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, 1559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred, filter_horiz, filter_vert, 1560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian height, diff, 16); 1561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t sub_pixel_avg_sse_diff_32width_hv_msa( 15647bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 15657bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, 15667bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *filter_vert, int32_t height, int32_t *diff) { 1567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt, sse = 0; 1568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t diff0[2]; 1569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1570da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { 1571da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, 1572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred, filter_horiz, filter_vert, 1573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian height, &diff0[loop_cnt], 32); 1574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 1575da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 1576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 1577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = diff0[0] + diff0[1]; 1580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return sse; 1582da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( 15857bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, const uint8_t *dst, 15867bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, 15877bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *filter_vert, int32_t height, int32_t *diff) { 1588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt, sse = 0; 1589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t diff0[4]; 1590da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1591da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { 1592da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, 1593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred, filter_horiz, filter_vert, 1594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian height, &diff0[loop_cnt], 64); 1595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 1596da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 1597da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred += 16; 1598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; 1601da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1602da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return sse; 1603da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); 1606da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); 1607da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); 1608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); 1609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); 1610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); 1611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); 1612da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1613da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); 1614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); 1615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); 1616da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); 1617da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); 1618da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); 1619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 16207bc9febe8749e98a3812a0dc4380ceae75c29450Johann#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ 16217bc9febe8749e98a3812a0dc4380ceae75c29450Johann uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa( \ 16227bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, int32_t xoffset, \ 16237bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \ 16247bc9febe8749e98a3812a0dc4380ceae75c29450Johann uint32_t *sse) { \ 16257bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t diff; \ 16267bc9febe8749e98a3812a0dc4380ceae75c29450Johann uint32_t var; \ 16277bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ 16287bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ 16297bc9febe8749e98a3812a0dc4380ceae75c29450Johann \ 16307bc9febe8749e98a3812a0dc4380ceae75c29450Johann if (yoffset) { \ 16317bc9febe8749e98a3812a0dc4380ceae75c29450Johann if (xoffset) { \ 16327bc9febe8749e98a3812a0dc4380ceae75c29450Johann *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \ 16337bc9febe8749e98a3812a0dc4380ceae75c29450Johann src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ 16347bc9febe8749e98a3812a0dc4380ceae75c29450Johann } else { \ 16357bc9febe8749e98a3812a0dc4380ceae75c29450Johann *sse = sub_pixel_sse_diff_##wd##width_v_msa( \ 16367bc9febe8749e98a3812a0dc4380ceae75c29450Johann src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ 16377bc9febe8749e98a3812a0dc4380ceae75c29450Johann } \ 16387bc9febe8749e98a3812a0dc4380ceae75c29450Johann \ 16397bc9febe8749e98a3812a0dc4380ceae75c29450Johann var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ 16407bc9febe8749e98a3812a0dc4380ceae75c29450Johann } else { \ 16417bc9febe8749e98a3812a0dc4380ceae75c29450Johann if (xoffset) { \ 16427bc9febe8749e98a3812a0dc4380ceae75c29450Johann *sse = sub_pixel_sse_diff_##wd##width_h_msa( \ 16437bc9febe8749e98a3812a0dc4380ceae75c29450Johann src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ 16447bc9febe8749e98a3812a0dc4380ceae75c29450Johann \ 16457bc9febe8749e98a3812a0dc4380ceae75c29450Johann var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ 16467bc9febe8749e98a3812a0dc4380ceae75c29450Johann } else { \ 16477bc9febe8749e98a3812a0dc4380ceae75c29450Johann var = vpx_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \ 16487bc9febe8749e98a3812a0dc4380ceae75c29450Johann sse); \ 16497bc9febe8749e98a3812a0dc4380ceae75c29450Johann } \ 16507bc9febe8749e98a3812a0dc4380ceae75c29450Johann } \ 16517bc9febe8749e98a3812a0dc4380ceae75c29450Johann \ 16527bc9febe8749e98a3812a0dc4380ceae75c29450Johann return var; \ 16537bc9febe8749e98a3812a0dc4380ceae75c29450Johann } 1654da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1655da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4); 1656da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8); 1657da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1658da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4); 1659da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8); 1660da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16); 1661da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1662da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8); 1663da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16); 1664da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32); 1665da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1666da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16); 1667da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32); 1668da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64); 1669da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1670da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32); 1671da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64); 1672da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1673da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ 16747bc9febe8749e98a3812a0dc4380ceae75c29450Johann uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \ 16757bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ 16767bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ 16777bc9febe8749e98a3812a0dc4380ceae75c29450Johann uint32_t *sse, const uint8_t *sec_pred) { \ 16787bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t diff; \ 16797bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ 16807bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ 1681da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 16827bc9febe8749e98a3812a0dc4380ceae75c29450Johann if (yoffset) { \ 16837bc9febe8749e98a3812a0dc4380ceae75c29450Johann if (xoffset) { \ 16847bc9febe8749e98a3812a0dc4380ceae75c29450Johann *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \ 16857bc9febe8749e98a3812a0dc4380ceae75c29450Johann src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ 16867bc9febe8749e98a3812a0dc4380ceae75c29450Johann v_filter, ht, &diff); \ 16877bc9febe8749e98a3812a0dc4380ceae75c29450Johann } else { \ 16887bc9febe8749e98a3812a0dc4380ceae75c29450Johann *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \ 16897bc9febe8749e98a3812a0dc4380ceae75c29450Johann src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ 16907bc9febe8749e98a3812a0dc4380ceae75c29450Johann &diff); \ 16917bc9febe8749e98a3812a0dc4380ceae75c29450Johann } \ 1692da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { \ 16937bc9febe8749e98a3812a0dc4380ceae75c29450Johann if (xoffset) { \ 16947bc9febe8749e98a3812a0dc4380ceae75c29450Johann *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \ 16957bc9febe8749e98a3812a0dc4380ceae75c29450Johann src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ 16967bc9febe8749e98a3812a0dc4380ceae75c29450Johann &diff); \ 16977bc9febe8749e98a3812a0dc4380ceae75c29450Johann } else { \ 16987bc9febe8749e98a3812a0dc4380ceae75c29450Johann *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \ 16997bc9febe8749e98a3812a0dc4380ceae75c29450Johann ref_stride, sec_pred, ht, &diff); \ 17007bc9febe8749e98a3812a0dc4380ceae75c29450Johann } \ 1701da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } \ 1702da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian \ 17037bc9febe8749e98a3812a0dc4380ceae75c29450Johann return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ 17047bc9febe8749e98a3812a0dc4380ceae75c29450Johann } 1705da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1706da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4); 1707da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8); 1708da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4); 1710da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8); 1711da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16); 1712da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1713da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8); 1714da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16); 1715da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32); 1716da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1717da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16); 1718da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32); 1719da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1720da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianuint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, 1721da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t src_stride, 17227bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t xoffset, int32_t yoffset, 1723da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const uint8_t *ref_ptr, 17247bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t ref_stride, uint32_t *sse, 1725da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const uint8_t *sec_pred) { 1726da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t diff; 1727da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const uint8_t *h_filter = bilinear_filters_msa[xoffset]; 1728da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian const uint8_t *v_filter = bilinear_filters_msa[yoffset]; 1729da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1730da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (yoffset) { 1731da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (xoffset) { 17327bc9febe8749e98a3812a0dc4380ceae75c29450Johann *sse = sub_pixel_avg_sse_diff_32width_hv_msa( 17337bc9febe8749e98a3812a0dc4380ceae75c29450Johann src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, 17347bc9febe8749e98a3812a0dc4380ceae75c29450Johann v_filter, 64, &diff); 1735da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 17367bc9febe8749e98a3812a0dc4380ceae75c29450Johann *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr, 17377bc9febe8749e98a3812a0dc4380ceae75c29450Johann ref_stride, sec_pred, 17387bc9febe8749e98a3812a0dc4380ceae75c29450Johann v_filter, 64, &diff); 1739da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1740da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 1741da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (xoffset) { 17427bc9febe8749e98a3812a0dc4380ceae75c29450Johann *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr, 17437bc9febe8749e98a3812a0dc4380ceae75c29450Johann ref_stride, sec_pred, 17447bc9febe8749e98a3812a0dc4380ceae75c29450Johann h_filter, 64, &diff); 1745da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 1746da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride, 1747da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian sec_pred, &diff); 1748da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1749da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 1750da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1751da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian return VARIANCE_32Wx64H(*sse, diff); 1752da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 1753da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 17547bc9febe8749e98a3812a0dc4380ceae75c29450Johann#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ 17557bc9febe8749e98a3812a0dc4380ceae75c29450Johann uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa( \ 17567bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ 17577bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ 17587bc9febe8749e98a3812a0dc4380ceae75c29450Johann uint32_t *sse, const uint8_t *sec_pred) { \ 17597bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t diff; \ 17607bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ 17617bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ 17627bc9febe8749e98a3812a0dc4380ceae75c29450Johann \ 17637bc9febe8749e98a3812a0dc4380ceae75c29450Johann if (yoffset) { \ 17647bc9febe8749e98a3812a0dc4380ceae75c29450Johann if (xoffset) { \ 17657bc9febe8749e98a3812a0dc4380ceae75c29450Johann *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \ 17667bc9febe8749e98a3812a0dc4380ceae75c29450Johann src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ 17677bc9febe8749e98a3812a0dc4380ceae75c29450Johann v_filter, ht, &diff); \ 17687bc9febe8749e98a3812a0dc4380ceae75c29450Johann } else { \ 17697bc9febe8749e98a3812a0dc4380ceae75c29450Johann *sse = sub_pixel_avg_sse_diff_64width_v_msa( \ 17707bc9febe8749e98a3812a0dc4380ceae75c29450Johann src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ 17717bc9febe8749e98a3812a0dc4380ceae75c29450Johann &diff); \ 17727bc9febe8749e98a3812a0dc4380ceae75c29450Johann } \ 17737bc9febe8749e98a3812a0dc4380ceae75c29450Johann } else { \ 17747bc9febe8749e98a3812a0dc4380ceae75c29450Johann if (xoffset) { \ 17757bc9febe8749e98a3812a0dc4380ceae75c29450Johann *sse = sub_pixel_avg_sse_diff_64width_h_msa( \ 17767bc9febe8749e98a3812a0dc4380ceae75c29450Johann src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ 17777bc9febe8749e98a3812a0dc4380ceae75c29450Johann &diff); \ 17787bc9febe8749e98a3812a0dc4380ceae75c29450Johann } else { \ 17797bc9febe8749e98a3812a0dc4380ceae75c29450Johann *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \ 17807bc9febe8749e98a3812a0dc4380ceae75c29450Johann ref_stride, sec_pred, &diff); \ 17817bc9febe8749e98a3812a0dc4380ceae75c29450Johann } \ 17827bc9febe8749e98a3812a0dc4380ceae75c29450Johann } \ 17837bc9febe8749e98a3812a0dc4380ceae75c29450Johann \ 17847bc9febe8749e98a3812a0dc4380ceae75c29450Johann return VARIANCE_64Wx##ht##H(*sse, diff); \ 17857bc9febe8749e98a3812a0dc4380ceae75c29450Johann } 1786da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1787da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32); 1788da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64); 1789