/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h"
12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_ports/mem.h"
13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/mips/macros_msa.h"
14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/variance.h"
15da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Two-tap filter coefficient pairs, one pair per table row (8 rows).
 * The first tap falls from 128 in steps of 16 while the second rises from 0,
 * so every pair sums to 128. */
static const uint8_t bilinear_filters_msa[8][2] = {
  { 128, 0 },
  { 112, 16 },
  { 96, 32 },
  { 80, 48 },
  { 64, 64 },
  { 48, 80 },
  { 32, 96 },
  { 16, 112 },
};
20da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Folds one vector of 16 src/ref byte pairs into two running accumulators.
 *
 *   src, ref : v16u8 inputs.
 *   var      : v4i32 accumulator, updated by DPADD_SH2_SW with each difference
 *              vector multiplied by itself (presumably the running sum of
 *              squared differences - helper is defined in macros_msa.h).
 *   sub      : v8i16 accumulator, incremented by both halfword difference
 *              vectors (the running sum of differences).
 *
 * ILVRL_B2_UB / HSUB_UB2_SH presumably interleave the two inputs and then
 * subtract horizontally to produce the per-pixel differences as signed
 * halfwords - confirm against macros_msa.h.
 * Expands to a brace block; invoke it as a statement. */
#define CALC_MSE_AVG_B(src, ref, var, sub)                              \
  {                                                                     \
    v16u8 ilv_r_m, ilv_l_m;                                             \
    v8i16 dif_r_m, dif_l_m;                                             \
                                                                        \
    ILVRL_B2_UB(src, ref, ilv_r_m, ilv_l_m);                            \
    HSUB_UB2_SH(ilv_r_m, ilv_l_m, dif_r_m, dif_l_m);                    \
    DPADD_SH2_SW(dif_r_m, dif_l_m, dif_r_m, dif_l_m, var, var);         \
                                                                        \
    sub += dif_r_m + dif_l_m;                                           \
  }
32da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Variance from a sum of squared errors and a sum of differences:
 *   sse - (diff * diff) / 2^shift
 * The square is computed in 32-bit unsigned arithmetic, so this form is only
 * suitable when diff * diff fits in 32 bits (see VARIANCE_LARGE_WxH for the
 * 64-bit form). shift is presumably log2 of the block's pixel count.
 * Every argument and the whole expansion are parenthesized so the macro is
 * safe with expression arguments and inside larger expressions. */
#define VARIANCE_WxH(sse, diff, shift) \
  ((sse) - (((uint32_t)(diff) * (diff)) >> (shift)))
34da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Same formula as VARIANCE_WxH, sse - (diff * diff) / 2^shift, but the square
 * is computed in 64-bit signed arithmetic so that large difference sums do
 * not overflow 32 bits. The result therefore has 64-bit type.
 * Every argument and the whole expansion are parenthesized so the macro is
 * safe with expression arguments and inside larger expressions. */
#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  ((sse) - (((int64_t)(diff) * (diff)) >> (shift)))
37da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                        int32_t src_stride,
40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                        const uint8_t *ref_ptr,
41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                        int32_t ref_stride,
427bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                        const uint8_t *sec_pred, int32_t height,
43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                        int32_t *diff) {
44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int32_t ht_cnt;
45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t src0, src1, src2, src3;
46da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t ref0, ref1, ref2, ref3;
47da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 pred, src = { 0 };
48da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 ref = { 0 };
49da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
50da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
51da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (ht_cnt = (height >> 2); ht_cnt--;) {
53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred = LD_UB(sec_pred);
54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 16;
55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LW4(src_ptr, src_stride, src0, src1, src2, src3);
56da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += (4 * src_stride);
57da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += (4 * ref_stride);
59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    INSERT_W4_UB(src0, src1, src2, src3, src);
61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
63da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src = __msa_aver_u_b(src, pred);
64da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src, ref, var, avg);
65da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
66da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
71da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
72da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
73da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                        int32_t src_stride,
75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                        const uint8_t *ref_ptr,
76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                        int32_t ref_stride,
777bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                        const uint8_t *sec_pred, int32_t height,
78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                        int32_t *diff) {
79da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int32_t ht_cnt;
80da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3;
81da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 ref0, ref1, ref2, ref3;
82da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 pred0, pred1;
83da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (ht_cnt = (height >> 2); ht_cnt--;) {
87da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(sec_pred, 16, pred0, pred1);
88da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 32;
89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += (4 * src_stride);
91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
92da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += (4 * ref_stride);
93da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
947bc9febe8749e98a3812a0dc4380ceae75c29450Johann    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
957bc9febe8749e98a3812a0dc4380ceae75c29450Johann                ref0, ref1);
96da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
97da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg);
98da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg);
99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                         int32_t src_stride,
109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                         const uint8_t *ref_ptr,
110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                         int32_t ref_stride,
111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                         const uint8_t *sec_pred,
1127bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                         int32_t height, int32_t *diff) {
113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int32_t ht_cnt;
114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src, ref, pred;
115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (ht_cnt = (height >> 2); ht_cnt--;) {
119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred = LD_UB(sec_pred);
120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 16;
121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src = LD_UB(src_ptr);
122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref = LD_UB(ref_ptr);
124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src = __msa_aver_u_b(src, pred);
126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src, ref, var, avg);
127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred = LD_UB(sec_pred);
129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 16;
130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src = LD_UB(src_ptr);
131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref = LD_UB(ref_ptr);
133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src = __msa_aver_u_b(src, pred);
135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src, ref, var, avg);
136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred = LD_UB(sec_pred);
138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 16;
139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src = LD_UB(src_ptr);
140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref = LD_UB(ref_ptr);
142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src = __msa_aver_u_b(src, pred);
144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src, ref, var, avg);
145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred = LD_UB(sec_pred);
147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 16;
148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src = LD_UB(src_ptr);
149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref = LD_UB(ref_ptr);
151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src = __msa_aver_u_b(src, pred);
153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src, ref, var, avg);
154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                         int32_t src_stride,
164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                         const uint8_t *ref_ptr,
165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                         int32_t ref_stride,
166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                         const uint8_t *sec_pred,
1677bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                         int32_t height, int32_t *diff) {
168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int32_t ht_cnt;
169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, ref0, ref1, pred0, pred1;
170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (ht_cnt = (height >> 2); ht_cnt--;) {
174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(sec_pred, 16, pred0, pred1);
175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 32;
176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(src_ptr, 16, src0, src1);
177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(ref_ptr, 16, ref0, ref1);
179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg);
182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg);
183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(sec_pred, 16, pred0, pred1);
185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 32;
186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(src_ptr, 16, src0, src1);
187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(ref_ptr, 16, ref0, ref1);
189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg);
192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg);
193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(sec_pred, 16, pred0, pred1);
195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 32;
196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(src_ptr, 16, src0, src1);
197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(ref_ptr, 16, ref0, ref1);
199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg);
202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg);
203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(sec_pred, 16, pred0, pred1);
205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 32;
206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(src_ptr, 16, src0, src1);
207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(ref_ptr, 16, ref0, ref1);
209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg);
212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg);
213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                       int32_t src_stride,
223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                       const uint8_t *ref_ptr,
224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                       int32_t ref_stride,
2257bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                       const uint8_t *sec_pred, int32_t *diff) {
226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int32_t ht_cnt;
227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, ref0, ref1, pred0, pred1;
228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg0 = { 0 };
229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg1 = { 0 };
230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (ht_cnt = 16; ht_cnt--;) {
233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(sec_pred, 16, pred0, pred1);
234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 32;
235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(src_ptr, 16, src0, src1);
236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(ref_ptr, 16, ref0, ref1);
238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg0);
241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg1);
242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(sec_pred, 16, pred0, pred1);
244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 32;
245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(src_ptr, 16, src0, src1);
246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(ref_ptr, 16, ref0, ref1);
248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg0);
251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg1);
252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(sec_pred, 16, pred0, pred1);
254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 32;
255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(src_ptr, 16, src0, src1);
256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(ref_ptr, 16, ref0, ref1);
258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg0);
261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg1);
262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(sec_pred, 16, pred0, pred1);
264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 32;
265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(src_ptr, 16, src0, src1);
266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(ref_ptr, 16, ref0, ref1);
268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg0);
271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg1);
272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg0, avg0);
275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec += __msa_hadd_s_w(avg1, avg1);
276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                       int32_t src_stride,
283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                       const uint8_t *ref_ptr,
284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                       int32_t ref_stride,
2857bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                       const uint8_t *sec_pred, int32_t *diff) {
286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int32_t ht_cnt;
287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3;
288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 ref0, ref1, ref2, ref3;
289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 pred0, pred1, pred2, pred3;
290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg0 = { 0 };
291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg1 = { 0 };
292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (ht_cnt = 16; ht_cnt--;) {
295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 64;
297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
298da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
299da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
3017bc9febe8749e98a3812a0dc4380ceae75c29450Johann    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
3027bc9febe8749e98a3812a0dc4380ceae75c29450Johann                src2, src3);
303da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg0);
304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src2, ref2, var, avg0);
305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg1);
306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src3, ref3, var, avg1);
307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
308da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
309da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 64;
310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
311da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
3147bc9febe8749e98a3812a0dc4380ceae75c29450Johann    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
3157bc9febe8749e98a3812a0dc4380ceae75c29450Johann                src2, src3);
316da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg0);
317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src2, ref2, var, avg0);
318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg1);
319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src3, ref3, var, avg1);
320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg0, avg0);
323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec += __msa_hadd_s_w(avg1, avg1);
324da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
325da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
326da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
328da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                       int32_t src_stride,
332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                       const uint8_t *ref_ptr,
333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                       int32_t ref_stride,
3347bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                       const uint8_t *sec_pred, int32_t *diff) {
335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int32_t ht_cnt;
336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3;
337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 ref0, ref1, ref2, ref3;
338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 pred0, pred1, pred2, pred3;
339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg0 = { 0 };
340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg1 = { 0 };
341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg2 = { 0 };
342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg3 = { 0 };
343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
345da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (ht_cnt = 32; ht_cnt--;) {
346da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 64;
348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
3527bc9febe8749e98a3812a0dc4380ceae75c29450Johann    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
3537bc9febe8749e98a3812a0dc4380ceae75c29450Johann                src2, src3);
354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg0);
355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg1);
356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src2, ref2, var, avg2);
357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src3, ref3, var, avg3);
358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 64;
361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += src_stride;
363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ref_ptr += ref_stride;
3657bc9febe8749e98a3812a0dc4380ceae75c29450Johann    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
3667bc9febe8749e98a3812a0dc4380ceae75c29450Johann                src2, src3);
367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg0);
368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg1);
369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src2, ref2, var, avg2);
370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src3, ref3, var, avg3);
371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg0, avg0);
374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec += __msa_hadd_s_w(avg1, avg1);
375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec += __msa_hadd_s_w(avg2, avg2);
376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec += __msa_hadd_s_w(avg3, avg3);
377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3827bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_4width_h_msa(
3837bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
3847bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t ref0, ref1, ref2, ref3;
388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt0, ref = { 0 };
389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3;
390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 vec0, vec1, vec2, vec3;
392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter);
396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_fill_h(filtval);
397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src1, src2, src3);
400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
4067bc9febe8749e98a3812a0dc4380ceae75c29450Johann    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
4077bc9febe8749e98a3812a0dc4380ceae75c29450Johann                vec2, vec3);
408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
4097bc9febe8749e98a3812a0dc4380ceae75c29450Johann    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
4107bc9febe8749e98a3812a0dc4380ceae75c29450Johann                src2, src3);
411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref, var, avg);
414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
4227bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_8width_h_msa(
4237bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
4247bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt0, out, ref0, ref1, ref2, ref3;
428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3;
429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 vec0, vec1, vec2, vec3;
431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter);
435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_fill_h(filtval);
436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src1, src2, src3);
439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
4467bc9febe8749e98a3812a0dc4380ceae75c29450Johann    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
4477bc9febe8749e98a3812a0dc4380ceae75c29450Johann                vec2, vec3);
448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
4497bc9febe8749e98a3812a0dc4380ceae75c29450Johann    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
4507bc9febe8749e98a3812a0dc4380ceae75c29450Johann                src2, src3);
451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out, ref0, var, avg);
453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out, ref1, var, avg);
455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
4637bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_16width_h_msa(
4647bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
4657bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 dst0, dst1, dst2, dst3, filt0;
471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter);
477da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_fill_h(filtval);
478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src2, src4, src6);
481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
4907bc9febe8749e98a3812a0dc4380ceae75c29450Johann    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
4917bc9febe8749e98a3812a0dc4380ceae75c29450Johann                out2, out3);
4927bc9febe8749e98a3812a0dc4380ceae75c29450Johann    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
4937bc9febe8749e98a3812a0dc4380ceae75c29450Johann                out6, out7);
494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
4967bc9febe8749e98a3812a0dc4380ceae75c29450Johann    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
4977bc9febe8749e98a3812a0dc4380ceae75c29450Johann                src2, src3);
498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, dst0, var, avg);
499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, dst1, var, avg);
500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src2, dst2, var, avg);
501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src3, dst3, var, avg);
502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* 32-pixel-wide horizontal variant: runs the 16-wide kernel on the two
 * adjacent 16-column strips and adds up their results.
 *
 * diff receives the sum of the two strip difference totals; the return value
 * is the sum of the two strip return values.
 */
static uint32_t sub_pixel_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t sse_total = 0;
  int32_t diff_total = 0;
  int32_t col;

  for (col = 0; col < 32; col += 16) {
    int32_t strip_diff;

    sse_total += sub_pixel_sse_diff_16width_h_msa(
        src + col, src_stride, dst + col, dst_stride, filter, height,
        &strip_diff);
    diff_total += strip_diff;
  }

  *diff = diff_total;

  return sse_total;
}
527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* 64-pixel-wide horizontal variant: runs the 16-wide kernel on the four
 * adjacent 16-column strips and adds up their results.
 *
 * diff receives the sum of the four strip difference totals; the return value
 * is the sum of the four strip return values.
 */
static uint32_t sub_pixel_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t sse_total = 0;
  int32_t diff_total = 0;
  int32_t col;

  for (col = 0; col < 64; col += 16) {
    int32_t strip_diff;

    sse_total += sub_pixel_sse_diff_16width_h_msa(
        src + col, src_stride, dst + col, dst_stride, filter, height,
        &strip_diff);
    diff_total += strip_diff;
  }

  *diff = diff_total;

  return sse_total;
}
545da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
5467bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_4width_v_msa(
5477bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
5487bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t ref0, ref1, ref2, ref3;
552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3, src4, out;
553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src10_r, src32_r, src21_r, src43_r;
554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 ref = { 0 };
555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src2110, src4332;
556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt0;
557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 tmp0, tmp1;
560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter);
562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_fill_h(filtval);
563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src0 = LD_UB(src);
565da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
566da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src, src_stride, src1, src2, src3, src4);
569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
570da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
571da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
5747bc9febe8749e98a3812a0dc4380ceae75c29450Johann    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
5757bc9febe8749e98a3812a0dc4380ceae75c29450Johann               src32_r, src43_r);
576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out, ref, var, avg);
581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src0 = src4;
582da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
586da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
587da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
5907bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_8width_v_msa(
5917bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
5927bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3, src4;
596da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 ref0, ref1, ref2, ref3;
597da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 vec0, vec1, vec2, vec3;
598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 tmp0, tmp1, tmp2, tmp3;
599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt0;
600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
601da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
602da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
603da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter);
604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_fill_h(filtval);
605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
606da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src0 = LD_UB(src);
607da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src, src_stride, src1, src2, src3, src4);
611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
612da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
613da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
6167bc9febe8749e98a3812a0dc4380ceae75c29450Johann    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
6177bc9febe8749e98a3812a0dc4380ceae75c29450Johann               vec3);
6187bc9febe8749e98a3812a0dc4380ceae75c29450Johann    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
6197bc9febe8749e98a3812a0dc4380ceae75c29450Johann                tmp2, tmp3);
620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
621da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
622da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg);
623da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg);
624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src0 = src4;
625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
626da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
627da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
628da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
629da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
632da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
6337bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_16width_v_msa(
6347bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
6357bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
636da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
637da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
638da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 ref0, ref1, ref2, ref3;
639da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3, src4;
640da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 out0, out1, out2, out3;
641da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
642da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 tmp0, tmp1, tmp2, tmp3;
643da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt0;
644da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
645da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
646da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
647da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter);
648da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_fill_h(filtval);
649da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
650da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src0 = LD_UB(src);
651da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
652da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
653da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
654da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src, src_stride, src1, src2, src3, src4);
655da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
656da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
657da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
658da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
659da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
660da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
661da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
662da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
663da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
664da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
665da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
666da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
667da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
668da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
669da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
670da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
671da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
672da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
673da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
674da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
675da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
676da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
677da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
678da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src0 = src4;
679da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
680da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out0, ref0, var, avg);
681da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out1, ref1, var, avg);
682da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out2, ref2, var, avg);
683da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out3, ref3, var, avg);
684da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
685da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
686da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
687da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
688da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
689da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
690da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
691da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// 32-byte-wide vertical variant: runs sub_pixel_sse_diff_16width_v_msa on two
// adjacent 16-byte column strips. Returns the sum of the strips' return
// values and stores the sum of their diff outputs in *diff.
static uint32_t sub_pixel_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t sse_total = 0;
  int32_t diff_total = 0;
  int32_t col;

  for (col = 0; col < 32; col += 16) {
    int32_t strip_diff;

    sse_total += sub_pixel_sse_diff_16width_v_msa(
        src + col, src_stride, dst + col, dst_stride, filter, height,
        &strip_diff);
    diff_total += strip_diff;
  }

  *diff = diff_total;

  return sse_total;
}
709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// 64-byte-wide vertical variant: runs sub_pixel_sse_diff_16width_v_msa on
// four adjacent 16-byte column strips. Returns the sum of the strips' return
// values and stores the sum of their diff outputs in *diff.
static uint32_t sub_pixel_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t sse_total = 0;
  int32_t diff_total = 0;
  int32_t col;

  for (col = 0; col < 64; col += 16) {
    int32_t strip_diff;

    sse_total += sub_pixel_sse_diff_16width_v_msa(
        src + col, src_stride, dst + col, dst_stride, filter, height,
        &strip_diff);
    diff_total += strip_diff;
  }

  *diff = diff_total;

  return sse_total;
}
727da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
7287bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_4width_hv_msa(
7297bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
7307bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
7317bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t height, int32_t *diff) {
732da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
733da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
734da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t ref0, ref1, ref2, ref3;
735da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3, src4;
736da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 out, ref = { 0 };
737da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt_vt, filt_hz, vec0, vec1;
738da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
739da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
740da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 tmp0, tmp1;
741da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
742da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
743da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
744da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter_horiz);
745da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_fill_h(filtval);
746da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter_vert);
747da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_fill_h(filtval);
748da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
749da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src0 = LD_UB(src);
750da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
751da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
752da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
753da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src, src_stride, src1, src2, src3, src4);
754da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
755da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
756da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
757da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
758da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
759da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
760da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
761da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
762da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
763da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
764da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
765da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
766da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
767da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out, ref, var, avg);
768da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src0 = src4;
769da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
770da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
771da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
772da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
773da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
774da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
775da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
776da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
7777bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_8width_hv_msa(
7787bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
7797bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
7807bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t height, int32_t *diff) {
781da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
782da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
783da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 ref0, ref1, ref2, ref3;
784da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3, src4;
785da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 out0, out1;
786da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
787da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out0, hz_out1;
788da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 tmp0, tmp1, tmp2, tmp3;
789da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt_vt, filt_hz, vec0;
790da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
791da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
792da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
793da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter_horiz);
794da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_fill_h(filtval);
795da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter_vert);
796da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_fill_h(filtval);
797da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
798da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src0 = LD_UB(src);
799da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
800da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
801da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
802da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
803da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src, src_stride, src1, src2, src3, src4);
804da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
805da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
806da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
807da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
808da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
809da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
810da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
811da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
812da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
813da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
814da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
815da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
816da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
817da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
818da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
819da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
820da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
821da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
822da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
823da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
824da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out0, ref0, var, avg);
825da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out1, ref1, var, avg);
826da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
827da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
828da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
829da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
830da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
831da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
832da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
833da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
8347bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_sse_diff_16width_hv_msa(
8357bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
8367bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
8377bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t height, int32_t *diff) {
838da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
839da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
840da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
841da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 ref0, ref1, ref2, ref3;
842da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt_hz, filt_vt, vec0, vec1;
843da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
844da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
845da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 tmp0, tmp1;
846da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
847da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
848da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
849da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter_horiz);
850da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_fill_h(filtval);
851da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter_vert);
852da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_fill_h(filtval);
853da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
854da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_UB2(src, 8, src0, src1);
855da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
856da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
857da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
858da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
859da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
860da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
861da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src, src_stride, src0, src2, src4, src6);
862da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
863da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
864da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
865da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
866da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
867da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
868da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
869da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
870da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
871da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
872da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
873da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
874da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
875da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
876da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
877da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
878da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
879da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
880da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
881da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
882da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
883da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
884da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
885da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
886da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
887da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
888da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
889da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
890da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
891da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
892da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
893da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
894da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
895da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg);
896da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg);
897da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src2, ref2, var, avg);
898da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src3, ref3, var, avg);
899da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
900da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
901da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
902da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
903da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
904da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
905da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
906da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// 32-byte-wide horizontal+vertical variant: runs
// sub_pixel_sse_diff_16width_hv_msa on two adjacent 16-byte column strips.
// Returns the sum of the strips' return values and stores the sum of their
// diff outputs in *diff.
static uint32_t sub_pixel_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t sse_total = 0;
  int32_t diff_total = 0;
  int32_t col;

  for (col = 0; col < 32; col += 16) {
    int32_t strip_diff;

    sse_total += sub_pixel_sse_diff_16width_hv_msa(
        src + col, src_stride, dst + col, dst_stride, filter_horiz,
        filter_vert, height, &strip_diff);
    diff_total += strip_diff;
  }

  *diff = diff_total;

  return sse_total;
}
926da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// 64-byte-wide horizontal+vertical variant: runs
// sub_pixel_sse_diff_16width_hv_msa on four adjacent 16-byte column strips.
// Returns the sum of the strips' return values and stores the sum of their
// diff outputs in *diff.
static uint32_t sub_pixel_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t sse_total = 0;
  int32_t diff_total = 0;
  int32_t col;

  for (col = 0; col < 64; col += 16) {
    int32_t strip_diff;

    sse_total += sub_pixel_sse_diff_16width_hv_msa(
        src + col, src_stride, dst + col, dst_stride, filter_horiz,
        filter_vert, height, &strip_diff);
    diff_total += strip_diff;
  }

  *diff = diff_total;

  return sse_total;
}
946da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
9477bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
9487bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
9497bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
9507bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t height, int32_t *diff) {
951da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
952da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
953da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t ref0, ref1, ref2, ref3;
954da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 out, pred, filt0, ref = { 0 };
955da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3;
956da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
957da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 vec0, vec1, vec2, vec3;
958da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
959da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
960da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
961da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter);
962da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_fill_h(filtval);
963da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
964da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
965da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src1, src2, src3);
966da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
967da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred = LD_UB(sec_pred);
968da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 16;
969da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
970da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
971da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
972da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
973da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
974da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
9757bc9febe8749e98a3812a0dc4380ceae75c29450Johann    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
9767bc9febe8749e98a3812a0dc4380ceae75c29450Johann                vec2, vec3);
977da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
9787bc9febe8749e98a3812a0dc4380ceae75c29450Johann    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
9797bc9febe8749e98a3812a0dc4380ceae75c29450Johann                src2, src3);
980da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
981da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
982da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out = __msa_aver_u_b(out, pred);
983da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out, ref, var, avg);
984da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
985da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
986da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
987da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
988da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
989da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
990da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
991da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
9927bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
9937bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
9947bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
9957bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t height, int32_t *diff) {
996da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
997da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
998da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 out, pred, filt0;
999da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 ref0, ref1, ref2, ref3;
1000da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3;
1001da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1002da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 vec0, vec1, vec2, vec3;
1003da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
1004da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
1005da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1006da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter);
1007da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_fill_h(filtval);
1008da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1009da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
1010da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src1, src2, src3);
1011da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
1012da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1013da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
1014da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1015da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
1016da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1017da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
10187bc9febe8749e98a3812a0dc4380ceae75c29450Johann    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
10197bc9febe8749e98a3812a0dc4380ceae75c29450Johann                vec2, vec3);
1020da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
10217bc9febe8749e98a3812a0dc4380ceae75c29450Johann    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
10227bc9febe8749e98a3812a0dc4380ceae75c29450Johann                src2, src3);
1023da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
1024da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1025da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred = LD_UB(sec_pred);
1026da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 16;
1027da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out = __msa_aver_u_b(out, pred);
1028da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out, ref0, var, avg);
1029da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
1030da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred = LD_UB(sec_pred);
1031da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 16;
1032da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out = __msa_aver_u_b(out, pred);
1033da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out, ref1, var, avg);
1034da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
1035da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1036da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
1037da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
1038da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1039da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
1040da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
1041da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
10427bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t subpel_avg_ssediff_16w_h_msa(
10437bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
10447bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
10457bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t height, int32_t *diff, int32_t width) {
1046da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
1047da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
1048da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1049da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1050da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 dst0, dst1, dst2, dst3;
1051da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 tmp0, tmp1, tmp2, tmp3;
1052da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 pred0, pred1, pred2, pred3, filt0;
1053da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1054da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
1055da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
1056da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
1057da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1058da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter);
1059da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_fill_h(filtval);
1060da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1061da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
1062da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src2, src4, src6);
1063da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1064da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
1065da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1066da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
1067da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
1068da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += (4 * width);
1069da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1070da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1071da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1072da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
1073da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
10747bc9febe8749e98a3812a0dc4380ceae75c29450Johann    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
10757bc9febe8749e98a3812a0dc4380ceae75c29450Johann                out2, out3);
10767bc9febe8749e98a3812a0dc4380ceae75c29450Johann    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
10777bc9febe8749e98a3812a0dc4380ceae75c29450Johann                out6, out7);
1078da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
1079da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
10807bc9febe8749e98a3812a0dc4380ceae75c29450Johann    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
10817bc9febe8749e98a3812a0dc4380ceae75c29450Johann                tmp2, tmp3);
10827bc9febe8749e98a3812a0dc4380ceae75c29450Johann    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
10837bc9febe8749e98a3812a0dc4380ceae75c29450Johann                tmp2, tmp3);
1084da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1085da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
1086da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
1087da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
1088da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
1089da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
1090da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1091da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
1092da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
1093da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1094da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
1095da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
1096da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* 16-pixel-wide entry point: forwards every argument to
 * subpel_avg_ssediff_16w_h_msa with a sec_pred row pitch of 16 bytes and
 * returns its result unchanged (*diff is written by the helper).
 */
static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  const int32_t pred_pitch = 16;

  return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
                                      sec_pred, filter, height, diff,
                                      pred_pitch);
}
1104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* 32-pixel-wide entry point: runs subpel_avg_ssediff_16w_h_msa on the two
 * 16-pixel columns of the block (byte offsets 0 and 16 into src, dst and
 * sec_pred) with a sec_pred row pitch of 32 bytes.
 *
 * Returns the sum of the helper's return values; *diff receives the sum of
 * the per-column diff values.
 */
static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t sse_total = 0;
  int32_t sum_total = 0;
  int32_t col_sum;
  int32_t col;

  for (col = 0; col < 32; col += 16) {
    sse_total += subpel_avg_ssediff_16w_h_msa(
        src + col, src_stride, dst + col, dst_stride, sec_pred + col, filter,
        height, &col_sum, 32);
    sum_total += col_sum;
  }

  *diff = sum_total;

  return sse_total;
}
1125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* 64-pixel-wide entry point: runs subpel_avg_ssediff_16w_h_msa on the four
 * 16-pixel columns of the block (byte offsets 0, 16, 32 and 48 into src, dst
 * and sec_pred) with a sec_pred row pitch of 64 bytes.
 *
 * Returns the sum of the helper's return values; *diff receives the sum of
 * the per-column diff values.
 */
static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t sse_total = 0;
  int32_t sum_total = 0;
  int32_t col_sum;
  int32_t col;

  for (col = 0; col < 64; col += 16) {
    sse_total += subpel_avg_ssediff_16w_h_msa(
        src + col, src_stride, dst + col, dst_stride, sec_pred + col, filter,
        height, &col_sum, 64);
    sum_total += col_sum;
  }

  *diff = sum_total;

  return sse_total;
}
1146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
11477bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
11487bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
11497bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
11507bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t height, int32_t *diff) {
1151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
1152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
1153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t ref0, ref1, ref2, ref3;
1154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3, src4;
1155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src10_r, src32_r, src21_r, src43_r;
1156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 out, pred, ref = { 0 };
1157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src2110, src4332, filt0;
1158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
1159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
1160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 tmp0, tmp1;
1161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter);
1163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_fill_h(filtval);
1164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src0 = LD_UB(src);
1166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
1167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
1169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src, src_stride, src1, src2, src3, src4);
1170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
1171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred = LD_UB(sec_pred);
1172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 16;
1173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
1174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
1175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
11777bc9febe8749e98a3812a0dc4380ceae75c29450Johann    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
11787bc9febe8749e98a3812a0dc4380ceae75c29450Johann               src32_r, src43_r);
1179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
1181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out = __msa_aver_u_b(out, pred);
1185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out, ref, var, avg);
1186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src0 = src4;
1187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
1188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
1190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
1191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
1193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
1194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
11957bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
11967bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
11977bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
11987bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t height, int32_t *diff) {
1199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
1200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
1201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3, src4;
1202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 ref0, ref1, ref2, ref3;
1203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 pred0, pred1, filt0;
1204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 vec0, vec1, vec2, vec3;
1205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 tmp0, tmp1, tmp2, tmp3;
1206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
1207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
1208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter);
1210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_fill_h(filtval);
1211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src0 = LD_UB(src);
1213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
1214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
1216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src, src_stride, src1, src2, src3, src4);
1217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
1218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(sec_pred, 16, pred0, pred1);
1219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 32;
1220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
1222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
12237bc9febe8749e98a3812a0dc4380ceae75c29450Johann    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
12247bc9febe8749e98a3812a0dc4380ceae75c29450Johann               vec3);
12257bc9febe8749e98a3812a0dc4380ceae75c29450Johann    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
12267bc9febe8749e98a3812a0dc4380ceae75c29450Johann                tmp2, tmp3);
1227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
1228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
1229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
1230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src0, ref0, var, avg);
1231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(src1, ref1, var, avg);
1232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src0 = src4;
1234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
1235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
1237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
1238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
1240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
1241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
12427bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t subpel_avg_ssediff_16w_v_msa(
12437bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
12447bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
12457bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t height, int32_t *diff, int32_t width) {
1246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
1247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
1248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 ref0, ref1, ref2, ref3;
1249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 pred0, pred1, pred2, pred3;
1250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3, src4;
1251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 out0, out1, out2, out3, filt0;
1252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 tmp0, tmp1, tmp2, tmp3;
1254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
1255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
1256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter);
1258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_fill_h(filtval);
1259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src0 = LD_UB(src);
1261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
1262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
1264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src, src_stride, src1, src2, src3, src4);
1265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
1266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
1267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += (4 * width);
1268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
1270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
1271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
1272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
1276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
1277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
1278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
1279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
1280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
1282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
1286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
1287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
1288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src0 = src4;
1290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
1292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
12937bc9febe8749e98a3812a0dc4380ceae75c29450Johann    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
12947bc9febe8749e98a3812a0dc4380ceae75c29450Johann                out2, out3);
1295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out0, ref0, var, avg);
1297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out1, ref1, var, avg);
1298da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out2, ref2, var, avg);
1299da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out3, ref3, var, avg);
1300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
1301da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1302da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
1303da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
1304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
1306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
1307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// Sub-pixel average variant for 16-pixel-wide blocks that takes a single
// 2-tap filter (the "_v" in the name suggests the vertical direction).
//
// Thin wrapper: every argument is forwarded unchanged to
// subpel_avg_ssediff_16w_v_msa(), with 16 passed as that helper's trailing
// `width` argument (presumably the row pitch of sec_pred -- confirm against
// the helper's definition).
//
// Returns whatever the helper returns; *diff is written by the helper.
static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
                                      sec_pred, filter, height, diff, 16);
}
1315da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// Sub-pixel average variant for 32-pixel-wide blocks that takes a single
// 2-tap filter (the "_v" in the name suggests the vertical direction).
//
// The block is handled as two 16-pixel-wide column strips. Each strip is
// passed to subpel_avg_ssediff_16w_v_msa() with 32 as the trailing `width`
// argument, and src, dst and sec_pred are all advanced by 16 bytes between
// strips.
//
// Returns the sum of the two per-strip return values. *diff receives the sum
// of the two per-strip diff outputs.
static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];  // one diff output per 16-wide strip

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 32);
    // Step to the next 16-pixel-wide strip in all three buffers.
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}
1336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// Sub-pixel average variant for 64-pixel-wide blocks that takes a single
// 2-tap filter (the "_v" in the name suggests the vertical direction).
//
// The block is handled as four 16-pixel-wide column strips. Each strip is
// passed to subpel_avg_ssediff_16w_v_msa() with 64 as the trailing `width`
// argument, and src, dst and sec_pred are all advanced by 16 bytes between
// strips.
//
// Returns the sum of the four per-strip return values. *diff receives the sum
// of the four per-strip diff outputs.
static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];  // one diff output per 16-wide strip

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 64);
    // Step to the next 16-pixel-wide strip in all three buffers.
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}
1357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
13597bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
13607bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
13617bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
1362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
1363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
1364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t ref0, ref1, ref2, ref3;
1365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3, src4;
1366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt_hz, filt_vt, vec0, vec1;
1368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 out, pred, ref = { 0 };
1369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
1370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
1371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
1372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter_horiz);
1374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_fill_h(filtval);
1375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter_vert);
1376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_fill_h(filtval);
1377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src0 = LD_UB(src);
1379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
1380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
1382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src, src_stride, src1, src2, src3, src4);
1383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
1384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred = LD_UB(sec_pred);
1385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 16;
1386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
1387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
1388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
1389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
1390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
1391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
1392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
1393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
1394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out = __msa_aver_u_b(out, pred);
1399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out, ref, var, avg);
1400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src0 = src4;
1401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
1402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
1404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
1405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
1407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
1408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
14107bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
14117bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
14127bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
1413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
1414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
1415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 ref0, ref1, ref2, ref3;
1416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3, src4;
1417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 pred0, pred1, out0, out1;
1418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt_hz, filt_vt, vec0;
1419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
1421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
1422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
1423da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter_horiz);
1425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_fill_h(filtval);
1426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter_vert);
1427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_fill_h(filtval);
1428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src0 = LD_UB(src);
1430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
1431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
1432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
1434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src, src_stride, src1, src2, src3, src4);
1435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
1436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(sec_pred, 16, pred0, pred1);
1437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += 32;
1438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
1440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
1442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
1443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
1445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
1446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
1447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
1449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
1450da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
1452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
1454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
1455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
1456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
1458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
1459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
1461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
1463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out0, ref0, var, avg);
1465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out1, ref1, var, avg);
1466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
1467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
1469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
1470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
1472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
1473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
14747bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic uint32_t subpel_avg_ssediff_16w_hv_msa(
14757bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
14767bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
14777bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
1478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int16_t filtval;
1479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
1480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 ref0, ref1, ref2, ref3;
1482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 pred0, pred1, pred2, pred3;
1483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 out0, out1, out2, out3;
1484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt_hz, filt_vt, vec0, vec1;
1485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
1487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 avg = { 0 };
1488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v4i32 vec, var = { 0 };
1489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter_horiz);
1491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_fill_h(filtval);
1492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filtval = LH(filter_vert);
1493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_fill_h(filtval);
1494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_UB2(src, 8, src0, src1);
1496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
1497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
1499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
1500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
1502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src, src_stride, src0, src2, src4, src6);
1503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
1504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
1505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
1506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    sec_pred += (4 * width);
1507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
1509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
1510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
1516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
1517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
1518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
1523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
1524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1525da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
1530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
1531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
1532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
1538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
15397bc9febe8749e98a3812a0dc4380ceae75c29450Johann    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
15407bc9febe8749e98a3812a0dc4380ceae75c29450Johann                out2, out3);
1541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out0, ref0, var, avg);
1543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out1, ref1, var, avg);
1544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out2, ref2, var, avg);
1545da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    CALC_MSE_AVG_B(out3, ref3, var, avg);
1546da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
1547da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec = __msa_hadd_s_w(avg, avg);
1549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  *diff = HADD_SW_S32(vec);
1550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return HADD_SW_S32(var);
1552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
1553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// Sub-pixel average variant for 16-pixel-wide blocks using a horizontal and
// a vertical 2-tap filter.
//
// Thin wrapper: every argument is forwarded unchanged to
// subpel_avg_ssediff_16w_hv_msa(), with 16 passed as that helper's trailing
// `width` argument (the row pitch it uses when stepping through sec_pred).
//
// Returns whatever the helper returns; *diff is written by the helper.
static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                       sec_pred, filter_horiz, filter_vert,
                                       height, diff, 16);
}
1562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// Sub-pixel average variant for 32-pixel-wide blocks using a horizontal and
// a vertical 2-tap filter.
//
// The block is handled as two 16-pixel-wide column strips. Each strip is
// passed to subpel_avg_ssediff_16w_hv_msa() with 32 as the trailing `width`
// argument (the sec_pred row pitch used by that helper), and src, dst and
// sec_pred are all advanced by 16 bytes between strips.
//
// Returns the sum of the two per-strip return values. *diff receives the sum
// of the two per-strip diff outputs.
static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];  // one diff output per 16-wide strip

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 32);
    // Step to the next 16-pixel-wide strip in all three buffers.
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}
1583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Averaging sub-pixel SSE/diff for a 64-column block using both the
 * horizontal and the vertical filter.
 *
 * The block is processed as four adjacent 16-column strips, each handed to
 * subpel_avg_ssediff_16w_hv_msa().  The helper's return values are summed
 * into this function's result, and the values it stores through its diff
 * pointer are summed into *diff.
 *
 * NOTE(review): the trailing 64 passed to the helper is presumably the row
 * pitch of sec_pred (the full block width) -- confirm against the helper.
 */
static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t sse_acc = 0;
  int32_t diff_acc = 0;
  int32_t col;

  /* Walk the block left to right, one 16-column strip at a time. */
  for (col = 0; col < 64; col += 16) {
    int32_t strip_diff;

    sse_acc += subpel_avg_ssediff_16w_hv_msa(
        src + col, src_stride, dst + col, dst_stride, sec_pred + col,
        filter_horiz, filter_vert, height, &strip_diff, 64);
    diff_acc += strip_diff;
  }

  *diff = diff_acc;

  return sse_acc;
}
1604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Per-block-size wrappers around VARIANCE_WxH / VARIANCE_LARGE_WxH.
 *
 * The constant passed as the third argument equals log2(width * height) for
 * the block size in the macro name (4x4 -> 4, ..., 64x64 -> 12).  Sizes up to
 * 16x16 use VARIANCE_WxH; larger ones use VARIANCE_LARGE_WxH.
 *
 * The definitions deliberately do not end in ';': each wrapper expands to an
 * expression, and the terminating ';' belongs to the statement using it.
 */
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4)
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5)
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5)
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6)
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7)
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7)
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8)

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9)
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9)
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10)
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11)
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11)
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12)
1619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Generates vpx_sub_pixel_variance<wd>x<ht>_msa().
 *
 * The generated function picks a kernel from the offsets:
 *   - xoffset == 0 and yoffset == 0: vpx_variance<wd>x<ht>_msa(), whose
 *     result is returned directly;
 *   - both non-zero: sub_pixel_sse_diff_<wd>width_hv_msa();
 *   - only yoffset non-zero: sub_pixel_sse_diff_<wd>width_v_msa();
 *   - only xoffset non-zero: sub_pixel_sse_diff_<wd>width_h_msa().
 * In the three filtered cases the kernel's return value is stored in *sse and
 * the function returns VARIANCE_<wd>Wx<ht>H(*sse, diff).
 *
 * xoffset and yoffset index bilinear_filters_msa directly and are not range
 * checked here.
 */
#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                              \
  uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(                           \
      const uint8_t *src, int32_t src_stride, int32_t xoffset,                \
      int32_t yoffset, const uint8_t *ref, int32_t ref_stride,                \
      uint32_t *sse) {                                                        \
    const uint8_t *const horiz_taps = bilinear_filters_msa[xoffset];          \
    const uint8_t *const vert_taps = bilinear_filters_msa[yoffset];           \
    int32_t sum_diff;                                                         \
                                                                              \
    /* Whole-pel position: use the plain variance kernel instead. */          \
    if (!xoffset && !yoffset) {                                               \
      return vpx_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride,  \
                                           sse);                              \
    }                                                                         \
                                                                              \
    if (xoffset && yoffset) {                                                 \
      *sse = sub_pixel_sse_diff_##wd##width_hv_msa(                           \
          src, src_stride, ref, ref_stride, horiz_taps, vert_taps, ht,        \
          &sum_diff);                                                         \
    } else if (yoffset) {                                                     \
      *sse = sub_pixel_sse_diff_##wd##width_v_msa(                            \
          src, src_stride, ref, ref_stride, vert_taps, ht, &sum_diff);        \
    } else {                                                                  \
      *sse = sub_pixel_sse_diff_##wd##width_h_msa(                            \
          src, src_stride, ref, ref_stride, horiz_taps, ht, &sum_diff);       \
    }                                                                         \
                                                                              \
    return VARIANCE_##wd##Wx##ht##H(*sse, sum_diff);                          \
  }
1654da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1655da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
1656da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
1657da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1658da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
1659da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
1660da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
1661da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1662da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
1663da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
1664da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
1665da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1666da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
1667da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
1668da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
1669da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1670da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
1671da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
1672da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Generates vpx_sub_pixel_avg_variance<wd>x<ht>_msa().
 *
 * The generated function picks a kernel from the offsets:
 *   - both non-zero: sub_pixel_avg_sse_diff_<wd>width_hv_msa();
 *   - only yoffset non-zero: sub_pixel_avg_sse_diff_<wd>width_v_msa();
 *   - only xoffset non-zero: sub_pixel_avg_sse_diff_<wd>width_h_msa();
 *   - both zero: avg_sse_diff_<wd>width_msa().
 * The kernel's return value is stored in *sse and the function returns
 * VARIANCE_<wd>Wx<ht>H(*sse, diff).
 *
 * xoffset and yoffset index bilinear_filters_msa directly and are not range
 * checked here.
 */
#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
  uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa(                       \
      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset,            \
      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride,            \
      uint32_t *sse, const uint8_t *sec_pred) {                               \
    const uint8_t *const horiz_taps = bilinear_filters_msa[xoffset];          \
    const uint8_t *const vert_taps = bilinear_filters_msa[yoffset];           \
    int32_t sum_diff;                                                         \
                                                                              \
    if (xoffset && yoffset) {                                                 \
      *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(                       \
          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, horiz_taps,     \
          vert_taps, ht, &sum_diff);                                          \
    } else if (yoffset) {                                                     \
      *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(                        \
          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, vert_taps, ht,  \
          &sum_diff);                                                         \
    } else if (xoffset) {                                                     \
      *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(                        \
          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, horiz_taps,     \
          ht, &sum_diff);                                                     \
    } else {                                                                  \
      *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr,       \
                                          ref_stride, sec_pred, ht,           \
                                          &sum_diff);                         \
    }                                                                         \
                                                                              \
    return VARIANCE_##wd##Wx##ht##H(*sse, sum_diff);                          \
  }
1705da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1706da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
1707da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);
1708da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
1710da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
1711da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);
1712da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1713da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
1714da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
1715da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);
1716da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1717da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
1718da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
1719da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1720da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianuint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
1721da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                             int32_t src_stride,
17227bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                             int32_t xoffset, int32_t yoffset,
1723da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                             const uint8_t *ref_ptr,
17247bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                             int32_t ref_stride, uint32_t *sse,
1725da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                             const uint8_t *sec_pred) {
1726da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int32_t diff;
1727da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const uint8_t *h_filter = bilinear_filters_msa[xoffset];
1728da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  const uint8_t *v_filter = bilinear_filters_msa[yoffset];
1729da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1730da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (yoffset) {
1731da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (xoffset) {
17327bc9febe8749e98a3812a0dc4380ceae75c29450Johann      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
17337bc9febe8749e98a3812a0dc4380ceae75c29450Johann          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
17347bc9febe8749e98a3812a0dc4380ceae75c29450Johann          v_filter, 64, &diff);
1735da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    } else {
17367bc9febe8749e98a3812a0dc4380ceae75c29450Johann      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
17377bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                                  ref_stride, sec_pred,
17387bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                                  v_filter, 64, &diff);
1739da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
1740da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
1741da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (xoffset) {
17427bc9febe8749e98a3812a0dc4380ceae75c29450Johann      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
17437bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                                  ref_stride, sec_pred,
17447bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                                  h_filter, 64, &diff);
1745da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    } else {
1746da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
1747da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                    sec_pred, &diff);
1748da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
1749da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
1750da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1751da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  return VARIANCE_32Wx64H(*sse, diff);
1752da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
1753da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Generates vpx_sub_pixel_avg_variance64x<ht>_msa().
 *
 * The generated function picks a kernel from the offsets:
 *   - both non-zero: sub_pixel_avg_sse_diff_64width_hv_msa();
 *   - only yoffset non-zero: sub_pixel_avg_sse_diff_64width_v_msa();
 *   - only xoffset non-zero: sub_pixel_avg_sse_diff_64width_h_msa();
 *   - both zero: avg_sse_diff_64x<ht>_msa(), which takes no height argument.
 * The kernel's return value is stored in *sse and the function returns
 * VARIANCE_64Wx<ht>H(*sse, diff).
 *
 * xoffset and yoffset index bilinear_filters_msa directly and are not range
 * checked here.
 */
#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                           \
  uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(                           \
      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset,            \
      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride,            \
      uint32_t *sse, const uint8_t *sec_pred) {                               \
    const uint8_t *const horiz_taps = bilinear_filters_msa[xoffset];          \
    const uint8_t *const vert_taps = bilinear_filters_msa[yoffset];           \
    int32_t sum_diff;                                                         \
                                                                              \
    if (xoffset && yoffset) {                                                 \
      *sse = sub_pixel_avg_sse_diff_64width_hv_msa(                           \
          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, horiz_taps,     \
          vert_taps, ht, &sum_diff);                                          \
    } else if (yoffset) {                                                     \
      *sse = sub_pixel_avg_sse_diff_64width_v_msa(                            \
          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, vert_taps, ht,  \
          &sum_diff);                                                         \
    } else if (xoffset) {                                                     \
      *sse = sub_pixel_avg_sse_diff_64width_h_msa(                            \
          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, horiz_taps,     \
          ht, &sum_diff);                                                     \
    } else {                                                                  \
      *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr,         \
                                        ref_stride, sec_pred, &sum_diff);     \
    }                                                                         \
                                                                              \
    return VARIANCE_64Wx##ht##H(*sse, sum_diff);                              \
  }
1786da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1787da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
1788da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianVPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);
1789