/* sub_pixel_variance_msa.c, revision 7ce0a1d1337c01056ba24006efab21f00e179e04 */
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/mips/macros_msa.h"
#include "vpx_dsp/variance.h"

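/* 2-tap bilinear filter taps for the eight sub-pixel offsets (0..7); each
 * pair of taps sums to 128, matching the rounding shift by FILTER_BITS
 * applied after the dot products below. */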
static const uint8_t bilinear_filters_msa[8][2] = {
  { 128,   0, },
  { 112,  16, },
  {  96,  32, },
  {  80,  48, },
  {  64,  64, },
  {  48,  80, },
  {  32,  96, },
  {  16, 112, },
};

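/* Accumulates the sum of squared differences between 'src' and 'ref' into
 * 'var' and the signed sum of differences into 'sub', 16 bytes at a time. */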
#define CALC_MSE_AVG_B(src, ref, var, sub) {                       \
  v16u8 src_l0_m, src_l1_m;                                        \
  v8i16 res_l0_m, res_l1_m;                                        \
                                                                   \
  ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
  HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
  DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
                                                                   \
  sub += res_l0_m + res_l1_m;                                      \
}

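/* variance = sse - (sum * sum) / (width * height), with 'shift' equal to
 * log2(width * height); the LARGE variant squares the sum in 64 bits so
 * that it cannot overflow for the bigger block sizes. */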
#define VARIANCE_WxH(sse, diff, shift) \
  sse - (((uint32_t)diff * diff) >> shift)

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  sse - (((int64_t)diff * diff) >> shift)

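/* avg_sse_diff_*: average the source block with the second prediction
 * (sec_pred), then return the sum of squared differences against the
 * reference and store the signed sum of differences in *diff. */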
static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred,
                                        int32_t height,
                                        int32_t *diff) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 pred, src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred,
                                        int32_t height,
                                        int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                src0, src1, ref0, ref1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height,
                                         int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref, pred;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height,
                                         int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred,
                                       int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred,
                                       int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
                src0, src1, src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);

    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
                src0, src1, src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);

  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred,
                                       int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v8i16 avg2 = { 0 };
  v8i16 avg3 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 32; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
                src0, src1, src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);

    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
                src0, src1, src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

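/* sub_pixel_sse_diff_*_h: horizontal 2-tap bilinear filtering of the source,
 * followed by SSE (return value) and signed difference sum (*diff) against
 * dst. Widths above 16 are processed as 16-pixel-wide columns. */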
static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                const uint8_t *dst,
                                                int32_t dst_stride,
                                                const uint8_t *filter,
                                                int32_t height,
                                                int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                src0, src1, src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    CALC_MSE_AVG_B(src0, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                const uint8_t *dst,
                                                int32_t dst_stride,
                                                const uint8_t *filter,
                                                int32_t height,
                                                int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 filt0, out, ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                src0, src1, src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter,
                                                 int32_t height,
                                                 int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                out0, out1, out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                out4, out5, out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6,
                src0, src1, src2, src3);
    CALC_MSE_AVG_B(src0, dst0, var, avg);
    CALC_MSE_AVG_B(src1, dst1, var, avg);
    CALC_MSE_AVG_B(src2, dst2, var, avg);
    CALC_MSE_AVG_B(src3, dst3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter,
                                                 int32_t height,
                                                 int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter,
                                                 int32_t height,
                                                 int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

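/* sub_pixel_sse_diff_*_v: same as the horizontal variants, but the 2-tap
 * bilinear filter is applied vertically across rows. */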
static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                const uint8_t *dst,
                                                int32_t dst_stride,
                                                const uint8_t *filter,
                                                int32_t height,
                                                int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4, out;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 ref = { 0 };
  v16u8 src2110, src4332;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                const uint8_t *dst,
                                                int32_t dst_stride,
                                                const uint8_t *filter,
                                                int32_t height,
                                                int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
               vec0, vec1, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter,
                                                 int32_t height,
                                                 int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    src0 = src4;

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter,
                                                 int32_t height,
                                                 int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter,
                                                 int32_t height,
                                                 int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

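/* sub_pixel_sse_diff_*_hv: horizontal filtering first, then vertical
 * filtering of the intermediate rows, for the diagonal sub-pixel case. */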
static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter_horiz,
                                                 const uint8_t *filter_vert,
                                                 int32_t height,
                                                 int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out, ref = { 0 };
  v16u8 filt_vt, filt_hz, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter_horiz,
                                                 const uint8_t *filter_vert,
                                                 int32_t height,
                                                 int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt_vt, filt_hz, vec0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  const uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const uint8_t *filter_horiz,
                                                  const uint8_t *filter_vert,
                                                  int32_t height,
                                                  int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    CALC_MSE_AVG_B(src2, ref2, var, avg);
    CALC_MSE_AVG_B(src3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  const uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const uint8_t *filter_horiz,
                                                  const uint8_t *filter_vert,
                                                  int32_t height,
                                                  int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert, height,
                                             &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  const uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const uint8_t *filter_horiz,
                                                  const uint8_t *filter_vert,
                                                  int32_t height,
                                                  int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert, height,
                                             &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

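/* sub_pixel_avg_sse_diff_*: as above, but the filtered result is first
 * averaged with the second prediction (sec_pred) before computing the SSE
 * and difference sum against dst. */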
static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    const uint8_t *dst,
                                                    int32_t dst_stride,
                                                    const uint8_t *sec_pred,
                                                    const uint8_t *filter,
                                                    int32_t height,
                                                    int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 out, pred, filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                src0, src1, src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    const uint8_t *dst,
                                                    int32_t dst_stride,
                                                    const uint8_t *sec_pred,
                                                    const uint8_t *filter,
                                                    int32_t height,
                                                    int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 out, pred, filt0;
  v16u8 ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                src0, src1, src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

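/* 16-pixel-wide row worker shared by the 16/32/64-wide averaging wrappers;
 * 'width' is the row stride of sec_pred, so callers can step through it in
 * 16-byte columns. */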
static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             const uint8_t *dst,
                                             int32_t dst_stride,
                                             const uint8_t *sec_pred,
                                             const uint8_t *filter,
                                             int32_t height,
                                             int32_t *diff,
                                             int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3;
  v16u8 tmp0, tmp1, tmp2, tmp3;
  v16u8 pred0, pred1, pred2, pred3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                out0, out1, out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                out4, out5, out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6,
                tmp0, tmp1, tmp2, tmp3);
    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3,
                tmp0, tmp1, tmp2, tmp3);

    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     const uint8_t *dst,
                                                     int32_t dst_stride,
                                                     const uint8_t *sec_pred,
                                                     const uint8_t *filter,
                                                     int32_t height,
                                                     int32_t *diff) {
  return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
                                      sec_pred, filter, height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     const uint8_t *dst,
                                                     int32_t dst_stride,
                                                     const uint8_t *sec_pred,
                                                     const uint8_t *filter,
                                                     int32_t height,
                                                     int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
                                        sec_pred, filter, height,
                                        &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     const uint8_t *dst,
                                                     int32_t dst_stride,
                                                     const uint8_t *sec_pred,
                                                     const uint8_t *filter,
                                                     int32_t height,
                                                     int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
                                        sec_pred, filter, height,
                                        &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

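/* Vertical-filter averaging variants; the structure mirrors the
 * non-averaging *_v functions above, with the extra sec_pred averaging
 * step before the SSE/diff accumulation. */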
1245static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src,
1246                                                    int32_t src_stride,
1247                                                    const uint8_t *dst,
1248                                                    int32_t dst_stride,
1249                                                    const uint8_t *sec_pred,
1250                                                    const uint8_t *filter,
1251                                                    int32_t height,
1252                                                    int32_t *diff) {
1253  int16_t filtval;
1254  uint32_t loop_cnt;
1255  uint32_t ref0, ref1, ref2, ref3;
1256  v16u8 src0, src1, src2, src3, src4;
1257  v16u8 src10_r, src32_r, src21_r, src43_r;
1258  v16u8 out, pred, ref = { 0 };
1259  v16u8 src2110, src4332, filt0;
1260  v8i16 avg = { 0 };
1261  v4i32 vec, var = { 0 };
1262  v8u16 tmp0, tmp1;
1263
1264  filtval = LH(filter);
1265  filt0 = (v16u8)__msa_fill_h(filtval);
1266
1267  src0 = LD_UB(src);
1268  src += src_stride;
1269
1270  for (loop_cnt = (height >> 2); loop_cnt--;) {
1271    LD_UB4(src, src_stride, src1, src2, src3, src4);
1272    src += (4 * src_stride);
1273    pred = LD_UB(sec_pred);
1274    sec_pred += 16;
1275    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
1276    dst += (4 * dst_stride);
1277
1278    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
1279    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1280               src10_r, src21_r, src32_r, src43_r);
1281    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1282    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
1283    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1284
1285    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1286    out = __msa_aver_u_b(out, pred);
1287    CALC_MSE_AVG_B(out, ref, var, avg);
1288    src0 = src4;
1289  }
1290
1291  vec = __msa_hadd_s_w(avg, avg);
1292  *diff = HADD_SW_S32(vec);
1293
1294  return HADD_SW_S32(var);
1295}
1296
1297static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src,
1298                                                    int32_t src_stride,
1299                                                    const uint8_t *dst,
1300                                                    int32_t dst_stride,
1301                                                    const uint8_t *sec_pred,
1302                                                    const uint8_t *filter,
1303                                                    int32_t height,
1304                                                    int32_t *diff) {
1305  int16_t filtval;
1306  uint32_t loop_cnt;
1307  v16u8 src0, src1, src2, src3, src4;
1308  v16u8 ref0, ref1, ref2, ref3;
1309  v16u8 pred0, pred1, filt0;
1310  v8u16 vec0, vec1, vec2, vec3;
1311  v8u16 tmp0, tmp1, tmp2, tmp3;
1312  v8i16 avg = { 0 };
1313  v4i32 vec, var = { 0 };
1314
1315  filtval = LH(filter);
1316  filt0 = (v16u8)__msa_fill_h(filtval);
1317
1318  src0 = LD_UB(src);
1319  src += src_stride;
1320
1321  for (loop_cnt = (height >> 2); loop_cnt--;) {
1322    LD_UB4(src, src_stride, src1, src2, src3, src4);
1323    src += (4 * src_stride);
1324    LD_UB2(sec_pred, 16, pred0, pred1);
1325    sec_pred += 32;
1326    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1327    dst += (4 * dst_stride);
1328    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
1329    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
1330               vec0, vec1, vec2, vec3);
1331    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1332                tmp0, tmp1, tmp2, tmp3);
1333    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
1334    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
1335    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
1336    CALC_MSE_AVG_B(src0, ref0, var, avg);
1337    CALC_MSE_AVG_B(src1, ref1, var, avg);
1338
1339    src0 = src4;
1340  }
1341
1342  vec = __msa_hadd_s_w(avg, avg);
1343  *diff = HADD_SW_S32(vec);
1344
1345  return HADD_SW_S32(var);
1346}
1347
1348static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src,
1349                                             int32_t src_stride,
1350                                             const uint8_t *dst,
1351                                             int32_t dst_stride,
1352                                             const uint8_t *sec_pred,
1353                                             const uint8_t *filter,
1354                                             int32_t height,
1355                                             int32_t *diff,
1356                                             int32_t width) {
1357  int16_t filtval;
1358  uint32_t loop_cnt;
1359  v16u8 ref0, ref1, ref2, ref3;
1360  v16u8 pred0, pred1, pred2, pred3;
1361  v16u8 src0, src1, src2, src3, src4;
1362  v16u8 out0, out1, out2, out3, filt0;
1363  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1364  v8u16 tmp0, tmp1, tmp2, tmp3;
1365  v8i16 avg = { 0 };
1366  v4i32 vec, var = { 0 };
1367
1368  filtval = LH(filter);
1369  filt0 = (v16u8)__msa_fill_h(filtval);
1370
1371  src0 = LD_UB(src);
1372  src += src_stride;
1373
1374  for (loop_cnt = (height >> 2); loop_cnt--;) {
1375    LD_UB4(src, src_stride, src1, src2, src3, src4);
1376    src += (4 * src_stride);
1377    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
1378    sec_pred += (4 * width);
1379
1380    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
1381    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
1382    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
1383    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1384    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1385
1386    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
1387    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
1388    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
1389    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
1390    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
1391
1392    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
1393    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1394    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1395
1396    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
1397    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
1398    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
1399
1400    src0 = src4;
1401    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1402    dst += (4 * dst_stride);
1403
1404    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
1405                out0, out1, out2, out3);
1406
1407    CALC_MSE_AVG_B(out0, ref0, var, avg);
1408    CALC_MSE_AVG_B(out1, ref1, var, avg);
1409    CALC_MSE_AVG_B(out2, ref2, var, avg);
1410    CALC_MSE_AVG_B(out3, ref3, var, avg);
1411  }
1412
1413  vec = __msa_hadd_s_w(avg, avg);
1414  *diff = HADD_SW_S32(vec);
1415
1416  return HADD_SW_S32(var);
1417}
1418
1419static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(const uint8_t *src,
1420                                                     int32_t src_stride,
1421                                                     const uint8_t *dst,
1422                                                     int32_t dst_stride,
1423                                                     const uint8_t *sec_pred,
1424                                                     const uint8_t *filter,
1425                                                     int32_t height,
1426                                                     int32_t *diff) {
1427  return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
1428                                      sec_pred, filter, height, diff, 16);
1429}
1430
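/* Blocks wider than 16 pixels are split into 16-wide columns: each pass below
 * runs the 16-wide kernel down the full height of one column, passing the true
 * block width through so that sec_pred is stepped by the correct row stride,
 * and the per-column sums of differences are combined before the variance is
 * formed. */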
1431static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src,
1432                                                     int32_t src_stride,
1433                                                     const uint8_t *dst,
1434                                                     int32_t dst_stride,
1435                                                     const uint8_t *sec_pred,
1436                                                     const uint8_t *filter,
1437                                                     int32_t height,
1438                                                     int32_t *diff) {
1439  uint32_t loop_cnt, sse = 0;
1440  int32_t diff0[2];
1441
1442  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
1443    sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
1444                                        sec_pred, filter, height,
1445                                        &diff0[loop_cnt], 32);
1446    src += 16;
1447    dst += 16;
1448    sec_pred += 16;
1449  }
1450
1451  *diff = diff0[0] + diff0[1];
1452
1453  return sse;
1454}
1455
1456static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src,
1457                                                     int32_t src_stride,
1458                                                     const uint8_t *dst,
1459                                                     int32_t dst_stride,
1460                                                     const uint8_t *sec_pred,
1461                                                     const uint8_t *filter,
1462                                                     int32_t height,
1463                                                     int32_t *diff) {
1464  uint32_t loop_cnt, sse = 0;
1465  int32_t diff0[4];
1466
1467  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
1468    sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
1469                                        sec_pred, filter, height,
1470                                        &diff0[loop_cnt], 64);
1471    src += 16;
1472    dst += 16;
1473    sec_pred += 16;
1474  }
1475
1476  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
1477
1478  return sse;
1479}
1480
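/* Horizontal-and-vertical (hv) kernels: the horizontal 2-tap filter is applied
 * first to produce intermediate rows (one extra source row below the block is
 * needed), the vertical 2-tap filter then blends consecutive intermediate
 * rows, and the result is averaged with sec_pred before the SSE/sum update.
 * The 4-wide variant packs four 4-pixel output rows into a single vector per
 * loop iteration. */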
1481static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
1482  const uint8_t *src, int32_t src_stride,
1483  const uint8_t *dst, int32_t dst_stride,
1484  const uint8_t *sec_pred,
1485  const uint8_t *filter_horiz, const uint8_t *filter_vert,
1486  int32_t height, int32_t *diff) {
1487  int16_t filtval;
1488  uint32_t loop_cnt;
1489  uint32_t ref0, ref1, ref2, ref3;
1490  v16u8 src0, src1, src2, src3, src4;
1491  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1492  v16u8 filt_hz, filt_vt, vec0, vec1;
1493  v16u8 out, pred, ref = { 0 };
1494  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
1495  v8i16 avg = { 0 };
1496  v4i32 vec, var = { 0 };
1497
1498  filtval = LH(filter_horiz);
1499  filt_hz = (v16u8)__msa_fill_h(filtval);
1500  filtval = LH(filter_vert);
1501  filt_vt = (v16u8)__msa_fill_h(filtval);
1502
1503  src0 = LD_UB(src);
1504  src += src_stride;
1505
1506  for (loop_cnt = (height >> 2); loop_cnt--;) {
1507    LD_UB4(src, src_stride, src1, src2, src3, src4);
1508    src += (4 * src_stride);
1509    pred = LD_UB(sec_pred);
1510    sec_pred += 16;
1511    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
1512    dst += (4 * dst_stride);
1513    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
1514    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
1515    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
1516    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
1517    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
1518    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
1519    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1520    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1521    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1522    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1523    out = __msa_aver_u_b(out, pred);
1524    CALC_MSE_AVG_B(out, ref, var, avg);
1525    src0 = src4;
1526  }
1527
1528  vec = __msa_hadd_s_w(avg, avg);
1529  *diff = HADD_SW_S32(vec);
1530
1531  return HADD_SW_S32(var);
1532}
1533
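/* 8-wide hv case: each source row is filtered horizontally on its own, the
 * vertical blend pairs each filtered row with the previous one, and two
 * 8-pixel output rows are packed per vector before averaging with sec_pred
 * and accumulating the differences. */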
1534static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
1535  const uint8_t *src, int32_t src_stride,
1536  const uint8_t *dst, int32_t dst_stride,
1537  const uint8_t *sec_pred,
1538  const uint8_t *filter_horiz, const uint8_t *filter_vert,
1539  int32_t height, int32_t *diff) {
1540  int16_t filtval;
1541  uint32_t loop_cnt;
1542  v16u8 ref0, ref1, ref2, ref3;
1543  v16u8 src0, src1, src2, src3, src4;
1544  v16u8 pred0, pred1, out0, out1;
1545  v16u8 filt_hz, filt_vt, vec0;
1546  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1547  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
1548  v8i16 avg = { 0 };
1549  v4i32 vec, var = { 0 };
1550
1551  filtval = LH(filter_horiz);
1552  filt_hz = (v16u8)__msa_fill_h(filtval);
1553  filtval = LH(filter_vert);
1554  filt_vt = (v16u8)__msa_fill_h(filtval);
1555
1556  src0 = LD_UB(src);
1557  src += src_stride;
1558  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
1559
1560  for (loop_cnt = (height >> 2); loop_cnt--;) {
1561    LD_UB4(src, src_stride, src1, src2, src3, src4);
1562    src += (4 * src_stride);
1563    LD_UB2(sec_pred, 16, pred0, pred1);
1564    sec_pred += 32;
1565    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1566    dst += (4 * dst_stride);
1567
1568    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
1569    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
1570
1571    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
1572    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
1573    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
1574
1575    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
1576    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
1577    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1578    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
1579
1580    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
1581    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
1582    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
1583
1584    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
1585    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
1586
1587    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
1588    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1589    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
1590
1591    CALC_MSE_AVG_B(out0, ref0, var, avg);
1592    CALC_MSE_AVG_B(out1, ref1, var, avg);
1593  }
1594
1595  vec = __msa_hadd_s_w(avg, avg);
1596  *diff = HADD_SW_S32(vec);
1597
1598  return HADD_SW_S32(var);
1599}
1600
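/* 16-wide hv kernel.  Each 16-pixel row is filtered horizontally in two
 * 8-pixel halves (hence the second load at src + 8, since each half needs one
 * pixel of look-ahead), the halves are blended vertically against the previous
 * row's filtered halves, and the result is averaged with sec_pred.  As with
 * the vertical-only kernel, `width` is the sec_pred row stride so the 32- and
 * 64-wide wrappers can reuse this routine one 16-wide column at a time. */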
1601static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src,
1602                                              int32_t src_stride,
1603                                              const uint8_t *dst,
1604                                              int32_t dst_stride,
1605                                              const uint8_t *sec_pred,
1606                                              const uint8_t *filter_horiz,
1607                                              const uint8_t *filter_vert,
1608                                              int32_t height,
1609                                              int32_t *diff,
1610                                              int32_t width) {
1611  int16_t filtval;
1612  uint32_t loop_cnt;
1613  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1614  v16u8 ref0, ref1, ref2, ref3;
1615  v16u8 pred0, pred1, pred2, pred3;
1616  v16u8 out0, out1, out2, out3;
1617  v16u8 filt_hz, filt_vt, vec0, vec1;
1618  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1619  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
1620  v8i16 avg = { 0 };
1621  v4i32 vec, var = { 0 };
1622
1623  filtval = LH(filter_horiz);
1624  filt_hz = (v16u8)__msa_fill_h(filtval);
1625  filtval = LH(filter_vert);
1626  filt_vt = (v16u8)__msa_fill_h(filtval);
1627
1628  LD_UB2(src, 8, src0, src1);
1629  src += src_stride;
1630
1631  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
1632  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
1633
1634  for (loop_cnt = (height >> 2); loop_cnt--;) {
1635    LD_UB4(src, src_stride, src0, src2, src4, src6);
1636    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
1637    src += (4 * src_stride);
1638    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
1639    sec_pred += (4 * width);
1640
1641    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
1642    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
1643    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1644    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1645    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1646    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1647
1648    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
1649    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
1650    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
1651    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1652    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1653    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1654
1655    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
1656    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
1657    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1658    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1659    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1660    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1661
1662    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
1663    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
1664    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
1665    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1666    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1667    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1668
1669    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1670    dst += (4 * dst_stride);
1671
1672    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
1673                out0, out1, out2, out3);
1674
1675    CALC_MSE_AVG_B(out0, ref0, var, avg);
1676    CALC_MSE_AVG_B(out1, ref1, var, avg);
1677    CALC_MSE_AVG_B(out2, ref2, var, avg);
1678    CALC_MSE_AVG_B(out3, ref3, var, avg);
1679  }
1680
1681  vec = __msa_hadd_s_w(avg, avg);
1682  *diff = HADD_SW_S32(vec);
1683
1684  return HADD_SW_S32(var);
1685}
1686
1687static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
1688  const uint8_t *src, int32_t src_stride,
1689  const uint8_t *dst, int32_t dst_stride,
1690  const uint8_t *sec_pred,
1691  const uint8_t *filter_horiz, const uint8_t *filter_vert,
1692  int32_t height, int32_t *diff) {
1693  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
1694                                       sec_pred, filter_horiz, filter_vert,
1695                                       height, diff, 16);
1696}
1697
1698static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
1699  const uint8_t *src, int32_t src_stride,
1700  const uint8_t *dst, int32_t dst_stride,
1701  const uint8_t *sec_pred,
1702  const uint8_t *filter_horiz, const uint8_t *filter_vert,
1703  int32_t height, int32_t *diff) {
1704  uint32_t loop_cnt, sse = 0;
1705  int32_t diff0[2];
1706
1707  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
1708    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
1709                                         sec_pred, filter_horiz, filter_vert,
1710                                         height, &diff0[loop_cnt], 32);
1711    src += 16;
1712    dst += 16;
1713    sec_pred += 16;
1714  }
1715
1716  *diff = diff0[0] + diff0[1];
1717
1718  return sse;
1719}
1720
1721static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
1722  const uint8_t *src, int32_t src_stride,
1723  const uint8_t *dst, int32_t dst_stride,
1724  const uint8_t *sec_pred,
1725  const uint8_t *filter_horiz, const uint8_t *filter_vert,
1726  int32_t height, int32_t *diff) {
1727  uint32_t loop_cnt, sse = 0;
1728  int32_t diff0[4];
1729
1730  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
1731    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
1732                                         sec_pred, filter_horiz, filter_vert,
1733                                         height, &diff0[loop_cnt], 64);
1734    src += 16;
1735    dst += 16;
1736    sec_pred += 16;
1737  }
1738
1739  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
1740
1741  return sse;
1742}
1743
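/* variance = sse - sum^2 / (width * height); the shift is log2(w * h), e.g.
 * 4 for 4x4 and 8 for 16x16.  From 16x32 upwards sum^2 can exceed 32 bits
 * (|sum| may reach 512 * 255), so the _LARGE_ form widens the product to
 * int64_t before shifting. */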
1744#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4)
1745#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5)
1746#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5)
1747#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6)
1748#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7)
1749#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7)
1750#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8)
1751
1752#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9)
1753#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9)
1754#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10)
1755#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11)
1756#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11)
1757#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12)
1758
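/* Stamps out vpx_sub_pixel_variance<wd>x<ht>_msa().  xoffset and yoffset are
 * eighth-pel positions (0..7) indexing bilinear_filters_msa; an offset of 0
 * means no filtering is needed in that direction, and when both are 0 the
 * plain variance kernel is used.  Illustrative call with hypothetical buffers,
 * assuming a half-pel offset in both directions:
 *
 *   uint32_t sse;
 *   uint32_t var = vpx_sub_pixel_variance16x16_msa(src, src_stride, 4, 4,
 *                                                  ref, ref_stride, &sse);
 */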
1759#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                         \
1760uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src,     \
1761                                                 int32_t src_stride,     \
1762                                                 int32_t xoffset,        \
1763                                                 int32_t yoffset,        \
1764                                                 const uint8_t *ref,     \
1765                                                 int32_t ref_stride,     \
1766                                                 uint32_t *sse) {        \
1767  int32_t diff;                                                          \
1768  uint32_t var;                                                          \
1769  const uint8_t *h_filter = bilinear_filters_msa[xoffset];               \
1770  const uint8_t *v_filter = bilinear_filters_msa[yoffset];               \
1771                                                                         \
1772  if (yoffset) {                                                         \
1773    if (xoffset) {                                                       \
1774      *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride,      \
1775                                                   ref, ref_stride,      \
1776                                                   h_filter, v_filter,   \
1777                                                   ht, &diff);           \
1778    } else {                                                             \
1779      *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride,       \
1780                                                  ref, ref_stride,       \
1781                                                  v_filter, ht, &diff);  \
1782    }                                                                    \
1783                                                                         \
1784    var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                          \
1785  } else {                                                               \
1786    if (xoffset) {                                                       \
1787      *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride,       \
1788                                                  ref, ref_stride,       \
1789                                                  h_filter, ht, &diff);  \
1790                                                                         \
1791      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                        \
1792    } else {                                                             \
1793      var = vpx_variance##wd##x##ht##_msa(src, src_stride,               \
1794                                          ref, ref_stride, sse);         \
1795    }                                                                    \
1796  }                                                                      \
1797                                                                         \
1798  return var;                                                            \
1799}
1800
1801VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
1802VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
1803
1804VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
1805VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
1806VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
1807
1808VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
1809VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
1810VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
1811
1812VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
1813VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
1814VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
1815
1816VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
1817VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
1818
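/* The _avg_ variants implement the compound-prediction path: the filtered
 * sub-pixel block is averaged with sec_pred (a packed wd x ht second
 * predictor) before differencing against the reference, mirroring the generic
 * C path in vpx_dsp/variance.c. */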
1819#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
1820uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa(                         \
1821  const uint8_t *src_ptr, int32_t src_stride,                                 \
1822  int32_t xoffset, int32_t yoffset,                                           \
1823  const uint8_t *ref_ptr, int32_t ref_stride,                                 \
1824  uint32_t *sse, const uint8_t *sec_pred) {                                   \
1825  int32_t diff;                                                               \
1826  const uint8_t *h_filter = bilinear_filters_msa[xoffset];                    \
1827  const uint8_t *v_filter = bilinear_filters_msa[yoffset];                    \
1828                                                                              \
1829  if (yoffset) {                                                              \
1830    if (xoffset) {                                                            \
1831      *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(src_ptr, src_stride,   \
1832                                                       ref_ptr, ref_stride,   \
1833                                                       sec_pred, h_filter,    \
1834                                                       v_filter, ht, &diff);  \
1835    } else {                                                                  \
1836      *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(src_ptr, src_stride,    \
1837                                                      ref_ptr, ref_stride,    \
1838                                                      sec_pred, v_filter,     \
1839                                                      ht, &diff);             \
1840    }                                                                         \
1841  } else {                                                                    \
1842    if (xoffset) {                                                            \
1843      *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(src_ptr, src_stride,    \
1844                                                      ref_ptr, ref_stride,    \
1845                                                      sec_pred, h_filter,     \
1846                                                      ht, &diff);             \
1847    } else {                                                                  \
1848      *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride,                \
1849                                          ref_ptr, ref_stride,                \
1850                                          sec_pred, ht, &diff);               \
1851    }                                                                         \
1852  }                                                                           \
1853                                                                              \
1854  return VARIANCE_##wd##Wx##ht##H(*sse, diff);                                \
1855}
1856
1857VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
1858VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);
1859
1860VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
1861VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
1862VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);
1863
1864VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
1865VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
1866VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);
1867
1868VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
1869VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
1870
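/* 32x64 is written out by hand because its no-offset path calls the dedicated
 * avg_sse_diff_32x64_msa helper, which the generic macro above cannot name;
 * the sub-pixel branches are otherwise identical to the macro-generated
 * ones. */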
1871uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
1872                                             int32_t src_stride,
1873                                             int32_t xoffset,
1874                                             int32_t yoffset,
1875                                             const uint8_t *ref_ptr,
1876                                             int32_t ref_stride,
1877                                             uint32_t *sse,
1878                                             const uint8_t *sec_pred) {
1879  int32_t diff;
1880  const uint8_t *h_filter = bilinear_filters_msa[xoffset];
1881  const uint8_t *v_filter = bilinear_filters_msa[yoffset];
1882
1883  if (yoffset) {
1884    if (xoffset) {
1885      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(src_ptr, src_stride,
1886                                                   ref_ptr, ref_stride,
1887                                                   sec_pred, h_filter,
1888                                                   v_filter, 64, &diff);
1889    } else {
1890      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride,
1891                                                  ref_ptr, ref_stride,
1892                                                  sec_pred, v_filter,
1893                                                  64, &diff);
1894    }
1895  } else {
1896    if (xoffset) {
1897      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride,
1898                                                  ref_ptr, ref_stride,
1899                                                  sec_pred, h_filter,
1900                                                  64, &diff);
1901    } else {
1902      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
1903                                    sec_pred, &diff);
1904    }
1905  }
1906
1907  return VARIANCE_32Wx64H(*sse, diff);
1908}
1909
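/* Same arrangement for the 64-wide sizes: only the no-offset path differs
 * from the generic macro, using the size-specific avg_sse_diff_64x<ht>_msa
 * helpers. */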
1910#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                          \
1911uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(const uint8_t *src_ptr,     \
1912                                                 int32_t src_stride,         \
1913                                                 int32_t xoffset,            \
1914                                                 int32_t yoffset,            \
1915                                                 const uint8_t *ref_ptr,     \
1916                                                 int32_t ref_stride,         \
1917                                                 uint32_t *sse,              \
1918                                                 const uint8_t *sec_pred) {  \
1919  int32_t diff;                                                              \
1920  const uint8_t *h_filter = bilinear_filters_msa[xoffset];                   \
1921  const uint8_t *v_filter = bilinear_filters_msa[yoffset];                   \
1922                                                                             \
1923  if (yoffset) {                                                             \
1924    if (xoffset) {                                                           \
1925      *sse = sub_pixel_avg_sse_diff_64width_hv_msa(src_ptr, src_stride,      \
1926                                                   ref_ptr, ref_stride,      \
1927                                                   sec_pred, h_filter,       \
1928                                                   v_filter, ht, &diff);     \
1929    } else {                                                                 \
1930      *sse = sub_pixel_avg_sse_diff_64width_v_msa(src_ptr, src_stride,       \
1931                                                  ref_ptr, ref_stride,       \
1932                                                  sec_pred, v_filter,        \
1933                                                  ht, &diff);                \
1934    }                                                                        \
1935  } else {                                                                   \
1936    if (xoffset) {                                                           \
1937      *sse = sub_pixel_avg_sse_diff_64width_h_msa(src_ptr, src_stride,       \
1938                                                  ref_ptr, ref_stride,       \
1939                                                  sec_pred, h_filter,        \
1940                                                  ht, &diff);                \
1941    } else {                                                                 \
1942      *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride,                 \
1943                                        ref_ptr, ref_stride,                 \
1944                                        sec_pred, &diff);                    \
1945    }                                                                        \
1946  }                                                                          \
1947                                                                             \
1948  return VARIANCE_64Wx##ht##H(*sse, diff);                                   \
1949}
1950
1951VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
1952VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);
1953