1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "./vpx_dsp_rtcd.h"
12#include "vpx_ports/mem.h"
13#include "vpx_dsp/mips/macros_msa.h"
14#include "vpx_dsp/variance.h"
15
16static const uint8_t bilinear_filters_msa[8][2] = {
17  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
18  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
19};
20
21#define CALC_MSE_AVG_B(src, ref, var, sub)                          \
22  {                                                                 \
23    v16u8 src_l0_m, src_l1_m;                                       \
24    v8i16 res_l0_m, res_l1_m;                                       \
25                                                                    \
26    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
27    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
28    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
29                                                                    \
30    sub += res_l0_m + res_l1_m;                                     \
31  }
32
33#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
34
35#define VARIANCE_LARGE_WxH(sse, diff, shift) \
36  sse - (((int64_t)diff * diff) >> shift)
37
38static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
39                                        int32_t src_stride,
40                                        const uint8_t *ref_ptr,
41                                        int32_t ref_stride,
42                                        const uint8_t *sec_pred, int32_t height,
43                                        int32_t *diff) {
44  int32_t ht_cnt;
45  uint32_t src0, src1, src2, src3;
46  uint32_t ref0, ref1, ref2, ref3;
47  v16u8 pred, src = { 0 };
48  v16u8 ref = { 0 };
49  v8i16 avg = { 0 };
50  v4i32 vec, var = { 0 };
51
52  for (ht_cnt = (height >> 2); ht_cnt--;) {
53    pred = LD_UB(sec_pred);
54    sec_pred += 16;
55    LW4(src_ptr, src_stride, src0, src1, src2, src3);
56    src_ptr += (4 * src_stride);
57    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
58    ref_ptr += (4 * ref_stride);
59
60    INSERT_W4_UB(src0, src1, src2, src3, src);
61    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
62
63    src = __msa_aver_u_b(src, pred);
64    CALC_MSE_AVG_B(src, ref, var, avg);
65  }
66
67  vec = __msa_hadd_s_w(avg, avg);
68  *diff = HADD_SW_S32(vec);
69
70  return HADD_SW_S32(var);
71}
72
73static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
74                                        int32_t src_stride,
75                                        const uint8_t *ref_ptr,
76                                        int32_t ref_stride,
77                                        const uint8_t *sec_pred, int32_t height,
78                                        int32_t *diff) {
79  int32_t ht_cnt;
80  v16u8 src0, src1, src2, src3;
81  v16u8 ref0, ref1, ref2, ref3;
82  v16u8 pred0, pred1;
83  v8i16 avg = { 0 };
84  v4i32 vec, var = { 0 };
85
86  for (ht_cnt = (height >> 2); ht_cnt--;) {
87    LD_UB2(sec_pred, 16, pred0, pred1);
88    sec_pred += 32;
89    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
90    src_ptr += (4 * src_stride);
91    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
92    ref_ptr += (4 * ref_stride);
93
94    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
95                ref0, ref1);
96    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
97    CALC_MSE_AVG_B(src0, ref0, var, avg);
98    CALC_MSE_AVG_B(src1, ref1, var, avg);
99  }
100
101  vec = __msa_hadd_s_w(avg, avg);
102  *diff = HADD_SW_S32(vec);
103
104  return HADD_SW_S32(var);
105}
106
107static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
108                                         int32_t src_stride,
109                                         const uint8_t *ref_ptr,
110                                         int32_t ref_stride,
111                                         const uint8_t *sec_pred,
112                                         int32_t height, int32_t *diff) {
113  int32_t ht_cnt;
114  v16u8 src, ref, pred;
115  v8i16 avg = { 0 };
116  v4i32 vec, var = { 0 };
117
118  for (ht_cnt = (height >> 2); ht_cnt--;) {
119    pred = LD_UB(sec_pred);
120    sec_pred += 16;
121    src = LD_UB(src_ptr);
122    src_ptr += src_stride;
123    ref = LD_UB(ref_ptr);
124    ref_ptr += ref_stride;
125    src = __msa_aver_u_b(src, pred);
126    CALC_MSE_AVG_B(src, ref, var, avg);
127
128    pred = LD_UB(sec_pred);
129    sec_pred += 16;
130    src = LD_UB(src_ptr);
131    src_ptr += src_stride;
132    ref = LD_UB(ref_ptr);
133    ref_ptr += ref_stride;
134    src = __msa_aver_u_b(src, pred);
135    CALC_MSE_AVG_B(src, ref, var, avg);
136
137    pred = LD_UB(sec_pred);
138    sec_pred += 16;
139    src = LD_UB(src_ptr);
140    src_ptr += src_stride;
141    ref = LD_UB(ref_ptr);
142    ref_ptr += ref_stride;
143    src = __msa_aver_u_b(src, pred);
144    CALC_MSE_AVG_B(src, ref, var, avg);
145
146    pred = LD_UB(sec_pred);
147    sec_pred += 16;
148    src = LD_UB(src_ptr);
149    src_ptr += src_stride;
150    ref = LD_UB(ref_ptr);
151    ref_ptr += ref_stride;
152    src = __msa_aver_u_b(src, pred);
153    CALC_MSE_AVG_B(src, ref, var, avg);
154  }
155
156  vec = __msa_hadd_s_w(avg, avg);
157  *diff = HADD_SW_S32(vec);
158
159  return HADD_SW_S32(var);
160}
161
162static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
163                                         int32_t src_stride,
164                                         const uint8_t *ref_ptr,
165                                         int32_t ref_stride,
166                                         const uint8_t *sec_pred,
167                                         int32_t height, int32_t *diff) {
168  int32_t ht_cnt;
169  v16u8 src0, src1, ref0, ref1, pred0, pred1;
170  v8i16 avg = { 0 };
171  v4i32 vec, var = { 0 };
172
173  for (ht_cnt = (height >> 2); ht_cnt--;) {
174    LD_UB2(sec_pred, 16, pred0, pred1);
175    sec_pred += 32;
176    LD_UB2(src_ptr, 16, src0, src1);
177    src_ptr += src_stride;
178    LD_UB2(ref_ptr, 16, ref0, ref1);
179    ref_ptr += ref_stride;
180    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
181    CALC_MSE_AVG_B(src0, ref0, var, avg);
182    CALC_MSE_AVG_B(src1, ref1, var, avg);
183
184    LD_UB2(sec_pred, 16, pred0, pred1);
185    sec_pred += 32;
186    LD_UB2(src_ptr, 16, src0, src1);
187    src_ptr += src_stride;
188    LD_UB2(ref_ptr, 16, ref0, ref1);
189    ref_ptr += ref_stride;
190    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
191    CALC_MSE_AVG_B(src0, ref0, var, avg);
192    CALC_MSE_AVG_B(src1, ref1, var, avg);
193
194    LD_UB2(sec_pred, 16, pred0, pred1);
195    sec_pred += 32;
196    LD_UB2(src_ptr, 16, src0, src1);
197    src_ptr += src_stride;
198    LD_UB2(ref_ptr, 16, ref0, ref1);
199    ref_ptr += ref_stride;
200    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
201    CALC_MSE_AVG_B(src0, ref0, var, avg);
202    CALC_MSE_AVG_B(src1, ref1, var, avg);
203
204    LD_UB2(sec_pred, 16, pred0, pred1);
205    sec_pred += 32;
206    LD_UB2(src_ptr, 16, src0, src1);
207    src_ptr += src_stride;
208    LD_UB2(ref_ptr, 16, ref0, ref1);
209    ref_ptr += ref_stride;
210    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
211    CALC_MSE_AVG_B(src0, ref0, var, avg);
212    CALC_MSE_AVG_B(src1, ref1, var, avg);
213  }
214
215  vec = __msa_hadd_s_w(avg, avg);
216  *diff = HADD_SW_S32(vec);
217
218  return HADD_SW_S32(var);
219}
220
221static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
222                                       int32_t src_stride,
223                                       const uint8_t *ref_ptr,
224                                       int32_t ref_stride,
225                                       const uint8_t *sec_pred, int32_t *diff) {
226  int32_t ht_cnt;
227  v16u8 src0, src1, ref0, ref1, pred0, pred1;
228  v8i16 avg0 = { 0 };
229  v8i16 avg1 = { 0 };
230  v4i32 vec, var = { 0 };
231
232  for (ht_cnt = 16; ht_cnt--;) {
233    LD_UB2(sec_pred, 16, pred0, pred1);
234    sec_pred += 32;
235    LD_UB2(src_ptr, 16, src0, src1);
236    src_ptr += src_stride;
237    LD_UB2(ref_ptr, 16, ref0, ref1);
238    ref_ptr += ref_stride;
239    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
240    CALC_MSE_AVG_B(src0, ref0, var, avg0);
241    CALC_MSE_AVG_B(src1, ref1, var, avg1);
242
243    LD_UB2(sec_pred, 16, pred0, pred1);
244    sec_pred += 32;
245    LD_UB2(src_ptr, 16, src0, src1);
246    src_ptr += src_stride;
247    LD_UB2(ref_ptr, 16, ref0, ref1);
248    ref_ptr += ref_stride;
249    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
250    CALC_MSE_AVG_B(src0, ref0, var, avg0);
251    CALC_MSE_AVG_B(src1, ref1, var, avg1);
252
253    LD_UB2(sec_pred, 16, pred0, pred1);
254    sec_pred += 32;
255    LD_UB2(src_ptr, 16, src0, src1);
256    src_ptr += src_stride;
257    LD_UB2(ref_ptr, 16, ref0, ref1);
258    ref_ptr += ref_stride;
259    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
260    CALC_MSE_AVG_B(src0, ref0, var, avg0);
261    CALC_MSE_AVG_B(src1, ref1, var, avg1);
262
263    LD_UB2(sec_pred, 16, pred0, pred1);
264    sec_pred += 32;
265    LD_UB2(src_ptr, 16, src0, src1);
266    src_ptr += src_stride;
267    LD_UB2(ref_ptr, 16, ref0, ref1);
268    ref_ptr += ref_stride;
269    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
270    CALC_MSE_AVG_B(src0, ref0, var, avg0);
271    CALC_MSE_AVG_B(src1, ref1, var, avg1);
272  }
273
274  vec = __msa_hadd_s_w(avg0, avg0);
275  vec += __msa_hadd_s_w(avg1, avg1);
276  *diff = HADD_SW_S32(vec);
277
278  return HADD_SW_S32(var);
279}
280
281static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
282                                       int32_t src_stride,
283                                       const uint8_t *ref_ptr,
284                                       int32_t ref_stride,
285                                       const uint8_t *sec_pred, int32_t *diff) {
286  int32_t ht_cnt;
287  v16u8 src0, src1, src2, src3;
288  v16u8 ref0, ref1, ref2, ref3;
289  v16u8 pred0, pred1, pred2, pred3;
290  v8i16 avg0 = { 0 };
291  v8i16 avg1 = { 0 };
292  v4i32 vec, var = { 0 };
293
294  for (ht_cnt = 16; ht_cnt--;) {
295    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
296    sec_pred += 64;
297    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
298    src_ptr += src_stride;
299    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
300    ref_ptr += ref_stride;
301    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
302                src2, src3);
303    CALC_MSE_AVG_B(src0, ref0, var, avg0);
304    CALC_MSE_AVG_B(src2, ref2, var, avg0);
305    CALC_MSE_AVG_B(src1, ref1, var, avg1);
306    CALC_MSE_AVG_B(src3, ref3, var, avg1);
307
308    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
309    sec_pred += 64;
310    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
311    src_ptr += src_stride;
312    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
313    ref_ptr += ref_stride;
314    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
315                src2, src3);
316    CALC_MSE_AVG_B(src0, ref0, var, avg0);
317    CALC_MSE_AVG_B(src2, ref2, var, avg0);
318    CALC_MSE_AVG_B(src1, ref1, var, avg1);
319    CALC_MSE_AVG_B(src3, ref3, var, avg1);
320  }
321
322  vec = __msa_hadd_s_w(avg0, avg0);
323  vec += __msa_hadd_s_w(avg1, avg1);
324
325  *diff = HADD_SW_S32(vec);
326
327  return HADD_SW_S32(var);
328}
329
330static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
331                                       int32_t src_stride,
332                                       const uint8_t *ref_ptr,
333                                       int32_t ref_stride,
334                                       const uint8_t *sec_pred, int32_t *diff) {
335  int32_t ht_cnt;
336  v16u8 src0, src1, src2, src3;
337  v16u8 ref0, ref1, ref2, ref3;
338  v16u8 pred0, pred1, pred2, pred3;
339  v8i16 avg0 = { 0 };
340  v8i16 avg1 = { 0 };
341  v8i16 avg2 = { 0 };
342  v8i16 avg3 = { 0 };
343  v4i32 vec, var = { 0 };
344
345  for (ht_cnt = 32; ht_cnt--;) {
346    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
347    sec_pred += 64;
348    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
349    src_ptr += src_stride;
350    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
351    ref_ptr += ref_stride;
352    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
353                src2, src3);
354    CALC_MSE_AVG_B(src0, ref0, var, avg0);
355    CALC_MSE_AVG_B(src1, ref1, var, avg1);
356    CALC_MSE_AVG_B(src2, ref2, var, avg2);
357    CALC_MSE_AVG_B(src3, ref3, var, avg3);
358
359    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
360    sec_pred += 64;
361    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
362    src_ptr += src_stride;
363    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
364    ref_ptr += ref_stride;
365    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
366                src2, src3);
367    CALC_MSE_AVG_B(src0, ref0, var, avg0);
368    CALC_MSE_AVG_B(src1, ref1, var, avg1);
369    CALC_MSE_AVG_B(src2, ref2, var, avg2);
370    CALC_MSE_AVG_B(src3, ref3, var, avg3);
371  }
372
373  vec = __msa_hadd_s_w(avg0, avg0);
374  vec += __msa_hadd_s_w(avg1, avg1);
375  vec += __msa_hadd_s_w(avg2, avg2);
376  vec += __msa_hadd_s_w(avg3, avg3);
377  *diff = HADD_SW_S32(vec);
378
379  return HADD_SW_S32(var);
380}
381
382static uint32_t sub_pixel_sse_diff_4width_h_msa(
383    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
384    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
385  int16_t filtval;
386  uint32_t loop_cnt;
387  uint32_t ref0, ref1, ref2, ref3;
388  v16u8 filt0, ref = { 0 };
389  v16i8 src0, src1, src2, src3;
390  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
391  v8u16 vec0, vec1, vec2, vec3;
392  v8i16 avg = { 0 };
393  v4i32 vec, var = { 0 };
394
395  filtval = LH(filter);
396  filt0 = (v16u8)__msa_fill_h(filtval);
397
398  for (loop_cnt = (height >> 2); loop_cnt--;) {
399    LD_SB4(src, src_stride, src0, src1, src2, src3);
400    src += (4 * src_stride);
401    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
402    dst += (4 * dst_stride);
403    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
404    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
405    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
406    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
407                vec2, vec3);
408    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
409    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
410                src2, src3);
411    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
412    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
413    CALC_MSE_AVG_B(src0, ref, var, avg);
414  }
415
416  vec = __msa_hadd_s_w(avg, avg);
417  *diff = HADD_SW_S32(vec);
418
419  return HADD_SW_S32(var);
420}
421
422static uint32_t sub_pixel_sse_diff_8width_h_msa(
423    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
424    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
425  int16_t filtval;
426  uint32_t loop_cnt;
427  v16u8 filt0, out, ref0, ref1, ref2, ref3;
428  v16i8 src0, src1, src2, src3;
429  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
430  v8u16 vec0, vec1, vec2, vec3;
431  v8i16 avg = { 0 };
432  v4i32 vec, var = { 0 };
433
434  filtval = LH(filter);
435  filt0 = (v16u8)__msa_fill_h(filtval);
436
437  for (loop_cnt = (height >> 2); loop_cnt--;) {
438    LD_SB4(src, src_stride, src0, src1, src2, src3);
439    src += (4 * src_stride);
440    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
441    dst += (4 * dst_stride);
442
443    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
444    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
445    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
446    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
447                vec2, vec3);
448    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
449    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
450                src2, src3);
451    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
452    CALC_MSE_AVG_B(out, ref0, var, avg);
453    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
454    CALC_MSE_AVG_B(out, ref1, var, avg);
455  }
456
457  vec = __msa_hadd_s_w(avg, avg);
458  *diff = HADD_SW_S32(vec);
459
460  return HADD_SW_S32(var);
461}
462
463static uint32_t sub_pixel_sse_diff_16width_h_msa(
464    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
465    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
466  int16_t filtval;
467  uint32_t loop_cnt;
468  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
469  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
470  v16u8 dst0, dst1, dst2, dst3, filt0;
471  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
472  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
473  v8i16 avg = { 0 };
474  v4i32 vec, var = { 0 };
475
476  filtval = LH(filter);
477  filt0 = (v16u8)__msa_fill_h(filtval);
478
479  for (loop_cnt = (height >> 2); loop_cnt--;) {
480    LD_SB4(src, src_stride, src0, src2, src4, src6);
481    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
482    src += (4 * src_stride);
483    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
484    dst += (4 * dst_stride);
485
486    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
487    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
488    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
489    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
490    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
491                out2, out3);
492    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
493                out6, out7);
494    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
495    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
496    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
497                src2, src3);
498    CALC_MSE_AVG_B(src0, dst0, var, avg);
499    CALC_MSE_AVG_B(src1, dst1, var, avg);
500    CALC_MSE_AVG_B(src2, dst2, var, avg);
501    CALC_MSE_AVG_B(src3, dst3, var, avg);
502  }
503
504  vec = __msa_hadd_s_w(avg, avg);
505  *diff = HADD_SW_S32(vec);
506
507  return HADD_SW_S32(var);
508}
509
510static uint32_t sub_pixel_sse_diff_32width_h_msa(
511    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
512    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
513  uint32_t loop_cnt, sse = 0;
514  int32_t diff0[2];
515
516  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
517    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
518                                            filter, height, &diff0[loop_cnt]);
519    src += 16;
520    dst += 16;
521  }
522
523  *diff = diff0[0] + diff0[1];
524
525  return sse;
526}
527
528static uint32_t sub_pixel_sse_diff_64width_h_msa(
529    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
530    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
531  uint32_t loop_cnt, sse = 0;
532  int32_t diff0[4];
533
534  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
535    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
536                                            filter, height, &diff0[loop_cnt]);
537    src += 16;
538    dst += 16;
539  }
540
541  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
542
543  return sse;
544}
545
546static uint32_t sub_pixel_sse_diff_4width_v_msa(
547    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
548    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
549  int16_t filtval;
550  uint32_t loop_cnt;
551  uint32_t ref0, ref1, ref2, ref3;
552  v16u8 src0, src1, src2, src3, src4, out;
553  v16u8 src10_r, src32_r, src21_r, src43_r;
554  v16u8 ref = { 0 };
555  v16u8 src2110, src4332;
556  v16u8 filt0;
557  v8i16 avg = { 0 };
558  v4i32 vec, var = { 0 };
559  v8u16 tmp0, tmp1;
560
561  filtval = LH(filter);
562  filt0 = (v16u8)__msa_fill_h(filtval);
563
564  src0 = LD_UB(src);
565  src += src_stride;
566
567  for (loop_cnt = (height >> 2); loop_cnt--;) {
568    LD_UB4(src, src_stride, src1, src2, src3, src4);
569    src += (4 * src_stride);
570    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
571    dst += (4 * dst_stride);
572
573    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
574    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
575               src32_r, src43_r);
576    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
577    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
578    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
579    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
580    CALC_MSE_AVG_B(out, ref, var, avg);
581    src0 = src4;
582  }
583
584  vec = __msa_hadd_s_w(avg, avg);
585  *diff = HADD_SW_S32(vec);
586
587  return HADD_SW_S32(var);
588}
589
590static uint32_t sub_pixel_sse_diff_8width_v_msa(
591    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
592    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
593  int16_t filtval;
594  uint32_t loop_cnt;
595  v16u8 src0, src1, src2, src3, src4;
596  v16u8 ref0, ref1, ref2, ref3;
597  v8u16 vec0, vec1, vec2, vec3;
598  v8u16 tmp0, tmp1, tmp2, tmp3;
599  v16u8 filt0;
600  v8i16 avg = { 0 };
601  v4i32 vec, var = { 0 };
602
603  filtval = LH(filter);
604  filt0 = (v16u8)__msa_fill_h(filtval);
605
606  src0 = LD_UB(src);
607  src += src_stride;
608
609  for (loop_cnt = (height >> 2); loop_cnt--;) {
610    LD_UB4(src, src_stride, src1, src2, src3, src4);
611    src += (4 * src_stride);
612    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
613    dst += (4 * dst_stride);
614
615    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
616    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
617               vec3);
618    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
619                tmp2, tmp3);
620    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
621    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
622    CALC_MSE_AVG_B(src0, ref0, var, avg);
623    CALC_MSE_AVG_B(src1, ref1, var, avg);
624    src0 = src4;
625  }
626
627  vec = __msa_hadd_s_w(avg, avg);
628  *diff = HADD_SW_S32(vec);
629
630  return HADD_SW_S32(var);
631}
632
633static uint32_t sub_pixel_sse_diff_16width_v_msa(
634    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
635    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
636  int16_t filtval;
637  uint32_t loop_cnt;
638  v16u8 ref0, ref1, ref2, ref3;
639  v16u8 src0, src1, src2, src3, src4;
640  v16u8 out0, out1, out2, out3;
641  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
642  v8u16 tmp0, tmp1, tmp2, tmp3;
643  v16u8 filt0;
644  v8i16 avg = { 0 };
645  v4i32 vec, var = { 0 };
646
647  filtval = LH(filter);
648  filt0 = (v16u8)__msa_fill_h(filtval);
649
650  src0 = LD_UB(src);
651  src += src_stride;
652
653  for (loop_cnt = (height >> 2); loop_cnt--;) {
654    LD_UB4(src, src_stride, src1, src2, src3, src4);
655    src += (4 * src_stride);
656    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
657    dst += (4 * dst_stride);
658
659    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
660    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
661    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
662    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
663    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
664
665    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
666    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
667    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
668    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
669    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
670
671    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
672    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
673    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
674    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
675    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
676    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
677
678    src0 = src4;
679
680    CALC_MSE_AVG_B(out0, ref0, var, avg);
681    CALC_MSE_AVG_B(out1, ref1, var, avg);
682    CALC_MSE_AVG_B(out2, ref2, var, avg);
683    CALC_MSE_AVG_B(out3, ref3, var, avg);
684  }
685
686  vec = __msa_hadd_s_w(avg, avg);
687  *diff = HADD_SW_S32(vec);
688
689  return HADD_SW_S32(var);
690}
691
692static uint32_t sub_pixel_sse_diff_32width_v_msa(
693    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
694    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
695  uint32_t loop_cnt, sse = 0;
696  int32_t diff0[2];
697
698  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
699    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
700                                            filter, height, &diff0[loop_cnt]);
701    src += 16;
702    dst += 16;
703  }
704
705  *diff = diff0[0] + diff0[1];
706
707  return sse;
708}
709
710static uint32_t sub_pixel_sse_diff_64width_v_msa(
711    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
712    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
713  uint32_t loop_cnt, sse = 0;
714  int32_t diff0[4];
715
716  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
717    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
718                                            filter, height, &diff0[loop_cnt]);
719    src += 16;
720    dst += 16;
721  }
722
723  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
724
725  return sse;
726}
727
728static uint32_t sub_pixel_sse_diff_4width_hv_msa(
729    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
730    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
731    int32_t height, int32_t *diff) {
732  int16_t filtval;
733  uint32_t loop_cnt;
734  uint32_t ref0, ref1, ref2, ref3;
735  v16u8 src0, src1, src2, src3, src4;
736  v16u8 out, ref = { 0 };
737  v16u8 filt_vt, filt_hz, vec0, vec1;
738  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
739  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
740  v8u16 tmp0, tmp1;
741  v8i16 avg = { 0 };
742  v4i32 vec, var = { 0 };
743
744  filtval = LH(filter_horiz);
745  filt_hz = (v16u8)__msa_fill_h(filtval);
746  filtval = LH(filter_vert);
747  filt_vt = (v16u8)__msa_fill_h(filtval);
748
749  src0 = LD_UB(src);
750  src += src_stride;
751
752  for (loop_cnt = (height >> 2); loop_cnt--;) {
753    LD_UB4(src, src_stride, src1, src2, src3, src4);
754    src += (4 * src_stride);
755    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
756    dst += (4 * dst_stride);
757    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
758    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
759    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
760    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
761    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
762    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
763    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
764    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
765    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
766    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
767    CALC_MSE_AVG_B(out, ref, var, avg);
768    src0 = src4;
769  }
770
771  vec = __msa_hadd_s_w(avg, avg);
772  *diff = HADD_SW_S32(vec);
773
774  return HADD_SW_S32(var);
775}
776
777static uint32_t sub_pixel_sse_diff_8width_hv_msa(
778    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
779    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
780    int32_t height, int32_t *diff) {
781  int16_t filtval;
782  uint32_t loop_cnt;
783  v16u8 ref0, ref1, ref2, ref3;
784  v16u8 src0, src1, src2, src3, src4;
785  v16u8 out0, out1;
786  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
787  v8u16 hz_out0, hz_out1;
788  v8u16 tmp0, tmp1, tmp2, tmp3;
789  v16u8 filt_vt, filt_hz, vec0;
790  v8i16 avg = { 0 };
791  v4i32 vec, var = { 0 };
792
793  filtval = LH(filter_horiz);
794  filt_hz = (v16u8)__msa_fill_h(filtval);
795  filtval = LH(filter_vert);
796  filt_vt = (v16u8)__msa_fill_h(filtval);
797
798  src0 = LD_UB(src);
799  src += src_stride;
800  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
801
802  for (loop_cnt = (height >> 2); loop_cnt--;) {
803    LD_UB4(src, src_stride, src1, src2, src3, src4);
804    src += (4 * src_stride);
805    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
806    dst += (4 * dst_stride);
807
808    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
809    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
810    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
811    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
812    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
813    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
814    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
815    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
816    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
817    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
818    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
819    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
820    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
821    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
822    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
823    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
824    CALC_MSE_AVG_B(out0, ref0, var, avg);
825    CALC_MSE_AVG_B(out1, ref1, var, avg);
826  }
827
828  vec = __msa_hadd_s_w(avg, avg);
829  *diff = HADD_SW_S32(vec);
830
831  return HADD_SW_S32(var);
832}
833
834static uint32_t sub_pixel_sse_diff_16width_hv_msa(
835    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
836    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
837    int32_t height, int32_t *diff) {
838  int16_t filtval;
839  uint32_t loop_cnt;
840  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
841  v16u8 ref0, ref1, ref2, ref3;
842  v16u8 filt_hz, filt_vt, vec0, vec1;
843  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
844  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
845  v8u16 tmp0, tmp1;
846  v8i16 avg = { 0 };
847  v4i32 vec, var = { 0 };
848
849  filtval = LH(filter_horiz);
850  filt_hz = (v16u8)__msa_fill_h(filtval);
851  filtval = LH(filter_vert);
852  filt_vt = (v16u8)__msa_fill_h(filtval);
853
854  LD_UB2(src, 8, src0, src1);
855  src += src_stride;
856
857  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
858  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
859
860  for (loop_cnt = (height >> 2); loop_cnt--;) {
861    LD_UB4(src, src_stride, src0, src2, src4, src6);
862    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
863    src += (4 * src_stride);
864    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
865    dst += (4 * dst_stride);
866
867    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
868    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
869    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
870    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
871    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
872    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
873
874    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
875    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
876    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
877    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
878    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
879    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
880
881    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
882    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
883    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
884    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
885    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
886    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
887
888    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
889    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
890    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
891    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
892    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
893    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
894
895    CALC_MSE_AVG_B(src0, ref0, var, avg);
896    CALC_MSE_AVG_B(src1, ref1, var, avg);
897    CALC_MSE_AVG_B(src2, ref2, var, avg);
898    CALC_MSE_AVG_B(src3, ref3, var, avg);
899  }
900
901  vec = __msa_hadd_s_w(avg, avg);
902  *diff = HADD_SW_S32(vec);
903
904  return HADD_SW_S32(var);
905}
906
907static uint32_t sub_pixel_sse_diff_32width_hv_msa(
908    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
909    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
910    int32_t height, int32_t *diff) {
911  uint32_t loop_cnt, sse = 0;
912  int32_t diff0[2];
913
914  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
915    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
916                                             filter_horiz, filter_vert, height,
917                                             &diff0[loop_cnt]);
918    src += 16;
919    dst += 16;
920  }
921
922  *diff = diff0[0] + diff0[1];
923
924  return sse;
925}
926
927static uint32_t sub_pixel_sse_diff_64width_hv_msa(
928    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
929    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
930    int32_t height, int32_t *diff) {
931  uint32_t loop_cnt, sse = 0;
932  int32_t diff0[4];
933
934  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
935    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
936                                             filter_horiz, filter_vert, height,
937                                             &diff0[loop_cnt]);
938    src += 16;
939    dst += 16;
940  }
941
942  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
943
944  return sse;
945}
946
947static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
948    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
949    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
950    int32_t height, int32_t *diff) {
951  int16_t filtval;
952  uint32_t loop_cnt;
953  uint32_t ref0, ref1, ref2, ref3;
954  v16u8 out, pred, filt0, ref = { 0 };
955  v16i8 src0, src1, src2, src3;
956  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
957  v8u16 vec0, vec1, vec2, vec3;
958  v8i16 avg = { 0 };
959  v4i32 vec, var = { 0 };
960
961  filtval = LH(filter);
962  filt0 = (v16u8)__msa_fill_h(filtval);
963
964  for (loop_cnt = (height >> 2); loop_cnt--;) {
965    LD_SB4(src, src_stride, src0, src1, src2, src3);
966    src += (4 * src_stride);
967    pred = LD_UB(sec_pred);
968    sec_pred += 16;
969    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
970    dst += (4 * dst_stride);
971
972    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
973    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
974    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
975    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
976                vec2, vec3);
977    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
978    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
979                src2, src3);
980    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
981    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
982    out = __msa_aver_u_b(out, pred);
983    CALC_MSE_AVG_B(out, ref, var, avg);
984  }
985
986  vec = __msa_hadd_s_w(avg, avg);
987  *diff = HADD_SW_S32(vec);
988
989  return HADD_SW_S32(var);
990}
991
992static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
993    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
994    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
995    int32_t height, int32_t *diff) {
996  int16_t filtval;
997  uint32_t loop_cnt;
998  v16u8 out, pred, filt0;
999  v16u8 ref0, ref1, ref2, ref3;
1000  v16i8 src0, src1, src2, src3;
1001  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1002  v8u16 vec0, vec1, vec2, vec3;
1003  v8i16 avg = { 0 };
1004  v4i32 vec, var = { 0 };
1005
1006  filtval = LH(filter);
1007  filt0 = (v16u8)__msa_fill_h(filtval);
1008
1009  for (loop_cnt = (height >> 2); loop_cnt--;) {
1010    LD_SB4(src, src_stride, src0, src1, src2, src3);
1011    src += (4 * src_stride);
1012    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1013    dst += (4 * dst_stride);
1014
1015    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
1016    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1017    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1018    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
1019                vec2, vec3);
1020    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
1021    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
1022                src2, src3);
1023    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
1024
1025    pred = LD_UB(sec_pred);
1026    sec_pred += 16;
1027    out = __msa_aver_u_b(out, pred);
1028    CALC_MSE_AVG_B(out, ref0, var, avg);
1029    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
1030    pred = LD_UB(sec_pred);
1031    sec_pred += 16;
1032    out = __msa_aver_u_b(out, pred);
1033    CALC_MSE_AVG_B(out, ref1, var, avg);
1034  }
1035
1036  vec = __msa_hadd_s_w(avg, avg);
1037  *diff = HADD_SW_S32(vec);
1038
1039  return HADD_SW_S32(var);
1040}
1041
1042static uint32_t subpel_avg_ssediff_16w_h_msa(
1043    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
1044    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
1045    int32_t height, int32_t *diff, int32_t width) {
1046  int16_t filtval;
1047  uint32_t loop_cnt;
1048  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1049  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1050  v16u8 dst0, dst1, dst2, dst3;
1051  v16u8 tmp0, tmp1, tmp2, tmp3;
1052  v16u8 pred0, pred1, pred2, pred3, filt0;
1053  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1054  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
1055  v8i16 avg = { 0 };
1056  v4i32 vec, var = { 0 };
1057
1058  filtval = LH(filter);
1059  filt0 = (v16u8)__msa_fill_h(filtval);
1060
1061  for (loop_cnt = (height >> 2); loop_cnt--;) {
1062    LD_SB4(src, src_stride, src0, src2, src4, src6);
1063    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1064    src += (4 * src_stride);
1065    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1066    dst += (4 * dst_stride);
1067    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
1068    sec_pred += (4 * width);
1069
1070    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1071    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1072    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
1073    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
1074    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
1075                out2, out3);
1076    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
1077                out6, out7);
1078    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
1079    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
1080    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
1081                tmp2, tmp3);
1082    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
1083                tmp2, tmp3);
1084
1085    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
1086    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
1087    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
1088    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
1089  }
1090
1091  vec = __msa_hadd_s_w(avg, avg);
1092  *diff = HADD_SW_S32(vec);
1093
1094  return HADD_SW_S32(var);
1095}
1096
1097static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
1098    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
1099    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
1100    int32_t height, int32_t *diff) {
1101  return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
1102                                      sec_pred, filter, height, diff, 16);
1103}
1104
1105static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
1106    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
1107    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
1108    int32_t height, int32_t *diff) {
1109  uint32_t loop_cnt, sse = 0;
1110  int32_t diff0[2];
1111
1112  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
1113    sse +=
1114        subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
1115                                     filter, height, &diff0[loop_cnt], 32);
1116    src += 16;
1117    dst += 16;
1118    sec_pred += 16;
1119  }
1120
1121  *diff = diff0[0] + diff0[1];
1122
1123  return sse;
1124}
1125
1126static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
1127    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
1128    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
1129    int32_t height, int32_t *diff) {
1130  uint32_t loop_cnt, sse = 0;
1131  int32_t diff0[4];
1132
1133  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
1134    sse +=
1135        subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
1136                                     filter, height, &diff0[loop_cnt], 64);
1137    src += 16;
1138    dst += 16;
1139    sec_pred += 16;
1140  }
1141
1142  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
1143
1144  return sse;
1145}
1146
1147static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
1148    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
1149    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
1150    int32_t height, int32_t *diff) {
1151  int16_t filtval;
1152  uint32_t loop_cnt;
1153  uint32_t ref0, ref1, ref2, ref3;
1154  v16u8 src0, src1, src2, src3, src4;
1155  v16u8 src10_r, src32_r, src21_r, src43_r;
1156  v16u8 out, pred, ref = { 0 };
1157  v16u8 src2110, src4332, filt0;
1158  v8i16 avg = { 0 };
1159  v4i32 vec, var = { 0 };
1160  v8u16 tmp0, tmp1;
1161
1162  filtval = LH(filter);
1163  filt0 = (v16u8)__msa_fill_h(filtval);
1164
1165  src0 = LD_UB(src);
1166  src += src_stride;
1167
1168  for (loop_cnt = (height >> 2); loop_cnt--;) {
1169    LD_UB4(src, src_stride, src1, src2, src3, src4);
1170    src += (4 * src_stride);
1171    pred = LD_UB(sec_pred);
1172    sec_pred += 16;
1173    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
1174    dst += (4 * dst_stride);
1175
1176    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
1177    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1178               src32_r, src43_r);
1179    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1180    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
1181    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1182
1183    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1184    out = __msa_aver_u_b(out, pred);
1185    CALC_MSE_AVG_B(out, ref, var, avg);
1186    src0 = src4;
1187  }
1188
1189  vec = __msa_hadd_s_w(avg, avg);
1190  *diff = HADD_SW_S32(vec);
1191
1192  return HADD_SW_S32(var);
1193}
1194
1195static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
1196    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
1197    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
1198    int32_t height, int32_t *diff) {
1199  int16_t filtval;
1200  uint32_t loop_cnt;
1201  v16u8 src0, src1, src2, src3, src4;
1202  v16u8 ref0, ref1, ref2, ref3;
1203  v16u8 pred0, pred1, filt0;
1204  v8u16 vec0, vec1, vec2, vec3;
1205  v8u16 tmp0, tmp1, tmp2, tmp3;
1206  v8i16 avg = { 0 };
1207  v4i32 vec, var = { 0 };
1208
1209  filtval = LH(filter);
1210  filt0 = (v16u8)__msa_fill_h(filtval);
1211
1212  src0 = LD_UB(src);
1213  src += src_stride;
1214
1215  for (loop_cnt = (height >> 2); loop_cnt--;) {
1216    LD_UB4(src, src_stride, src1, src2, src3, src4);
1217    src += (4 * src_stride);
1218    LD_UB2(sec_pred, 16, pred0, pred1);
1219    sec_pred += 32;
1220    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1221    dst += (4 * dst_stride);
1222    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
1223    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
1224               vec3);
1225    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
1226                tmp2, tmp3);
1227    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
1228    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
1229    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
1230    CALC_MSE_AVG_B(src0, ref0, var, avg);
1231    CALC_MSE_AVG_B(src1, ref1, var, avg);
1232
1233    src0 = src4;
1234  }
1235
1236  vec = __msa_hadd_s_w(avg, avg);
1237  *diff = HADD_SW_S32(vec);
1238
1239  return HADD_SW_S32(var);
1240}
1241
1242static uint32_t subpel_avg_ssediff_16w_v_msa(
1243    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
1244    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
1245    int32_t height, int32_t *diff, int32_t width) {
1246  int16_t filtval;
1247  uint32_t loop_cnt;
1248  v16u8 ref0, ref1, ref2, ref3;
1249  v16u8 pred0, pred1, pred2, pred3;
1250  v16u8 src0, src1, src2, src3, src4;
1251  v16u8 out0, out1, out2, out3, filt0;
1252  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1253  v8u16 tmp0, tmp1, tmp2, tmp3;
1254  v8i16 avg = { 0 };
1255  v4i32 vec, var = { 0 };
1256
1257  filtval = LH(filter);
1258  filt0 = (v16u8)__msa_fill_h(filtval);
1259
1260  src0 = LD_UB(src);
1261  src += src_stride;
1262
1263  for (loop_cnt = (height >> 2); loop_cnt--;) {
1264    LD_UB4(src, src_stride, src1, src2, src3, src4);
1265    src += (4 * src_stride);
1266    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
1267    sec_pred += (4 * width);
1268
1269    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
1270    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
1271    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
1272    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1273    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1274
1275    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
1276    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
1277    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
1278    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
1279    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
1280
1281    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
1282    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1283    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1284
1285    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
1286    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
1287    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
1288
1289    src0 = src4;
1290    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1291    dst += (4 * dst_stride);
1292
1293    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
1294                out2, out3);
1295
1296    CALC_MSE_AVG_B(out0, ref0, var, avg);
1297    CALC_MSE_AVG_B(out1, ref1, var, avg);
1298    CALC_MSE_AVG_B(out2, ref2, var, avg);
1299    CALC_MSE_AVG_B(out3, ref3, var, avg);
1300  }
1301
1302  vec = __msa_hadd_s_w(avg, avg);
1303  *diff = HADD_SW_S32(vec);
1304
1305  return HADD_SW_S32(var);
1306}
1307
1308static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
1309    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
1310    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
1311    int32_t height, int32_t *diff) {
1312  return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
1313                                      sec_pred, filter, height, diff, 16);
1314}
1315
1316static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
1317    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
1318    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
1319    int32_t height, int32_t *diff) {
1320  uint32_t loop_cnt, sse = 0;
1321  int32_t diff0[2];
1322
1323  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
1324    sse +=
1325        subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
1326                                     filter, height, &diff0[loop_cnt], 32);
1327    src += 16;
1328    dst += 16;
1329    sec_pred += 16;
1330  }
1331
1332  *diff = diff0[0] + diff0[1];
1333
1334  return sse;
1335}
1336
1337static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
1338    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
1339    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
1340    int32_t height, int32_t *diff) {
1341  uint32_t loop_cnt, sse = 0;
1342  int32_t diff0[4];
1343
1344  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
1345    sse +=
1346        subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
1347                                     filter, height, &diff0[loop_cnt], 64);
1348    src += 16;
1349    dst += 16;
1350    sec_pred += 16;
1351  }
1352
1353  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
1354
1355  return sse;
1356}
1357
1358static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
1359    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
1360    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
1361    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
1362  int16_t filtval;
1363  uint32_t loop_cnt;
1364  uint32_t ref0, ref1, ref2, ref3;
1365  v16u8 src0, src1, src2, src3, src4;
1366  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1367  v16u8 filt_hz, filt_vt, vec0, vec1;
1368  v16u8 out, pred, ref = { 0 };
1369  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
1370  v8i16 avg = { 0 };
1371  v4i32 vec, var = { 0 };
1372
1373  filtval = LH(filter_horiz);
1374  filt_hz = (v16u8)__msa_fill_h(filtval);
1375  filtval = LH(filter_vert);
1376  filt_vt = (v16u8)__msa_fill_h(filtval);
1377
1378  src0 = LD_UB(src);
1379  src += src_stride;
1380
1381  for (loop_cnt = (height >> 2); loop_cnt--;) {
1382    LD_UB4(src, src_stride, src1, src2, src3, src4);
1383    src += (4 * src_stride);
1384    pred = LD_UB(sec_pred);
1385    sec_pred += 16;
1386    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
1387    dst += (4 * dst_stride);
1388    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
1389    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
1390    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
1391    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
1392    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
1393    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
1394    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1395    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1396    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1397    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1398    out = __msa_aver_u_b(out, pred);
1399    CALC_MSE_AVG_B(out, ref, var, avg);
1400    src0 = src4;
1401  }
1402
1403  vec = __msa_hadd_s_w(avg, avg);
1404  *diff = HADD_SW_S32(vec);
1405
1406  return HADD_SW_S32(var);
1407}
1408
1409static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
1410    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
1411    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
1412    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
1413  int16_t filtval;
1414  uint32_t loop_cnt;
1415  v16u8 ref0, ref1, ref2, ref3;
1416  v16u8 src0, src1, src2, src3, src4;
1417  v16u8 pred0, pred1, out0, out1;
1418  v16u8 filt_hz, filt_vt, vec0;
1419  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1420  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
1421  v8i16 avg = { 0 };
1422  v4i32 vec, var = { 0 };
1423
1424  filtval = LH(filter_horiz);
1425  filt_hz = (v16u8)__msa_fill_h(filtval);
1426  filtval = LH(filter_vert);
1427  filt_vt = (v16u8)__msa_fill_h(filtval);
1428
1429  src0 = LD_UB(src);
1430  src += src_stride;
1431  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
1432
1433  for (loop_cnt = (height >> 2); loop_cnt--;) {
1434    LD_UB4(src, src_stride, src1, src2, src3, src4);
1435    src += (4 * src_stride);
1436    LD_UB2(sec_pred, 16, pred0, pred1);
1437    sec_pred += 32;
1438    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1439    dst += (4 * dst_stride);
1440
1441    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
1442    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
1443
1444    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
1445    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
1446    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
1447
1448    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
1449    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
1450    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1451    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
1452
1453    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
1454    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
1455    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
1456
1457    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
1458    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
1459
1460    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
1461    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1462    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
1463
1464    CALC_MSE_AVG_B(out0, ref0, var, avg);
1465    CALC_MSE_AVG_B(out1, ref1, var, avg);
1466  }
1467
1468  vec = __msa_hadd_s_w(avg, avg);
1469  *diff = HADD_SW_S32(vec);
1470
1471  return HADD_SW_S32(var);
1472}
1473
static uint32_t subpel_avg_ssediff_16w_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 out0, out1, out2, out3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
                out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

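/* Full 16-wide block: sec_pred rows are packed contiguously, so the shared
 * 16-wide routine is called with a sec_pred row stride of 16. */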
static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                       sec_pred, filter_horiz, filter_vert,
                                       height, diff, 16);
}

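/* 32-wide block processed as two adjacent 16-wide columns.  Each call uses a
 * sec_pred row stride of 32, and the per-column difference sums are combined
 * afterwards. */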
static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

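/* 64-wide block processed as four adjacent 16-wide columns with a sec_pred row
 * stride of 64; the four per-column difference sums are combined afterwards. */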
static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

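/* Per-block-size wrappers around VARIANCE_WxH / VARIANCE_LARGE_WxH.  The shift
 * is log2(wd * ht), so each wrapper evaluates
 *   variance = sse - (sum * sum) / (wd * ht)
 * with the division by the pixel count done as a right shift.  The _LARGE_
 * variants cover block sizes of 512 pixels and more. */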
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);

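/* Generates vpx_sub_pixel_variance<wd>x<ht>_msa().  xoffset / yoffset select
 * the bilinear filter phase: both nonzero takes the hv path, only yoffset the
 * vertical path, only xoffset the horizontal path, and zero offsets fall back
 * to the plain variance kernel. */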
#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                              \
  uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(                           \
      const uint8_t *src, int32_t src_stride, int32_t xoffset,                \
      int32_t yoffset, const uint8_t *ref, int32_t ref_stride,                \
      uint32_t *sse) {                                                        \
    int32_t diff;                                                             \
    uint32_t var;                                                             \
    const uint8_t *h_filter = bilinear_filters_msa[xoffset];                  \
    const uint8_t *v_filter = bilinear_filters_msa[yoffset];                  \
                                                                              \
    if (yoffset) {                                                            \
      if (xoffset) {                                                          \
        *sse = sub_pixel_sse_diff_##wd##width_hv_msa(                         \
            src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
      } else {                                                                \
        *sse = sub_pixel_sse_diff_##wd##width_v_msa(                          \
            src, src_stride, ref, ref_stride, v_filter, ht, &diff);           \
      }                                                                       \
                                                                              \
      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                             \
    } else {                                                                  \
      if (xoffset) {                                                          \
        *sse = sub_pixel_sse_diff_##wd##width_h_msa(                          \
            src, src_stride, ref, ref_stride, h_filter, ht, &diff);           \
                                                                              \
        var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                           \
      } else {                                                                \
        var = vpx_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
                                            sse);                             \
      }                                                                       \
    }                                                                         \
                                                                              \
    return var;                                                               \
  }

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);

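/* Generates vpx_sub_pixel_avg_variance<wd>x<ht>_msa(): the same offset
 * dispatch as above, but every path averages the filtered prediction with
 * sec_pred before accumulating sse and diff. */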
#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
  uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa(                       \
      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset,            \
      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride,            \
      uint32_t *sse, const uint8_t *sec_pred) {                               \
    int32_t diff;                                                             \
    const uint8_t *h_filter = bilinear_filters_msa[xoffset];                  \
    const uint8_t *v_filter = bilinear_filters_msa[yoffset];                  \
                                                                              \
    if (yoffset) {                                                            \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(                     \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
            v_filter, ht, &diff);                                             \
      } else {                                                                \
        *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(                      \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff);                                                           \
      }                                                                       \
    } else {                                                                  \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(                      \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff);                                                           \
      } else {                                                                \
        *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr,     \
                                            ref_stride, sec_pred, ht, &diff); \
      }                                                                       \
    }                                                                         \
                                                                              \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                              \
  }

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);

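/* 32x64 is spelled out instead of being generated by the macro above: its
 * zero-offset path calls the dedicated avg_sse_diff_32x64_msa helper, whose
 * signature (no height argument) does not match the avg_sse_diff_<wd>width_msa
 * pattern the macro expects. */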
uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
                                             int32_t src_stride,
                                             int32_t xoffset, int32_t yoffset,
                                             const uint8_t *ref_ptr,
                                             int32_t ref_stride, uint32_t *sse,
                                             const uint8_t *sec_pred) {
  int32_t diff;
  const uint8_t *h_filter = bilinear_filters_msa[xoffset];
  const uint8_t *v_filter = bilinear_filters_msa[yoffset];

  if (yoffset) {
    if (xoffset) {
      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
          v_filter, 64, &diff);
    } else {
      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  v_filter, 64, &diff);
    }
  } else {
    if (xoffset) {
      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  h_filter, 64, &diff);
    } else {
      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
                                    sec_pred, &diff);
    }
  }

  return VARIANCE_32Wx64H(*sse, diff);
}

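/* The 64-wide avg variances likewise get their own generator because the
 * zero-offset path uses the avg_sse_diff_64x<ht>_msa helpers, which encode the
 * block height in the function name rather than taking it as a parameter. */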
#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                           \
  uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(                           \
      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset,            \
      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride,            \
      uint32_t *sse, const uint8_t *sec_pred) {                               \
    int32_t diff;                                                             \
    const uint8_t *h_filter = bilinear_filters_msa[xoffset];                  \
    const uint8_t *v_filter = bilinear_filters_msa[yoffset];                  \
                                                                              \
    if (yoffset) {                                                            \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_64width_hv_msa(                         \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
            v_filter, ht, &diff);                                             \
      } else {                                                                \
        *sse = sub_pixel_avg_sse_diff_64width_v_msa(                          \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff);                                                           \
      }                                                                       \
    } else {                                                                  \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_64width_h_msa(                          \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff);                                                           \
      } else {                                                                \
        *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr,       \
                                          ref_stride, sec_pred, &diff);       \
      }                                                                       \
    }                                                                         \
                                                                              \
    return VARIANCE_64Wx##ht##H(*sse, diff);                                  \
  }

VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);
