/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

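/* Gathers four 4-byte rows: inserts the first 32-bit word of each of
 * in0..in3 into successive word lanes of out. */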
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) {    \
  out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0);  \
  out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1);  \
  out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2);  \
  out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3);  \
}
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)

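/* Plain SAD helpers: accumulate |src - ref| over a width x height block,
 * several rows per loop iteration, and reduce the vector accumulator(s)
 * to a scalar at the end. */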
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                src0, src1, ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}

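/* x3 helpers: for each row, accumulate SADs against the reference at
 * horizontal byte offsets 0, 1 and 2, producing three sums. */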
static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    INSERT_W4_UB(src0, src1, src2, src3, src);

    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
                src0, src1, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref, ref0, ref1, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = height >> 1; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v4u32 sad;

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
    ref0_4 = LD_UB(ref + 64);
    ref += ref_stride;

    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_SW_S32((v4i32)sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_SW_S32((v4i32)sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_SW_S32((v4i32)sad);
}

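/* x8 helpers: as above, but against the reference at horizontal byte
 * offsets 0 through 7, producing eight sums. */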
static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, diff;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
                src0, src1, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref0, ref1, ref;
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1;
  v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  const uint8_t *src_dup, *ref_dup;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  src_dup = src;
  ref_dup = ref;

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
    ref += ref_stride;

    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_SW_S32(sad);

  sad0_0 = (v8u16)__msa_ldi_h(0);
  sad0_1 = (v8u16)__msa_ldi_h(0);
  sad1_0 = (v8u16)__msa_ldi_h(0);
  sad1_1 = (v8u16)__msa_ldi_h(0);
  sad2_0 = (v8u16)__msa_ldi_h(0);
  sad2_1 = (v8u16)__msa_ldi_h(0);
  sad3_0 = (v8u16)__msa_ldi_h(0);
  sad3_1 = (v8u16)__msa_ldi_h(0);

  for (ht_cnt = 64; ht_cnt--;) {
    LD_UB4(src_dup, 16, src0, src1, src2, src3);
    src_dup += src_stride;
    LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
    ref_dup += ref_stride;

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[4] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[5] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[6] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[7] = HADD_SW_S32(sad);
}

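/* x4d helpers: SAD of the same source block against four independent
 * reference blocks given by aref_ptr[0..3]. */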
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t * const aref_ptr[],
                               int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t * const aref_ptr[],
                               int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t * const aref_ptr[],
                                int32_t ref_stride,
                                int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t * const aref_ptr[],
                                int32_t ref_stride,
                                int32_t height, uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t * const aref_ptr[],
                                int32_t ref_stride,
                                int32_t height, uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad_array[0] = HADD_UH_U32(sad0_0);
  sad_array[0] += HADD_UH_U32(sad0_1);
  sad_array[1] = HADD_UH_U32(sad1_0);
  sad_array[1] += HADD_UH_U32(sad1_1);
  sad_array[2] = HADD_UH_U32(sad2_0);
  sad_array[2] += HADD_UH_U32(sad2_1);
  sad_array[3] = HADD_UH_U32(sad3_0);
  sad_array[3] += HADD_UH_U32(sad3_1);
}

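/* avgsad helpers: SAD of src against avg(ref, sec_pred), where sec_pred
 * is a contiguous width x height block of second-predictor samples. */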
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                src0, src1, ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_SW_S32(sad);
}

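/* Wrappers instantiating the vpx_sad* entry points (declared via
 * vpx_dsp_rtcd.h) for each supported block size. */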
#define VPX_SAD_4xHEIGHT_MSA(height)                                        \
uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,    \
                                 const uint8_t *ref, int32_t ref_stride) {  \
  return sad_4width_msa(src, src_stride,  ref, ref_stride, height);         \
}

#define VPX_SAD_8xHEIGHT_MSA(height)                                        \
uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,    \
                                 const uint8_t *ref, int32_t ref_stride) {  \
  return sad_8width_msa(src, src_stride, ref, ref_stride, height);          \
}

#define VPX_SAD_16xHEIGHT_MSA(height)                                        \
uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,    \
                                  const uint8_t *ref, int32_t ref_stride) {  \
  return sad_16width_msa(src, src_stride, ref, ref_stride, height);          \
}

#define VPX_SAD_32xHEIGHT_MSA(height)                                        \
uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,    \
                                  const uint8_t *ref, int32_t ref_stride) {  \
  return sad_32width_msa(src, src_stride, ref, ref_stride, height);          \
}

#define VPX_SAD_64xHEIGHT_MSA(height)                                        \
uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,    \
                                  const uint8_t *ref, int32_t ref_stride) {  \
  return sad_64width_msa(src, src_stride, ref, ref_stride, height);          \
}

#define VPX_SAD_4xHEIGHTx3_MSA(height)                                  \
void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
                               const uint8_t *ref, int32_t ref_stride,  \
                               uint32_t *sads) {                        \
  sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
}

#define VPX_SAD_8xHEIGHTx3_MSA(height)                                  \
void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
                               const uint8_t *ref, int32_t ref_stride,  \
                               uint32_t *sads) {                        \
  sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
}

#define VPX_SAD_16xHEIGHTx3_MSA(height)                                  \
void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
                                const uint8_t *ref, int32_t ref_stride,  \
                                uint32_t *sads) {                        \
  sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
}

#define VPX_SAD_32xHEIGHTx3_MSA(height)                                  \
void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
                                const uint8_t *ref, int32_t ref_stride,  \
                                uint32_t *sads) {                        \
  sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
}

#define VPX_SAD_64xHEIGHTx3_MSA(height)                                  \
void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
                                const uint8_t *ref, int32_t ref_stride,  \
                                uint32_t *sads) {                        \
  sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
}

#define VPX_SAD_4xHEIGHTx8_MSA(height)                                  \
void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
                               const uint8_t *ref, int32_t ref_stride,  \
                               uint32_t *sads) {                        \
  sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
}

#define VPX_SAD_8xHEIGHTx8_MSA(height)                                  \
void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
                               const uint8_t *ref, int32_t ref_stride,  \
                               uint32_t *sads) {                        \
  sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
}

#define VPX_SAD_16xHEIGHTx8_MSA(height)                                  \
void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
                                const uint8_t *ref, int32_t ref_stride,  \
                                uint32_t *sads) {                        \
  sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
}

#define VPX_SAD_32xHEIGHTx8_MSA(height)                                  \
void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
                                const uint8_t *ref, int32_t ref_stride,  \
                                uint32_t *sads) {                        \
  sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
}

#define VPX_SAD_64xHEIGHTx8_MSA(height)                                  \
void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
                                const uint8_t *ref, int32_t ref_stride,  \
                                uint32_t *sads) {                        \
  sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
}

#define VPX_SAD_4xHEIGHTx4D_MSA(height)                                  \
void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
                                const uint8_t *const refs[],             \
                                int32_t ref_stride, uint32_t *sads) {    \
  sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
}

#define VPX_SAD_8xHEIGHTx4D_MSA(height)                                  \
void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
                                const uint8_t *const refs[],             \
                                int32_t ref_stride, uint32_t *sads) {    \
  sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
}

#define VPX_SAD_16xHEIGHTx4D_MSA(height)                                  \
void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
                                 const uint8_t *const refs[],             \
                                 int32_t ref_stride, uint32_t *sads) {    \
  sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
}

#define VPX_SAD_32xHEIGHTx4D_MSA(height)                                  \
void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
                                 const uint8_t *const refs[],             \
                                 int32_t ref_stride, uint32_t *sads) {    \
  sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
}

#define VPX_SAD_64xHEIGHTx4D_MSA(height)                                  \
void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
                                 const uint8_t *const refs[],             \
                                 int32_t ref_stride, uint32_t *sads) {    \
  sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
}

#define VPX_AVGSAD_4xHEIGHT_MSA(height)                                       \
uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
                                     const uint8_t *ref, int32_t ref_stride,  \
                                     const uint8_t *second_pred) {            \
  return avgsad_4width_msa(src, src_stride, ref, ref_stride,                  \
                           height, second_pred);                              \
}

#define VPX_AVGSAD_8xHEIGHT_MSA(height)                                       \
uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
                                     const uint8_t *ref, int32_t ref_stride,  \
                                     const uint8_t *second_pred) {            \
  return avgsad_8width_msa(src, src_stride, ref, ref_stride,                  \
                           height, second_pred);                              \
}

#define VPX_AVGSAD_16xHEIGHT_MSA(height)                                       \
uint32_t vpx_sad16x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
                                      const uint8_t *ref, int32_t ref_stride,  \
                                      const uint8_t *second_pred) {            \
  return avgsad_16width_msa(src, src_stride, ref, ref_stride,                  \
                            height, second_pred);                              \
}

#define VPX_AVGSAD_32xHEIGHT_MSA(height)                                       \
uint32_t vpx_sad32x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
                                      const uint8_t *ref, int32_t ref_stride,  \
                                      const uint8_t *second_pred) {            \
  return avgsad_32width_msa(src, src_stride, ref, ref_stride,                  \
                            height, second_pred);                              \
}

#define VPX_AVGSAD_64xHEIGHT_MSA(height)                                       \
uint32_t vpx_sad64x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
                                      const uint8_t *ref, int32_t ref_stride,  \
                                      const uint8_t *second_pred) {            \
  return avgsad_64width_msa(src, src_stride, ref, ref_stride,                  \
                            height, second_pred);                              \
}

// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
VPX_SAD_64xHEIGHTx3_MSA(64);
VPX_SAD_64xHEIGHTx8_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);

// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
VPX_SAD_64xHEIGHTx3_MSA(32);
VPX_SAD_64xHEIGHTx8_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);

// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
VPX_SAD_32xHEIGHTx3_MSA(64);
VPX_SAD_32xHEIGHTx8_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);

// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
VPX_SAD_32xHEIGHTx3_MSA(32);
VPX_SAD_32xHEIGHTx8_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);

// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
VPX_SAD_32xHEIGHTx3_MSA(16);
VPX_SAD_32xHEIGHTx8_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);

// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
VPX_SAD_16xHEIGHTx3_MSA(32);
VPX_SAD_16xHEIGHTx8_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);

// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
VPX_SAD_16xHEIGHTx3_MSA(16);
VPX_SAD_16xHEIGHTx8_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);

// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
VPX_SAD_16xHEIGHTx3_MSA(8);
VPX_SAD_16xHEIGHTx8_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);

// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
VPX_SAD_8xHEIGHTx3_MSA(16);
VPX_SAD_8xHEIGHTx8_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);

// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
VPX_SAD_8xHEIGHTx3_MSA(8);
VPX_SAD_8xHEIGHTx8_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);

// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
VPX_SAD_8xHEIGHTx3_MSA(4);
VPX_SAD_8xHEIGHTx8_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);

// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
VPX_SAD_4xHEIGHTx3_MSA(8);
VPX_SAD_4xHEIGHTx8_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);

// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
VPX_SAD_4xHEIGHTx3_MSA(4);
VPX_SAD_4xHEIGHTx8_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);