/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

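// Insert word 0 of each of the four inputs into word lanes 0..3 of 'out',
// packing four 4-byte rows into a single vector.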
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
  {                                                        \
    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
  }
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)

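// Basic SAD kernels: accumulate |src - ref| over a width x height block,
// several rows (or 16-byte lanes) per loop iteration.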
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}

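// x3 kernels: per row, the SAD is evaluated against the reference at
// horizontal byte offsets 0, 1 and 2, keeping three running sums.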
static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    INSERT_W4_UB(src0, src1, src2, src3, src);

    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref, ref0, ref1, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = height >> 1; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v4u32 sad;

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
    ref0_4 = LD_UB(ref + 64);
    ref += ref_stride;

    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_SW_S32((v4i32)sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_SW_S32((v4i32)sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_SW_S32((v4i32)sad);
}

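// x8 kernels: as above, but the reference is evaluated at eight consecutive
// horizontal byte offsets (0..7).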
static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, diff;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref0, ref1, ref;
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1;
  v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  const uint8_t *src_dup, *ref_dup;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  src_dup = src;
  ref_dup = ref;

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
    ref += ref_stride;

    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_SW_S32(sad);

  sad0_0 = (v8u16)__msa_ldi_h(0);
  sad0_1 = (v8u16)__msa_ldi_h(0);
  sad1_0 = (v8u16)__msa_ldi_h(0);
  sad1_1 = (v8u16)__msa_ldi_h(0);
  sad2_0 = (v8u16)__msa_ldi_h(0);
  sad2_1 = (v8u16)__msa_ldi_h(0);
  sad3_0 = (v8u16)__msa_ldi_h(0);
  sad3_1 = (v8u16)__msa_ldi_h(0);

  for (ht_cnt = 64; ht_cnt--;) {
    LD_UB4(src_dup, 16, src0, src1, src2, src3);
    src_dup += src_stride;
    LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
    ref_dup += ref_stride;

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[4] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[5] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[6] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[7] = HADD_SW_S32(sad);
}

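// x4d kernels: SAD of one source block against four independent reference
// blocks (one pointer per candidate) in a single pass.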
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_UW_U32(sad);
}

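// avg-SAD kernels: the reference is first averaged with a second predictor
// (compound prediction) and the SAD is then taken against the source.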
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_SW_S32(sad);
}

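// The macros below expand into the public vpx_sadWxH, vpx_sadWxHx3/x8/x4d and
// vpx_sadWxH_avg entry points for each supported block size.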
#define VPX_SAD_4xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_4width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_8xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_8width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_16xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_32xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_64xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_4xHEIGHTx3_MSA(height)                                   \
  void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx3_MSA(height)                                   \
  void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx3_MSA(height)                                   \
  void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_32xHEIGHTx3_MSA(height)                                   \
  void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_64xHEIGHTx3_MSA(height)                                   \
  void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx8_MSA(height)                                   \
  void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx8_MSA(height)                                   \
  void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx8_MSA(height)                                   \
  void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_32xHEIGHTx8_MSA(height)                                   \
  void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_64xHEIGHTx8_MSA(height)                                   \
  void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_8xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_16xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_32xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_64xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_AVGSAD_4xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_8xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_16xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad16x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_32xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad32x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_64xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad64x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
VPX_SAD_64xHEIGHTx3_MSA(64);
VPX_SAD_64xHEIGHTx8_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);

// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
VPX_SAD_64xHEIGHTx3_MSA(32);
VPX_SAD_64xHEIGHTx8_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);

// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
VPX_SAD_32xHEIGHTx3_MSA(64);
VPX_SAD_32xHEIGHTx8_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);

// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
VPX_SAD_32xHEIGHTx3_MSA(32);
VPX_SAD_32xHEIGHTx8_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);

// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
VPX_SAD_32xHEIGHTx3_MSA(16);
VPX_SAD_32xHEIGHTx8_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);

// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
VPX_SAD_16xHEIGHTx3_MSA(32);
VPX_SAD_16xHEIGHTx8_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);

// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
VPX_SAD_16xHEIGHTx3_MSA(16);
VPX_SAD_16xHEIGHTx8_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);

// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
VPX_SAD_16xHEIGHTx3_MSA(8);
VPX_SAD_16xHEIGHTx8_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);

// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
VPX_SAD_8xHEIGHTx3_MSA(16);
VPX_SAD_8xHEIGHTx8_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);

// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
VPX_SAD_8xHEIGHTx3_MSA(8);
VPX_SAD_8xHEIGHTx8_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);

// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
VPX_SAD_8xHEIGHTx3_MSA(4);
VPX_SAD_8xHEIGHTx8_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);

// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
VPX_SAD_4xHEIGHTx3_MSA(8);
VPX_SAD_4xHEIGHTx8_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);

// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
VPX_SAD_4xHEIGHTx3_MSA(4);
VPX_SAD_4xHEIGHTx8_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);