/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
  {                                                        \
    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
  }
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)

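/* SAD for a 4-pixel-wide block: four 4-byte rows of src and ref are packed
 * into one vector each, absolute byte differences are taken and horizontally
 * accumulated into a v8u16 running sum. */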
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

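/* SAD for an 8-pixel-wide block: pairs of 8-byte rows are packed into single
 * 16-byte vectors (PCKEV) before the SAD accumulation. */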
static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

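/* SAD for a 16-pixel-wide block, four rows per loop iteration. */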
static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

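/* SAD for a 32-pixel-wide block: each row is loaded as two 16-byte vectors,
 * four rows per loop iteration. */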
static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

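/* SAD for a 64-pixel-wide block: each row is four 16-byte vectors; two
 * separate 16-bit accumulators are kept and reduced to 32 bits at the end. */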
static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}

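/* x3 variants: the SAD is computed against the reference at horizontal
 * offsets 0, +1 and +2 pixels; the shifted references are produced with
 * byte-wise SLDI element shifts. */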
static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    INSERT_W4_UB(src0, src1, src2, src3, src);

    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref, ref0, ref1, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

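/* x8 variants: as the x3 versions above, but covering horizontal offsets 0
 * through +7 and writing eight SAD values per call. */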
static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, diff;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref0, ref1, ref;
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

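/* x4d variants: the source block is compared against four independent
 * reference blocks (aref_ptr[0..3]), producing four SAD values per call. */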
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

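/* The 64-wide x4d case keeps two 16-bit accumulators per reference and
 * widens them to 32 bits before the final reduction. */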
static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_UW_U32(sad);
}

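/* avgsad variants: the reference is first averaged with the second predictor
 * (__msa_aver_u_b) and the SAD is taken against that compound prediction, as
 * required by the vpx_sad..._avg interface. */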
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_SW_S32(sad);
}

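/* The macros below generate the exported vpx_sad<width>x<height>_msa,
 * ..._x3, ..._x8, ..._x4d and ..._avg entry points for each block size by
 * delegating to the width-specific helpers above. */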
#define VPX_SAD_4xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_4width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_8xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_8width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_16xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_32xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_64xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_4xHEIGHTx3_MSA(height)                                   \
  void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx3_MSA(height)                                   \
  void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx3_MSA(height)                                   \
  void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx8_MSA(height)                                   \
  void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx8_MSA(height)                                   \
  void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx8_MSA(height)                                   \
  void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_8xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_16xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_32xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_64xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_AVGSAD_4xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_8xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_16xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad16x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_32xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad32x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_64xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad64x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

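/* Instantiate the exported functions for each supported block size. */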
// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);

// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);

// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);

// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);

// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);

// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);

// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
VPX_SAD_16xHEIGHTx3_MSA(16);
VPX_SAD_16xHEIGHTx8_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);

// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
VPX_SAD_16xHEIGHTx3_MSA(8);
VPX_SAD_16xHEIGHTx8_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);

// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
VPX_SAD_8xHEIGHTx3_MSA(16);
VPX_SAD_8xHEIGHTx8_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);

// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
VPX_SAD_8xHEIGHTx3_MSA(8);
VPX_SAD_8xHEIGHTx8_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);

// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);

// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);

// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
VPX_SAD_4xHEIGHTx3_MSA(4);
VPX_SAD_4xHEIGHTx8_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);