/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
  {                                                        \
    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
  }
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)
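/* The helpers below are MSA-vectorized sum-of-absolute-differences (SAD)
 * kernels, one per block width. For reference, this is the scalar operation
 * they all implement; a minimal sketch, not part of the original file, using
 * the hypothetical name sad_ref_c:
 */
#if 0
static uint32_t sad_ref_c(const uint8_t *src, int32_t src_stride,
                          const uint8_t *ref, int32_t ref_stride,
                          int32_t width, int32_t height) {
  uint32_t sad = 0;
  int32_t row, col;

  for (row = 0; row < height; ++row) {
    for (col = 0; col < width; ++col) {
      /* Absolute difference of one pixel pair. */
      sad += (src[col] > ref[col]) ? (src[col] - ref[col])
                                   : (ref[col] - src[col]);
    }
    src += src_stride;
    ref += ref_stride;
  }

  return sad;
}
#endif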
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}
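/* The _x3 helpers below compute the SAD at three consecutive horizontal
 * reference offsets (ref + 0, ref + 1, ref + 2) in one pass: each reference
 * row is loaded once (plus one extra vector for the right-hand overhang) and
 * realigned with byte shifts (sldi) instead of being reloaded from memory.
 * A scalar sketch of the contract, reusing the hypothetical sad_ref_c()
 * above:
 */
#if 0
static void sad_x3_ref_c(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         int32_t width, int32_t height, uint32_t sads[3]) {
  int32_t offset;

  for (offset = 0; offset < 3; ++offset) {
    /* One full-block SAD per horizontal offset of the reference. */
    sads[offset] =
        sad_ref_c(src, src_stride, ref + offset, ref_stride, width, height);
  }
}
#endif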
static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    INSERT_W4_UB(src0, src1, src2, src3, src);

    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref, ref0, ref1, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = height >> 1; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v4u32 sad;

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
    ref0_4 = LD_UB(ref + 64);
    ref += ref_stride;

    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_SW_S32((v4i32)sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_SW_S32((v4i32)sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_SW_S32((v4i32)sad);
}
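/* The _x8 helpers extend the same one-load, many-shifts scheme to eight
 * consecutive horizontal offsets, i.e. the contract is
 * sads[i] = SAD(src, ref + i) for i in [0, 8); compare the sad_x3_ref_c()
 * sketch above with the loop bound changed to 8. */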
static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, diff;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}
static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref0, ref1, ref;
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}
static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1;
  v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}
static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  const uint8_t *src_dup, *ref_dup;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  src_dup = src;
  ref_dup = ref;

  /* First pass: offsets 0..3. */
  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
    ref += ref_stride;

    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_SW_S32(sad);

  sad0_0 = (v8u16)__msa_ldi_h(0);
  sad0_1 = (v8u16)__msa_ldi_h(0);
  sad1_0 = (v8u16)__msa_ldi_h(0);
  sad1_1 = (v8u16)__msa_ldi_h(0);
  sad2_0 = (v8u16)__msa_ldi_h(0);
  sad2_1 = (v8u16)__msa_ldi_h(0);
  sad3_0 = (v8u16)__msa_ldi_h(0);
  sad3_1 = (v8u16)__msa_ldi_h(0);

  /* Second pass over the same block for the remaining offsets 4..7; the
     loop bound is the block height, not a fixed 64, so the 64x32 variant
     stays within the block. */
  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src_dup, 16, src0, src1, src2, src3);
    src_dup += src_stride;
    LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
    ref_dup += ref_stride;

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[4] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[5] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[6] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[7] = HADD_SW_S32(sad);
}
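/* The _x4d helpers evaluate one source block against four independent
 * reference pointers (typically four motion-vector candidates) that share a
 * common stride: sads[i] = SAD(src, aref_ptr[i]) for i in [0, 4). A scalar
 * sketch of the contract, reusing the hypothetical sad_ref_c() above:
 */
#if 0
static void sad_x4d_ref_c(const uint8_t *src, int32_t src_stride,
                          const uint8_t *const refs[4], int32_t ref_stride,
                          int32_t width, int32_t height, uint32_t sads[4]) {
  int32_t i;

  for (i = 0; i < 4; ++i) {
    /* One full-block SAD per candidate reference. */
    sads[i] = sad_ref_c(src, src_stride, refs[i], ref_stride, width, height);
  }
}
#endif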
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}
static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}
static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_UW_U32(sad);
}
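/* The avgsad_* helpers compute the SAD against the rounded average of the
 * reference block and a second predictor (compound prediction); __msa_aver_u_b
 * averages as (a + b + 1) >> 1. A minimal scalar sketch, not part of the
 * original file (hypothetical name avgsad_ref_c; sec_pred is packed with
 * stride == width, as the pointer advances in the vector code suggest):
 */
#if 0
static uint32_t avgsad_ref_c(const uint8_t *src, int32_t src_stride,
                             const uint8_t *ref, int32_t ref_stride,
                             int32_t width, int32_t height,
                             const uint8_t *sec_pred) {
  uint32_t sad = 0;
  int32_t row, col;

  for (row = 0; row < height; ++row) {
    for (col = 0; col < width; ++col) {
      /* Rounded average of reference and second predictor, then |diff|. */
      const uint8_t avg = (uint8_t)((ref[col] + sec_pred[col] + 1) >> 1);
      sad += (src[col] > avg) ? (src[col] - avg) : (avg - src[col]);
    }
    src += src_stride;
    ref += ref_stride;
    sec_pred += width;
  }

  return sad;
}
#endif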
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}
static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_SW_S32(sad);
}
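/* The macros below stamp out the exported vpx_sad<W>x<H>*_msa entry points
 * selected through the run-time CPU dispatch header (vpx_dsp_rtcd.h) on
 * MSA-capable targets; each wrapper simply forwards to the fixed-width
 * helper above with the block height baked in. */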
#define VPX_SAD_4xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_4width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_8xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_8width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_16xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_32xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_64xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_4xHEIGHTx3_MSA(height)                                   \
  void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx3_MSA(height)                                   \
  void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx3_MSA(height)                                   \
  void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_32xHEIGHTx3_MSA(height)                                   \
  void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_64xHEIGHTx3_MSA(height)                                   \
  void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx8_MSA(height)                                   \
  void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx8_MSA(height)                                   \
  void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx8_MSA(height)                                   \
  void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_32xHEIGHTx8_MSA(height)                                   \
  void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }
#define VPX_SAD_64xHEIGHTx8_MSA(height)                                   \
  void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_8xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_16xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_32xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_64xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_AVGSAD_4xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_8xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_16xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad16x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_32xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad32x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_64xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad64x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }
// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
VPX_SAD_64xHEIGHTx3_MSA(64);
VPX_SAD_64xHEIGHTx8_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);

// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
VPX_SAD_64xHEIGHTx3_MSA(32);
VPX_SAD_64xHEIGHTx8_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);

// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
VPX_SAD_32xHEIGHTx3_MSA(64);
VPX_SAD_32xHEIGHTx8_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);

// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
VPX_SAD_32xHEIGHTx3_MSA(32);
VPX_SAD_32xHEIGHTx8_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);

// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
VPX_SAD_32xHEIGHTx3_MSA(16);
VPX_SAD_32xHEIGHTx8_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);

// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
VPX_SAD_16xHEIGHTx3_MSA(32);
VPX_SAD_16xHEIGHTx8_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);

// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
VPX_SAD_16xHEIGHTx3_MSA(16);
VPX_SAD_16xHEIGHTx8_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);

// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
VPX_SAD_16xHEIGHTx3_MSA(8);
VPX_SAD_16xHEIGHTx8_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);

// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
VPX_SAD_8xHEIGHTx3_MSA(16);
VPX_SAD_8xHEIGHTx8_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);

// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
VPX_SAD_8xHEIGHTx3_MSA(8);
VPX_SAD_8xHEIGHTx8_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);

// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
VPX_SAD_8xHEIGHTx3_MSA(4);
VPX_SAD_8xHEIGHTx8_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);

// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
VPX_SAD_4xHEIGHTx3_MSA(8);
VPX_SAD_4xHEIGHTx8_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);

// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
VPX_SAD_4xHEIGHTx3_MSA(4);
VPX_SAD_4xHEIGHTx8_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);
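/* A hypothetical direct call of one generated entry point, for illustration
 * only (the wrapper name comes from the instantiations above; buffer layout
 * and strides are assumptions):
 */
#if 0
static uint32_t example_16x16(const uint8_t *frame_src, int32_t src_stride,
                              const uint8_t *frame_ref, int32_t ref_stride) {
  /* SAD of the 16x16 block at the top-left corner of each buffer. */
  return vpx_sad16x16_msa(frame_src, src_stride, frame_ref, ref_stride);
}
#endif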