/* sad_msa.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04 */
1/* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "./vpx_dsp_rtcd.h" 12#include "vpx_dsp/mips/macros_msa.h" 13 14#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) { \ 15 out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \ 16 out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \ 17 out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \ 18 out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \ 19} 20#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__) 21 22static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, 23 const uint8_t *ref_ptr, int32_t ref_stride, 24 int32_t height) { 25 int32_t ht_cnt; 26 uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; 27 v16u8 src = { 0 }; 28 v16u8 ref = { 0 }; 29 v16u8 diff; 30 v8u16 sad = { 0 }; 31 32 for (ht_cnt = (height >> 2); ht_cnt--;) { 33 LW4(src_ptr, src_stride, src0, src1, src2, src3); 34 src_ptr += (4 * src_stride); 35 LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); 36 ref_ptr += (4 * ref_stride); 37 38 INSERT_W4_UB(src0, src1, src2, src3, src); 39 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); 40 41 diff = __msa_asub_u_b(src, ref); 42 sad += __msa_hadd_u_h(diff, diff); 43 } 44 45 return HADD_UH_U32(sad); 46} 47 48static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride, 49 const uint8_t *ref, int32_t ref_stride, 50 int32_t height) { 51 int32_t ht_cnt; 52 v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; 53 v8u16 sad = { 0 }; 54 55 for (ht_cnt = (height >> 2); ht_cnt--;) { 56 LD_UB4(src, src_stride, src0, src1, src2, src3); 57 src += (4 * src_stride); 58 LD_UB4(ref, ref_stride, ref0, ref1, 
ref2, ref3); 59 ref += (4 * ref_stride); 60 61 PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, 62 src0, src1, ref0, ref1); 63 sad += SAD_UB2_UH(src0, src1, ref0, ref1); 64 } 65 66 return HADD_UH_U32(sad); 67} 68 69static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride, 70 const uint8_t *ref, int32_t ref_stride, 71 int32_t height) { 72 int32_t ht_cnt; 73 v16u8 src0, src1, ref0, ref1; 74 v8u16 sad = { 0 }; 75 76 for (ht_cnt = (height >> 2); ht_cnt--;) { 77 LD_UB2(src, src_stride, src0, src1); 78 src += (2 * src_stride); 79 LD_UB2(ref, ref_stride, ref0, ref1); 80 ref += (2 * ref_stride); 81 sad += SAD_UB2_UH(src0, src1, ref0, ref1); 82 83 LD_UB2(src, src_stride, src0, src1); 84 src += (2 * src_stride); 85 LD_UB2(ref, ref_stride, ref0, ref1); 86 ref += (2 * ref_stride); 87 sad += SAD_UB2_UH(src0, src1, ref0, ref1); 88 } 89 90 return HADD_UH_U32(sad); 91} 92 93static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride, 94 const uint8_t *ref, int32_t ref_stride, 95 int32_t height) { 96 int32_t ht_cnt; 97 v16u8 src0, src1, ref0, ref1; 98 v8u16 sad = { 0 }; 99 100 for (ht_cnt = (height >> 2); ht_cnt--;) { 101 LD_UB2(src, 16, src0, src1); 102 src += src_stride; 103 LD_UB2(ref, 16, ref0, ref1); 104 ref += ref_stride; 105 sad += SAD_UB2_UH(src0, src1, ref0, ref1); 106 107 LD_UB2(src, 16, src0, src1); 108 src += src_stride; 109 LD_UB2(ref, 16, ref0, ref1); 110 ref += ref_stride; 111 sad += SAD_UB2_UH(src0, src1, ref0, ref1); 112 113 LD_UB2(src, 16, src0, src1); 114 src += src_stride; 115 LD_UB2(ref, 16, ref0, ref1); 116 ref += ref_stride; 117 sad += SAD_UB2_UH(src0, src1, ref0, ref1); 118 119 LD_UB2(src, 16, src0, src1); 120 src += src_stride; 121 LD_UB2(ref, 16, ref0, ref1); 122 ref += ref_stride; 123 sad += SAD_UB2_UH(src0, src1, ref0, ref1); 124 } 125 126 return HADD_UH_U32(sad); 127} 128 129static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride, 130 const uint8_t *ref, int32_t ref_stride, 131 int32_t height) { 132 
int32_t ht_cnt; 133 uint32_t sad = 0; 134 v16u8 src0, src1, src2, src3; 135 v16u8 ref0, ref1, ref2, ref3; 136 v8u16 sad0 = { 0 }; 137 v8u16 sad1 = { 0 }; 138 139 for (ht_cnt = (height >> 1); ht_cnt--;) { 140 LD_UB4(src, 16, src0, src1, src2, src3); 141 src += src_stride; 142 LD_UB4(ref, 16, ref0, ref1, ref2, ref3); 143 ref += ref_stride; 144 sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); 145 sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); 146 147 LD_UB4(src, 16, src0, src1, src2, src3); 148 src += src_stride; 149 LD_UB4(ref, 16, ref0, ref1, ref2, ref3); 150 ref += ref_stride; 151 sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); 152 sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); 153 } 154 155 sad = HADD_UH_U32(sad0); 156 sad += HADD_UH_U32(sad1); 157 158 return sad; 159} 160 161static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, 162 const uint8_t *ref_ptr, int32_t ref_stride, 163 int32_t height, uint32_t *sad_array) { 164 int32_t ht_cnt; 165 uint32_t src0, src1, src2, src3; 166 v16u8 src = { 0 }; 167 v16u8 ref = { 0 }; 168 v16u8 ref0, ref1, ref2, ref3, diff; 169 v8u16 sad0 = { 0 }; 170 v8u16 sad1 = { 0 }; 171 v8u16 sad2 = { 0 }; 172 173 for (ht_cnt = (height >> 2); ht_cnt--;) { 174 LW4(src_ptr, src_stride, src0, src1, src2, src3); 175 src_ptr += (4 * src_stride); 176 INSERT_W4_UB(src0, src1, src2, src3, src); 177 178 LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); 179 ref_ptr += (4 * ref_stride); 180 SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); 181 diff = __msa_asub_u_b(src, ref); 182 sad0 += __msa_hadd_u_h(diff, diff); 183 184 SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); 185 SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); 186 SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); 187 diff = __msa_asub_u_b(src, ref); 188 sad1 += __msa_hadd_u_h(diff, diff); 189 190 SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); 191 SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); 192 SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); 193 diff = __msa_asub_u_b(src, ref); 
194 sad2 += __msa_hadd_u_h(diff, diff); 195 } 196 197 sad_array[0] = HADD_UH_U32(sad0); 198 sad_array[1] = HADD_UH_U32(sad1); 199 sad_array[2] = HADD_UH_U32(sad2); 200} 201 202static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride, 203 const uint8_t *ref, int32_t ref_stride, 204 int32_t height, uint32_t *sad_array) { 205 int32_t ht_cnt; 206 v16u8 src0, src1, src2, src3; 207 v16u8 ref0, ref1, ref00, ref11, ref22, ref33; 208 v8u16 sad0 = { 0 }; 209 v8u16 sad1 = { 0 }; 210 v8u16 sad2 = { 0 }; 211 212 for (ht_cnt = (height >> 2); ht_cnt--;) { 213 LD_UB4(src, src_stride, src0, src1, src2, src3); 214 src += (4 * src_stride); 215 LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); 216 ref += (4 * ref_stride); 217 PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, 218 src0, src1, ref0, ref1); 219 sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); 220 221 SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); 222 SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); 223 PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); 224 sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); 225 226 SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); 227 SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); 228 PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); 229 sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); 230 } 231 232 sad_array[0] = HADD_UH_U32(sad0); 233 sad_array[1] = HADD_UH_U32(sad1); 234 sad_array[2] = HADD_UH_U32(sad2); 235} 236 237static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, 238 const uint8_t *ref_ptr, int32_t ref_stride, 239 int32_t height, uint32_t *sad_array) { 240 int32_t ht_cnt; 241 v16u8 src, ref, ref0, ref1, diff; 242 v8u16 sad0 = { 0 }; 243 v8u16 sad1 = { 0 }; 244 v8u16 sad2 = { 0 }; 245 246 for (ht_cnt = (height >> 1); ht_cnt--;) { 247 src = LD_UB(src_ptr); 248 src_ptr += src_stride; 249 LD_UB2(ref_ptr, 16, ref0, ref1); 250 ref_ptr += ref_stride; 251 252 diff = __msa_asub_u_b(src, ref0); 253 sad0 += 
__msa_hadd_u_h(diff, diff); 254 255 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); 256 diff = __msa_asub_u_b(src, ref); 257 sad1 += __msa_hadd_u_h(diff, diff); 258 259 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); 260 diff = __msa_asub_u_b(src, ref); 261 sad2 += __msa_hadd_u_h(diff, diff); 262 263 src = LD_UB(src_ptr); 264 src_ptr += src_stride; 265 LD_UB2(ref_ptr, 16, ref0, ref1); 266 ref_ptr += ref_stride; 267 268 diff = __msa_asub_u_b(src, ref0); 269 sad0 += __msa_hadd_u_h(diff, diff); 270 271 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); 272 diff = __msa_asub_u_b(src, ref); 273 sad1 += __msa_hadd_u_h(diff, diff); 274 275 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); 276 diff = __msa_asub_u_b(src, ref); 277 sad2 += __msa_hadd_u_h(diff, diff); 278 } 279 280 sad_array[0] = HADD_UH_U32(sad0); 281 sad_array[1] = HADD_UH_U32(sad1); 282 sad_array[2] = HADD_UH_U32(sad2); 283} 284 285static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride, 286 const uint8_t *ref, int32_t ref_stride, 287 int32_t height, uint32_t *sad_array) { 288 int32_t ht_cnt; 289 v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1; 290 v8u16 sad0 = { 0 }; 291 v8u16 sad1 = { 0 }; 292 v8u16 sad2 = { 0 }; 293 294 for (ht_cnt = height >> 1; ht_cnt--;) { 295 LD_UB2(src, 16, src0, src1); 296 src += src_stride; 297 LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); 298 ref += ref_stride; 299 300 sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); 301 302 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); 303 sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); 304 305 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); 306 sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); 307 308 LD_UB2(src, 16, src0, src1); 309 src += src_stride; 310 LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); 311 ref += ref_stride; 312 313 sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); 314 315 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); 316 sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); 317 
318 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); 319 sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); 320 } 321 322 sad_array[0] = HADD_UH_U32(sad0); 323 sad_array[1] = HADD_UH_U32(sad1); 324 sad_array[2] = HADD_UH_U32(sad2); 325} 326 327static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride, 328 const uint8_t *ref, int32_t ref_stride, 329 int32_t height, uint32_t *sad_array) { 330 int32_t ht_cnt; 331 v16u8 src0, src1, src2, src3; 332 v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3; 333 v8u16 sad0_0 = { 0 }; 334 v8u16 sad0_1 = { 0 }; 335 v8u16 sad1_0 = { 0 }; 336 v8u16 sad1_1 = { 0 }; 337 v8u16 sad2_0 = { 0 }; 338 v8u16 sad2_1 = { 0 }; 339 v4u32 sad; 340 341 for (ht_cnt = height; ht_cnt--;) { 342 LD_UB4(src, 16, src0, src1, src2, src3); 343 src += src_stride; 344 LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3); 345 ref0_4 = LD_UB(ref + 64); 346 ref += ref_stride; 347 348 sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); 349 sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); 350 351 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); 352 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); 353 sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); 354 sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); 355 356 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); 357 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); 358 sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); 359 sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); 360 } 361 362 sad = __msa_hadd_u_w(sad0_0, sad0_0); 363 sad += __msa_hadd_u_w(sad0_1, sad0_1); 364 sad_array[0] = HADD_SW_S32((v4i32)sad); 365 366 sad = __msa_hadd_u_w(sad1_0, sad1_0); 367 sad += __msa_hadd_u_w(sad1_1, sad1_1); 368 sad_array[1] = HADD_SW_S32((v4i32)sad); 369 370 sad = __msa_hadd_u_w(sad2_0, sad2_0); 371 sad += __msa_hadd_u_w(sad2_1, sad2_1); 372 sad_array[2] = HADD_SW_S32((v4i32)sad); 373} 374 375static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, 376 const 
uint8_t *ref_ptr, int32_t ref_stride, 377 int32_t height, uint32_t *sad_array) { 378 int32_t ht_cnt; 379 uint32_t src0, src1, src2, src3; 380 v16u8 ref0, ref1, ref2, ref3, diff; 381 v16u8 src = { 0 }; 382 v16u8 ref = { 0 }; 383 v8u16 sad0 = { 0 }; 384 v8u16 sad1 = { 0 }; 385 v8u16 sad2 = { 0 }; 386 v8u16 sad3 = { 0 }; 387 v8u16 sad4 = { 0 }; 388 v8u16 sad5 = { 0 }; 389 v8u16 sad6 = { 0 }; 390 v8u16 sad7 = { 0 }; 391 392 for (ht_cnt = (height >> 2); ht_cnt--;) { 393 LW4(src_ptr, src_stride, src0, src1, src2, src3); 394 INSERT_W4_UB(src0, src1, src2, src3, src); 395 src_ptr += (4 * src_stride); 396 LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); 397 ref_ptr += (4 * ref_stride); 398 399 SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); 400 diff = __msa_asub_u_b(src, ref); 401 sad0 += __msa_hadd_u_h(diff, diff); 402 403 SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); 404 SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); 405 SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); 406 diff = __msa_asub_u_b(src, ref); 407 sad1 += __msa_hadd_u_h(diff, diff); 408 409 SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); 410 SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); 411 SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); 412 diff = __msa_asub_u_b(src, ref); 413 sad2 += __msa_hadd_u_h(diff, diff); 414 415 SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); 416 SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); 417 SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); 418 diff = __msa_asub_u_b(src, ref); 419 sad3 += __msa_hadd_u_h(diff, diff); 420 421 SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); 422 SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); 423 SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); 424 diff = __msa_asub_u_b(src, ref); 425 sad4 += __msa_hadd_u_h(diff, diff); 426 427 SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); 428 SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); 429 SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); 430 diff = __msa_asub_u_b(src, ref); 431 sad5 += 
__msa_hadd_u_h(diff, diff); 432 433 SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); 434 SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); 435 SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); 436 diff = __msa_asub_u_b(src, ref); 437 sad6 += __msa_hadd_u_h(diff, diff); 438 439 SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); 440 SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); 441 SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); 442 diff = __msa_asub_u_b(src, ref); 443 sad7 += __msa_hadd_u_h(diff, diff); 444 } 445 446 sad_array[0] = HADD_UH_U32(sad0); 447 sad_array[1] = HADD_UH_U32(sad1); 448 sad_array[2] = HADD_UH_U32(sad2); 449 sad_array[3] = HADD_UH_U32(sad3); 450 sad_array[4] = HADD_UH_U32(sad4); 451 sad_array[5] = HADD_UH_U32(sad5); 452 sad_array[6] = HADD_UH_U32(sad6); 453 sad_array[7] = HADD_UH_U32(sad7); 454} 455 456static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride, 457 const uint8_t *ref, int32_t ref_stride, 458 int32_t height, uint32_t *sad_array) { 459 int32_t ht_cnt; 460 v16u8 src0, src1, src2, src3; 461 v16u8 ref0, ref1, ref00, ref11, ref22, ref33; 462 v8u16 sad0 = { 0 }; 463 v8u16 sad1 = { 0 }; 464 v8u16 sad2 = { 0 }; 465 v8u16 sad3 = { 0 }; 466 v8u16 sad4 = { 0 }; 467 v8u16 sad5 = { 0 }; 468 v8u16 sad6 = { 0 }; 469 v8u16 sad7 = { 0 }; 470 471 for (ht_cnt = (height >> 2); ht_cnt--;) { 472 LD_UB4(src, src_stride, src0, src1, src2, src3); 473 src += (4 * src_stride); 474 LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); 475 ref += (4 * ref_stride); 476 PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, 477 src0, src1, ref0, ref1); 478 sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); 479 480 SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); 481 SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); 482 PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); 483 sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); 484 485 SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); 486 SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, 
ref33, 1); 487 PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); 488 sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); 489 490 SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); 491 SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); 492 PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); 493 sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); 494 495 SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); 496 SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); 497 PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); 498 sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); 499 500 SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); 501 SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); 502 PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); 503 sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); 504 505 SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); 506 SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); 507 PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); 508 sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); 509 510 SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); 511 SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); 512 PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); 513 sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); 514 } 515 516 sad_array[0] = HADD_UH_U32(sad0); 517 sad_array[1] = HADD_UH_U32(sad1); 518 sad_array[2] = HADD_UH_U32(sad2); 519 sad_array[3] = HADD_UH_U32(sad3); 520 sad_array[4] = HADD_UH_U32(sad4); 521 sad_array[5] = HADD_UH_U32(sad5); 522 sad_array[6] = HADD_UH_U32(sad6); 523 sad_array[7] = HADD_UH_U32(sad7); 524} 525 526static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, 527 const uint8_t *ref_ptr, int32_t ref_stride, 528 int32_t height, uint32_t *sad_array) { 529 int32_t ht_cnt; 530 v16u8 src, ref0, ref1, ref; 531 v16u8 diff; 532 v8u16 sad0 = { 0 }; 533 v8u16 sad1 = { 0 }; 534 v8u16 sad2 = { 0 }; 535 v8u16 sad3 = { 0 }; 536 v8u16 sad4 = { 0 }; 537 v8u16 sad5 = { 0 }; 538 v8u16 sad6 = { 0 }; 
539 v8u16 sad7 = { 0 }; 540 541 for (ht_cnt = (height >> 1); ht_cnt--;) { 542 src = LD_UB(src_ptr); 543 src_ptr += src_stride; 544 LD_UB2(ref_ptr, 16, ref0, ref1); 545 ref_ptr += ref_stride; 546 547 diff = __msa_asub_u_b(src, ref0); 548 sad0 += __msa_hadd_u_h(diff, diff); 549 550 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); 551 diff = __msa_asub_u_b(src, ref); 552 sad1 += __msa_hadd_u_h(diff, diff); 553 554 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); 555 diff = __msa_asub_u_b(src, ref); 556 sad2 += __msa_hadd_u_h(diff, diff); 557 558 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); 559 diff = __msa_asub_u_b(src, ref); 560 sad3 += __msa_hadd_u_h(diff, diff); 561 562 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); 563 diff = __msa_asub_u_b(src, ref); 564 sad4 += __msa_hadd_u_h(diff, diff); 565 566 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); 567 diff = __msa_asub_u_b(src, ref); 568 sad5 += __msa_hadd_u_h(diff, diff); 569 570 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); 571 diff = __msa_asub_u_b(src, ref); 572 sad6 += __msa_hadd_u_h(diff, diff); 573 574 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); 575 diff = __msa_asub_u_b(src, ref); 576 sad7 += __msa_hadd_u_h(diff, diff); 577 578 src = LD_UB(src_ptr); 579 src_ptr += src_stride; 580 LD_UB2(ref_ptr, 16, ref0, ref1); 581 ref_ptr += ref_stride; 582 583 diff = __msa_asub_u_b(src, ref0); 584 sad0 += __msa_hadd_u_h(diff, diff); 585 586 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); 587 diff = __msa_asub_u_b(src, ref); 588 sad1 += __msa_hadd_u_h(diff, diff); 589 590 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); 591 diff = __msa_asub_u_b(src, ref); 592 sad2 += __msa_hadd_u_h(diff, diff); 593 594 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); 595 diff = __msa_asub_u_b(src, ref); 596 sad3 += __msa_hadd_u_h(diff, diff); 597 598 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); 599 diff = __msa_asub_u_b(src, ref); 600 sad4 += 
__msa_hadd_u_h(diff, diff); 601 602 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); 603 diff = __msa_asub_u_b(src, ref); 604 sad5 += __msa_hadd_u_h(diff, diff); 605 606 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); 607 diff = __msa_asub_u_b(src, ref); 608 sad6 += __msa_hadd_u_h(diff, diff); 609 610 ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); 611 diff = __msa_asub_u_b(src, ref); 612 sad7 += __msa_hadd_u_h(diff, diff); 613 } 614 615 sad_array[0] = HADD_UH_U32(sad0); 616 sad_array[1] = HADD_UH_U32(sad1); 617 sad_array[2] = HADD_UH_U32(sad2); 618 sad_array[3] = HADD_UH_U32(sad3); 619 sad_array[4] = HADD_UH_U32(sad4); 620 sad_array[5] = HADD_UH_U32(sad5); 621 sad_array[6] = HADD_UH_U32(sad6); 622 sad_array[7] = HADD_UH_U32(sad7); 623} 624 625static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride, 626 const uint8_t *ref, int32_t ref_stride, 627 int32_t height, uint32_t *sad_array) { 628 int32_t ht_cnt; 629 v16u8 src0, src1; 630 v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2; 631 v8u16 sad0 = { 0 }; 632 v8u16 sad1 = { 0 }; 633 v8u16 sad2 = { 0 }; 634 v8u16 sad3 = { 0 }; 635 v8u16 sad4 = { 0 }; 636 v8u16 sad5 = { 0 }; 637 v8u16 sad6 = { 0 }; 638 v8u16 sad7 = { 0 }; 639 640 for (ht_cnt = height; ht_cnt--;) { 641 LD_UB2(src, 16, src0, src1); 642 src += src_stride; 643 LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); 644 ref += ref_stride; 645 646 sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); 647 648 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); 649 sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); 650 651 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); 652 sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); 653 654 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); 655 sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); 656 657 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); 658 sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); 659 660 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); 661 sad5 += SAD_UB2_UH(src0, src1, 
ref0, ref1); 662 663 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); 664 sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); 665 666 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); 667 sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); 668 } 669 670 sad_array[0] = HADD_UH_U32(sad0); 671 sad_array[1] = HADD_UH_U32(sad1); 672 sad_array[2] = HADD_UH_U32(sad2); 673 sad_array[3] = HADD_UH_U32(sad3); 674 sad_array[4] = HADD_UH_U32(sad4); 675 sad_array[5] = HADD_UH_U32(sad5); 676 sad_array[6] = HADD_UH_U32(sad6); 677 sad_array[7] = HADD_UH_U32(sad7); 678} 679 680static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride, 681 const uint8_t *ref, int32_t ref_stride, 682 int32_t height, uint32_t *sad_array) { 683 const uint8_t *src_dup, *ref_dup; 684 int32_t ht_cnt; 685 v16u8 src0, src1, src2, src3; 686 v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4; 687 v16u8 ref0, ref1, ref2, ref3; 688 v8u16 sad0_0 = { 0 }; 689 v8u16 sad0_1 = { 0 }; 690 v8u16 sad1_0 = { 0 }; 691 v8u16 sad1_1 = { 0 }; 692 v8u16 sad2_0 = { 0 }; 693 v8u16 sad2_1 = { 0 }; 694 v8u16 sad3_0 = { 0 }; 695 v8u16 sad3_1 = { 0 }; 696 v4u32 sad; 697 698 src_dup = src; 699 ref_dup = ref; 700 701 for (ht_cnt = height; ht_cnt--;) { 702 LD_UB4(src, 16, src0, src1, src2, src3); 703 src += src_stride; 704 LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); 705 ref += ref_stride; 706 707 sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); 708 sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); 709 710 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); 711 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); 712 sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); 713 sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); 714 715 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); 716 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); 717 sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); 718 sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); 719 720 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 
3); 721 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3); 722 sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); 723 sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); 724 } 725 726 sad = __msa_hadd_u_w(sad0_0, sad0_0); 727 sad += __msa_hadd_u_w(sad0_1, sad0_1); 728 sad_array[0] = HADD_SW_S32(sad); 729 730 sad = __msa_hadd_u_w(sad1_0, sad1_0); 731 sad += __msa_hadd_u_w(sad1_1, sad1_1); 732 sad_array[1] = HADD_SW_S32(sad); 733 734 sad = __msa_hadd_u_w(sad2_0, sad2_0); 735 sad += __msa_hadd_u_w(sad2_1, sad2_1); 736 sad_array[2] = HADD_SW_S32(sad); 737 738 sad = __msa_hadd_u_w(sad3_0, sad3_0); 739 sad += __msa_hadd_u_w(sad3_1, sad3_1); 740 sad_array[3] = HADD_SW_S32(sad); 741 742 sad0_0 = (v8u16)__msa_ldi_h(0); 743 sad0_1 = (v8u16)__msa_ldi_h(0); 744 sad1_0 = (v8u16)__msa_ldi_h(0); 745 sad1_1 = (v8u16)__msa_ldi_h(0); 746 sad2_0 = (v8u16)__msa_ldi_h(0); 747 sad2_1 = (v8u16)__msa_ldi_h(0); 748 sad3_0 = (v8u16)__msa_ldi_h(0); 749 sad3_1 = (v8u16)__msa_ldi_h(0); 750 751 for (ht_cnt = 64; ht_cnt--;) { 752 LD_UB4(src_dup, 16, src0, src1, src2, src3); 753 src_dup += src_stride; 754 LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); 755 ref_dup += ref_stride; 756 757 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); 758 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4); 759 sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); 760 sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); 761 762 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); 763 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5); 764 sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); 765 sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); 766 767 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); 768 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6); 769 sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); 770 sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); 771 772 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); 773 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7); 774 
sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); 775 sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); 776 } 777 778 sad = __msa_hadd_u_w(sad0_0, sad0_0); 779 sad += __msa_hadd_u_w(sad0_1, sad0_1); 780 sad_array[4] = HADD_SW_S32(sad); 781 782 sad = __msa_hadd_u_w(sad1_0, sad1_0); 783 sad += __msa_hadd_u_w(sad1_1, sad1_1); 784 sad_array[5] = HADD_SW_S32(sad); 785 786 sad = __msa_hadd_u_w(sad2_0, sad2_0); 787 sad += __msa_hadd_u_w(sad2_1, sad2_1); 788 sad_array[6] = HADD_SW_S32(sad); 789 790 sad = __msa_hadd_u_w(sad3_0, sad3_0); 791 sad += __msa_hadd_u_w(sad3_1, sad3_1); 792 sad_array[7] = HADD_SW_S32(sad); 793} 794 795static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, 796 const uint8_t * const aref_ptr[], 797 int32_t ref_stride, 798 int32_t height, uint32_t *sad_array) { 799 const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; 800 int32_t ht_cnt; 801 uint32_t src0, src1, src2, src3; 802 uint32_t ref0, ref1, ref2, ref3; 803 v16u8 src = { 0 }; 804 v16u8 ref = { 0 }; 805 v16u8 diff; 806 v8u16 sad0 = { 0 }; 807 v8u16 sad1 = { 0 }; 808 v8u16 sad2 = { 0 }; 809 v8u16 sad3 = { 0 }; 810 811 ref0_ptr = aref_ptr[0]; 812 ref1_ptr = aref_ptr[1]; 813 ref2_ptr = aref_ptr[2]; 814 ref3_ptr = aref_ptr[3]; 815 816 for (ht_cnt = (height >> 2); ht_cnt--;) { 817 LW4(src_ptr, src_stride, src0, src1, src2, src3); 818 INSERT_W4_UB(src0, src1, src2, src3, src); 819 src_ptr += (4 * src_stride); 820 821 LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); 822 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); 823 ref0_ptr += (4 * ref_stride); 824 825 diff = __msa_asub_u_b(src, ref); 826 sad0 += __msa_hadd_u_h(diff, diff); 827 828 LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3); 829 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); 830 ref1_ptr += (4 * ref_stride); 831 832 diff = __msa_asub_u_b(src, ref); 833 sad1 += __msa_hadd_u_h(diff, diff); 834 835 LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3); 836 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); 837 ref2_ptr += (4 * ref_stride); 838 
839 diff = __msa_asub_u_b(src, ref); 840 sad2 += __msa_hadd_u_h(diff, diff); 841 842 LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3); 843 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); 844 ref3_ptr += (4 * ref_stride); 845 846 diff = __msa_asub_u_b(src, ref); 847 sad3 += __msa_hadd_u_h(diff, diff); 848 } 849 850 sad_array[0] = HADD_UH_U32(sad0); 851 sad_array[1] = HADD_UH_U32(sad1); 852 sad_array[2] = HADD_UH_U32(sad2); 853 sad_array[3] = HADD_UH_U32(sad3); 854} 855 856static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, 857 const uint8_t * const aref_ptr[], 858 int32_t ref_stride, 859 int32_t height, uint32_t *sad_array) { 860 int32_t ht_cnt; 861 const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; 862 v16u8 src0, src1, src2, src3; 863 v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; 864 v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; 865 v8u16 sad0 = { 0 }; 866 v8u16 sad1 = { 0 }; 867 v8u16 sad2 = { 0 }; 868 v8u16 sad3 = { 0 }; 869 870 ref0_ptr = aref_ptr[0]; 871 ref1_ptr = aref_ptr[1]; 872 ref2_ptr = aref_ptr[2]; 873 ref3_ptr = aref_ptr[3]; 874 875 for (ht_cnt = (height >> 2); ht_cnt--;) { 876 LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); 877 src_ptr += (4 * src_stride); 878 LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); 879 ref0_ptr += (4 * ref_stride); 880 LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7); 881 ref1_ptr += (4 * ref_stride); 882 LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11); 883 ref2_ptr += (4 * ref_stride); 884 LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15); 885 ref3_ptr += (4 * ref_stride); 886 887 PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); 888 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); 889 sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); 890 891 PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1); 892 sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); 893 894 PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1); 895 sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); 896 897 
    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

/* SAD of one 16-wide source block against 4 candidate reference blocks
 * ("x4d").  aref_ptr[] holds the 4 reference base pointers; one SAD per
 * reference is written to sad_array[0..3].  Two rows are processed per
 * loop iteration (hence height >> 1); per-reference byte absolute
 * differences are horizontally widened into v8u16 accumulators and
 * reduced to 32 bits only at the end. */
static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t * const aref_ptr[],
                                int32_t ref_stride,
                                int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    /* Row 0 of the pair: one 16-byte vector load per pointer. */
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    /* Row 1 of the pair: identical sequence, manually unrolled. */
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

/* x4d SAD for 32-wide blocks: one row per iteration, loaded as two
 * 16-byte vectors per pointer; SAD_UB2_UH combines both halves into the
 * per-reference v8u16 accumulator. */
static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t * const aref_ptr[],
                                int32_t ref_stride,
                                int32_t height, uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    /* ref0/ref1 vectors are reused for each of the 4 references. */
    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

/* x4d SAD for 64-wide blocks: one row per iteration, loaded as four
 * 16-byte vectors.  Two v8u16 accumulators per reference (sadN_0 for the
 * left 32 bytes, sadN_1 for the right 32) keep the 16-bit partial sums
 * from overflowing before the final 32-bit reduction. */
static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t * const aref_ptr[],
                                int32_t ref_stride,
                                int32_t height, uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad_array[0] = HADD_UH_U32(sad0_0);
  sad_array[0] += HADD_UH_U32(sad0_1);
  sad_array[1] = HADD_UH_U32(sad1_0);
  sad_array[1] += HADD_UH_U32(sad1_1);
  sad_array[2] = HADD_UH_U32(sad2_0);
  sad_array[2] += HADD_UH_U32(sad2_1);
  sad_array[3] = HADD_UH_U32(sad3_0);
  sad_array[3] += HADD_UH_U32(sad3_1);
}

/* Compound-prediction SAD for 4-wide blocks: the reference is first
 * averaged with sec_pred (the second predictor), then the absolute
 * difference against the source is accumulated.  Four 4-byte rows are
 * packed into one 16-byte vector per iteration; sec_pred is assumed to
 * be a contiguous 4-wide prediction buffer (advanced 16 bytes / 4 rows
 * per iteration). */
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);  /* rounded average, per byte */
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

/* Compound-prediction SAD for 8-wide blocks: 4 rows per iteration, with
 * pairs of 8-byte rows packed into full 16-byte vectors via PCKEV_D4_UB
 * before averaging against the 32 bytes of sec_pred. */
static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                src0, src1, ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}

/* Compound-prediction SAD for 16-wide blocks: 8 rows per iteration
 * (height >> 3), as two unrolled 4-row groups of load / average / SAD. */
static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    /* Second 4-row group, manually unrolled. */
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

/* Compound-prediction SAD for 32-wide blocks: 4 rows per iteration.
 * Even-numbered vectors hold the left 16 bytes of each row and odd ones
 * the right 16, matching the interleaved load pattern below. */
static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

/* Compound-prediction SAD for 64-wide blocks: 4 rows per iteration, one
 * row per unrolled group (four 16-byte loads each).  Two v8u16
 * accumulators avoid 16-bit overflow; they are widened to v4u32 before
 * the final scalar reduction. */
static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_SW_S32(sad);
}

/* The macros below expand to the public vpx_dsp entry points declared in
 * vpx_dsp_rtcd.h; each simply forwards to the width-specialized static
 * kernel above with the block height baked in. */

#define VPX_SAD_4xHEIGHT_MSA(height) \
uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride) { \
  return sad_4width_msa(src, src_stride, ref, ref_stride, height); \
}

#define VPX_SAD_8xHEIGHT_MSA(height) \
uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride) { \
  return sad_8width_msa(src, src_stride, ref, ref_stride, height); \
}

#define VPX_SAD_16xHEIGHT_MSA(height) \
uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride) { \
  return sad_16width_msa(src, src_stride, ref, ref_stride, height); \
}

#define VPX_SAD_32xHEIGHT_MSA(height) \
uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride) { \
  return sad_32width_msa(src, src_stride, ref, ref_stride, height); \
}

#define VPX_SAD_64xHEIGHT_MSA(height) \
uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride) { \
  return sad_64width_msa(src, src_stride, ref, ref_stride, height); \
}

/* x3 variants: SAD against 3 references at consecutive sub-pel offsets. */
#define VPX_SAD_4xHEIGHTx3_MSA(height) \
void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                               const uint8_t *ref, int32_t ref_stride, \
                               uint32_t *sads) { \
  sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_8xHEIGHTx3_MSA(height) \
void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                               const uint8_t *ref, int32_t ref_stride, \
                               uint32_t *sads) { \
  sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_16xHEIGHTx3_MSA(height) \
void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *ref, int32_t ref_stride, \
                                uint32_t *sads) { \
  sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_32xHEIGHTx3_MSA(height) \
void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *ref, int32_t ref_stride, \
                                uint32_t *sads) { \
  sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_64xHEIGHTx3_MSA(height) \
void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *ref, int32_t ref_stride, \
                                uint32_t *sads) { \
  sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
}

/* x8 variants: SAD against 8 references at consecutive offsets. */
#define VPX_SAD_4xHEIGHTx8_MSA(height) \
void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                               const uint8_t *ref, int32_t ref_stride, \
                               uint32_t *sads) { \
  sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_8xHEIGHTx8_MSA(height) \
void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                               const uint8_t *ref, int32_t ref_stride, \
                               uint32_t *sads) { \
  sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_16xHEIGHTx8_MSA(height) \
void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *ref, int32_t ref_stride, \
                                uint32_t *sads) { \
  sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_32xHEIGHTx8_MSA(height) \
void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *ref, int32_t ref_stride, \
                                uint32_t *sads) { \
  sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
}

#define VPX_SAD_64xHEIGHTx8_MSA(height) \
void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *ref, int32_t ref_stride, \
                                uint32_t *sads) { \
  sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
}

/* x4d variants: SAD against 4 independent reference pointers. */
#define VPX_SAD_4xHEIGHTx4D_MSA(height) \
void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *const refs[], \
                                int32_t ref_stride, uint32_t *sads) { \
  sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
}

#define VPX_SAD_8xHEIGHTx4D_MSA(height) \
void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                const uint8_t *const refs[], \
                                int32_t ref_stride, uint32_t *sads) { \
  sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
}

#define VPX_SAD_16xHEIGHTx4D_MSA(height) \
void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *const refs[], \
                                 int32_t ref_stride, uint32_t *sads) { \
  sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
}

#define VPX_SAD_32xHEIGHTx4D_MSA(height) \
void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *const refs[], \
                                 int32_t ref_stride, uint32_t *sads) { \
  sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
}

#define VPX_SAD_64xHEIGHTx4D_MSA(height) \
void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *const refs[], \
                                 int32_t ref_stride, uint32_t *sads) { \
  sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
}

/* _avg variants: SAD against the average of ref and second_pred. */
#define VPX_AVGSAD_4xHEIGHT_MSA(height) \
uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                     const uint8_t *ref, int32_t ref_stride, \
                                     const uint8_t *second_pred) { \
  return avgsad_4width_msa(src, src_stride, ref, ref_stride, \
                           height, second_pred); \
}

#define VPX_AVGSAD_8xHEIGHT_MSA(height) \
uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                     const uint8_t *ref, int32_t ref_stride, \
                                     const uint8_t *second_pred) { \
  return avgsad_8width_msa(src, src_stride, ref, ref_stride, \
                           height, second_pred); \
}

#define VPX_AVGSAD_16xHEIGHT_MSA(height) \
uint32_t vpx_sad16x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                      const uint8_t *ref, int32_t ref_stride, \
                                      const uint8_t *second_pred) { \
  return avgsad_16width_msa(src, src_stride, ref, ref_stride, \
                            height, second_pred); \
}

#define VPX_AVGSAD_32xHEIGHT_MSA(height) \
uint32_t vpx_sad32x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                      const uint8_t *ref, int32_t ref_stride, \
                                      const uint8_t *second_pred) { \
  return avgsad_32width_msa(src, src_stride, ref, ref_stride, \
                            height, second_pred); \
}

#define VPX_AVGSAD_64xHEIGHT_MSA(height) \
uint32_t vpx_sad64x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                      const uint8_t *ref, int32_t ref_stride, \
                                      const uint8_t *second_pred) { \
  return avgsad_64width_msa(src, src_stride, ref, ref_stride, \
                            height, second_pred); \
}

/* Instantiate the public entry points for every block size. */

// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
VPX_SAD_64xHEIGHTx3_MSA(64);
VPX_SAD_64xHEIGHTx8_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);

// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
VPX_SAD_64xHEIGHTx3_MSA(32);
VPX_SAD_64xHEIGHTx8_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);

// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
VPX_SAD_32xHEIGHTx3_MSA(64);
VPX_SAD_32xHEIGHTx8_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);

// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
VPX_SAD_32xHEIGHTx3_MSA(32);
VPX_SAD_32xHEIGHTx8_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);

// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
VPX_SAD_32xHEIGHTx3_MSA(16);
VPX_SAD_32xHEIGHTx8_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);

// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
VPX_SAD_16xHEIGHTx3_MSA(32);
VPX_SAD_16xHEIGHTx8_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);

// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
VPX_SAD_16xHEIGHTx3_MSA(16);
VPX_SAD_16xHEIGHTx8_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);

// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
VPX_SAD_16xHEIGHTx3_MSA(8);
VPX_SAD_16xHEIGHTx8_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);

// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
VPX_SAD_8xHEIGHTx3_MSA(16);
VPX_SAD_8xHEIGHTx8_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);

// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
VPX_SAD_8xHEIGHTx3_MSA(8);
VPX_SAD_8xHEIGHTx8_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);

// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
VPX_SAD_8xHEIGHTx3_MSA(4);
VPX_SAD_8xHEIGHTx8_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);

// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
VPX_SAD_4xHEIGHTx3_MSA(8);
VPX_SAD_4xHEIGHTx8_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);

// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
VPX_SAD_4xHEIGHTx3_MSA(4);
VPX_SAD_4xHEIGHTx8_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);