/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/mips/macros_msa.h"
#include "vpx_dsp/variance.h"

static const uint8_t bilinear_filters_msa[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};

#define CALC_MSE_AVG_B(src, ref, var, sub) \
  { \
    v16u8 src_l0_m, src_l1_m; \
    v8i16 res_l0_m, res_l1_m; \
    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
    \
    sub += res_l0_m + res_l1_m; \
  }

#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  sse - (((int64_t)diff * diff) >> shift)

static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred, int32_t height,
                                        int32_t *diff) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 pred, src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred, int32_t height,
                                        int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref, pred;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);

    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);

  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v8i16 avg2 = { 0 };
  v8i16 avg3 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 32; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);

    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    CALC_MSE_AVG_B(src0, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 filt0, out, ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, dst0, var, avg);
    CALC_MSE_AVG_B(src1, dst1, var, avg);
    CALC_MSE_AVG_B(src2, dst2, var, avg);
    CALC_MSE_AVG_B(src3, dst3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

static uint32_t sub_pixel_sse_diff_4width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4, out;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 ref = { 0 };
  v16u8 src2110, src4332;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    src0 = src4;

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

static uint32_t sub_pixel_sse_diff_4width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out, ref = { 0 };
  v16u8 filt_vt, filt_hz, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt_vt, filt_hz, vec0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    CALC_MSE_AVG_B(src2, ref2, var, avg);
    CALC_MSE_AVG_B(src3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert, height,
                                             &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert, height,
                                             &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 out, pred, filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 out, pred, filt0;
  v16u8 ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t subpel_avg_ssediff_16w_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3;
  v16u8 tmp0, tmp1, tmp2, tmp3;
  v16u8 pred0, pred1, pred2, pred3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
                tmp2, tmp3);
    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
                tmp2, tmp3);

    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
                                      sec_pred, filter, height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 out, pred, ref = { 0 };
  v16u8 src2110, src4332, filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);

    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, filt0;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t subpel_avg_ssediff_16w_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    src0 = src4;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
                out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
                                      sec_pred, filter, height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 out, pred, ref = { 0 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 pred0, pred1, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t subpel_avg_ssediff_16w_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 out0, out1, out2, out3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
                out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                       sec_pred, filter_horiz, filter_vert,
                                       height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);

#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
  uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa( \
      const uint8_t *src, int32_t src_stride, int32_t xoffset, \
      int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \
      uint32_t *sse) { \
    int32_t diff; \
    uint32_t var; \
    const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
    const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
    \
    if (yoffset) { \
      if (xoffset) { \
        *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \
            src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
      } else { \
        *sse = sub_pixel_sse_diff_##wd##width_v_msa( \
            src, src_stride, ref, ref_stride, v_filter, ht, &diff); \
      } \
      \
      var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
    } else { \
      if (xoffset) { \
        *sse = sub_pixel_sse_diff_##wd##width_h_msa( \
            src, src_stride, ref, ref_stride, h_filter, ht, &diff); \
        \
        var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
      } else { \
        var = vpx_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
                                            sse); \
      } \
    } \
    \
    return var; \
  }

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);

#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \
  uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \
      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \
      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
      uint32_t *sse, const uint8_t *sec_pred) { \
    int32_t diff; \
    const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
    const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
    \
    if (yoffset) { \
      if (xoffset) { \
        *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
            v_filter, ht, &diff); \
      } else { \
        *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff); \
      } \
    } else { \
      if (xoffset) { \
        *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff); \
      } else { \
        *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \
                                            ref_stride, sec_pred, ht, &diff); \
      } \
    } \
    \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
  }

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);

uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
                                             int32_t src_stride,
                                             int32_t xoffset, int32_t yoffset,
                                             const uint8_t *ref_ptr,
                                             int32_t ref_stride, uint32_t *sse,
                                             const uint8_t *sec_pred) {
  int32_t diff;
  const uint8_t *h_filter = bilinear_filters_msa[xoffset];
  const uint8_t *v_filter = bilinear_filters_msa[yoffset];

  if (yoffset) {
    if (xoffset) {
      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
          v_filter, 64, &diff);
    } else {
      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  v_filter, 64, &diff);
    }
  } else {
    if (xoffset) {
      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  h_filter, 64, &diff);
    } else {
      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
                                    sec_pred, &diff);
    }
  }

  return VARIANCE_32Wx64H(*sse, diff);
}

#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \
  uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa( \
      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \
      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
      uint32_t *sse, const uint8_t *sec_pred) { \
    int32_t diff; \
    const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
    const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
    \
    if (yoffset) { \
      if (xoffset) { \
        *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
            v_filter, ht, &diff); \
      } else { \
        *sse = sub_pixel_avg_sse_diff_64width_v_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff); \
      } \
    } else { \
      if (xoffset) { \
        *sse = sub_pixel_avg_sse_diff_64width_h_msa( \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff); \
      } else { \
        *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \
                                          ref_stride, sec_pred, &diff); \
      } \
    } \
    \
    return VARIANCE_64Wx##ht##H(*sse, diff); \
  }

VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);