/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/mips/macros_msa.h"
#include "vpx_dsp/variance.h"

static const uint8_t bilinear_filters_msa[8][2] = {
  { 128,   0, },
  { 112,  16, },
  {  96,  32, },
  {  80,  48, },
  {  64,  64, },
  {  48,  80, },
  {  32,  96, },
  {  16, 112, },
};

#define CALC_MSE_AVG_B(src, ref, var, sub) {                       \
  v16u8 src_l0_m, src_l1_m;                                        \
  v8i16 res_l0_m, res_l1_m;                                        \
                                                                   \
  ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
  HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
  DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
                                                                   \
  sub += res_l0_m + res_l1_m;                                      \
}

#define VARIANCE_WxH(sse, diff, shift) \
  sse - (((uint32_t)diff * diff) >> shift)

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  sse - (((int64_t)diff * diff) >> shift)
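/* CALC_MSE_AVG_B accumulates, per 16-byte vector pair, the sum of squared
 * differences into 'var' (v4i32) and the running sum of differences into
 * 'sub' (v8i16): src and ref bytes are interleaved, widened to signed
 * halfword differences by horizontal subtract, then dot-product-accumulated
 * against themselves.
 *
 * VARIANCE_WxH / VARIANCE_LARGE_WxH turn those totals into the usual
 * estimate sse - (sum * sum) / (w * h), where 'shift' is log2(w * h).
 * The LARGE variant widens sum * sum to 64 bits, which blocks of 16x32
 * pixels and above need to avoid overflow.
 *
 * The two taps of each bilinear filter above sum to 128, matching the
 * FILTER_BITS rounding shift applied after each dot product below.
 */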
static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred,
                                        int32_t height,
                                        int32_t *diff) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 pred, src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred,
                                        int32_t height,
                                        int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                src0, src1, ref0, ref1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height,
                                         int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref, pred;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height,
                                         int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred,
                                       int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred,
                                       int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
                src0, src1, src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);

    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
                src0, src1, src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);

  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred,
                                       int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v8i16 avg2 = { 0 };
  v8i16 avg3 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 32; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
                src0, src1, src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);

    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
                src0, src1, src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
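/* The sub_pixel_sse_diff_*_h_msa helpers below apply the 2-tap bilinear
 * filter horizontally: VSHF_B2_UH pairs each pixel with its right
 * neighbour, DOTP_UB4_UH weights each pair with the two taps packed into
 * filt0, and SRARI rounds the result back to 8 bits before the SSE/sum
 * accumulation against the reference.
 */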
static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                const uint8_t *dst,
                                                int32_t dst_stride,
                                                const uint8_t *filter,
                                                int32_t height,
                                                int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                src0, src1, src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    CALC_MSE_AVG_B(src0, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                const uint8_t *dst,
                                                int32_t dst_stride,
                                                const uint8_t *filter,
                                                int32_t height,
                                                int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 filt0, out, ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                src0, src1, src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter,
                                                 int32_t height,
                                                 int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                out0, out1, out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                out4, out5, out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6,
                src0, src1, src2, src3);
    CALC_MSE_AVG_B(src0, dst0, var, avg);
    CALC_MSE_AVG_B(src1, dst1, var, avg);
    CALC_MSE_AVG_B(src2, dst2, var, avg);
    CALC_MSE_AVG_B(src3, dst3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter,
                                                 int32_t height,
                                                 int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter,
                                                 int32_t height,
                                                 int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}
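/* The vertical counterparts carry one row of state across iterations
 * (src0 = src4 at the bottom of each pass), interleave vertically adjacent
 * rows with ILVR/ILVL, and run the same 2-tap dot product and rounding.
 */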
static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                const uint8_t *dst,
                                                int32_t dst_stride,
                                                const uint8_t *filter,
                                                int32_t height,
                                                int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4, out;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 ref = { 0 };
  v16u8 src2110, src4332;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                const uint8_t *dst,
                                                int32_t dst_stride,
                                                const uint8_t *filter,
                                                int32_t height,
                                                int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
               vec0, vec1, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter,
                                                 int32_t height,
                                                 int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    src0 = src4;

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter,
                                                 int32_t height,
                                                 int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter,
                                                 int32_t height,
                                                 int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}
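/* The hv helpers filter horizontally first (HORIZ_2TAP_FILT_UH), then feed
 * pairs of consecutive intermediate rows to the vertical 2-tap filter; the
 * last horizontal output of each pass is carried into the next iteration.
 */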
static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter_horiz,
                                                 const uint8_t *filter_vert,
                                                 int32_t height,
                                                 int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out, ref = { 0 };
  v16u8 filt_vt, filt_hz, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter_horiz,
                                                 const uint8_t *filter_vert,
                                                 int32_t height,
                                                 int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt_vt, filt_hz, vec0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  const uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const uint8_t *filter_horiz,
                                                  const uint8_t *filter_vert,
                                                  int32_t height,
                                                  int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    CALC_MSE_AVG_B(src2, ref2, var, avg);
    CALC_MSE_AVG_B(src3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  const uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const uint8_t *filter_horiz,
                                                  const uint8_t *filter_vert,
                                                  int32_t height,
                                                  int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert, height,
                                             &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  const uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const uint8_t *filter_horiz,
                                                  const uint8_t *filter_vert,
                                                  int32_t height,
                                                  int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert, height,
                                             &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}
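/* The *_avg_* variants below mirror the functions above but average the
 * filtered result with a second predictor (sec_pred) before computing the
 * SSE and difference sums, as used by compound (averaging) prediction.
 */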
static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    const uint8_t *dst,
                                                    int32_t dst_stride,
                                                    const uint8_t *sec_pred,
                                                    const uint8_t *filter,
                                                    int32_t height,
                                                    int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 out, pred, filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                src0, src1, src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    const uint8_t *dst,
                                                    int32_t dst_stride,
                                                    const uint8_t *sec_pred,
                                                    const uint8_t *filter,
                                                    int32_t height,
                                                    int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 out, pred, filt0;
  v16u8 ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                src0, src1, src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
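/* subpel_avg_ssediff_16w_{h,v,hv}_msa process one 16-pixel column but take
 * the full block width as a parameter: sec_pred is laid out contiguously at
 * the block width, so it must advance by 'width' per row. The 32- and
 * 64-wide wrappers call them once per 16-pixel column.
 */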
static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             const uint8_t *dst,
                                             int32_t dst_stride,
                                             const uint8_t *sec_pred,
                                             const uint8_t *filter,
                                             int32_t height,
                                             int32_t *diff,
                                             int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3;
  v16u8 tmp0, tmp1, tmp2, tmp3;
  v16u8 pred0, pred1, pred2, pred3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                out0, out1, out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                out4, out5, out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6,
                tmp0, tmp1, tmp2, tmp3);
    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3,
                tmp0, tmp1, tmp2, tmp3);

    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     const uint8_t *dst,
                                                     int32_t dst_stride,
                                                     const uint8_t *sec_pred,
                                                     const uint8_t *filter,
                                                     int32_t height,
                                                     int32_t *diff) {
  return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
                                      sec_pred, filter, height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     const uint8_t *dst,
                                                     int32_t dst_stride,
                                                     const uint8_t *sec_pred,
                                                     const uint8_t *filter,
                                                     int32_t height,
                                                     int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
                                        sec_pred, filter, height,
                                        &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     const uint8_t *dst,
                                                     int32_t dst_stride,
                                                     const uint8_t *sec_pred,
                                                     const uint8_t *filter,
                                                     int32_t height,
                                                     int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
                                        sec_pred, filter, height,
                                        &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}
static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    const uint8_t *dst,
                                                    int32_t dst_stride,
                                                    const uint8_t *sec_pred,
                                                    const uint8_t *filter,
                                                    int32_t height,
                                                    int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 out, pred, ref = { 0 };
  v16u8 src2110, src4332, filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);

    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    const uint8_t *dst,
                                                    int32_t dst_stride,
                                                    const uint8_t *sec_pred,
                                                    const uint8_t *filter,
                                                    int32_t height,
                                                    int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, filt0;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
               vec0, vec1, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             const uint8_t *dst,
                                             int32_t dst_stride,
                                             const uint8_t *sec_pred,
                                             const uint8_t *filter,
                                             int32_t height,
                                             int32_t *diff,
                                             int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    src0 = src4;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
                out0, out1, out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     const uint8_t *dst,
                                                     int32_t dst_stride,
                                                     const uint8_t *sec_pred,
                                                     const uint8_t *filter,
                                                     int32_t height,
                                                     int32_t *diff) {
  return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
                                      sec_pred, filter, height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     const uint8_t *dst,
                                                     int32_t dst_stride,
                                                     const uint8_t *sec_pred,
                                                     const uint8_t *filter,
                                                     int32_t height,
                                                     int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
                                        sec_pred, filter, height,
                                        &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     const uint8_t *dst,
                                                     int32_t dst_stride,
                                                     const uint8_t *sec_pred,
                                                     const uint8_t *filter,
                                                     int32_t height,
                                                     int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
                                        sec_pred, filter, height,
                                        &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}
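/* Averaging versions of the hv filters: the same two-stage (horizontal then
 * vertical) filtering, with the second predictor averaged in just before
 * accumulation.
 */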
static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
  const uint8_t *src, int32_t src_stride,
  const uint8_t *dst, int32_t dst_stride,
  const uint8_t *sec_pred,
  const uint8_t *filter_horiz, const uint8_t *filter_vert,
  int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 out, pred, ref = { 0 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
  const uint8_t *src, int32_t src_stride,
  const uint8_t *dst, int32_t dst_stride,
  const uint8_t *sec_pred,
  const uint8_t *filter_horiz, const uint8_t *filter_vert,
  int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 pred0, pred1, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              const uint8_t *dst,
                                              int32_t dst_stride,
                                              const uint8_t *sec_pred,
                                              const uint8_t *filter_horiz,
                                              const uint8_t *filter_vert,
                                              int32_t height,
                                              int32_t *diff,
                                              int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 out0, out1, out2, out3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
                out0, out1, out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
  const uint8_t *src, int32_t src_stride,
  const uint8_t *dst, int32_t dst_stride,
  const uint8_t *sec_pred,
  const uint8_t *filter_horiz, const uint8_t *filter_vert,
  int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                       sec_pred, filter_horiz, filter_vert,
                                       height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
  const uint8_t *src, int32_t src_stride,
  const uint8_t *dst, int32_t dst_stride,
  const uint8_t *sec_pred,
  const uint8_t *filter_horiz, const uint8_t *filter_vert,
  int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
  const uint8_t *src, int32_t src_stride,
  const uint8_t *dst, int32_t dst_stride,
  const uint8_t *sec_pred,
  const uint8_t *filter_horiz, const uint8_t *filter_vert,
  int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}
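/* Each shift below is log2(width * height) for the block size in the macro
 * name, e.g. VARIANCE_8Wx16H uses 7 because 8 * 16 = 128 = 1 << 7. Blocks
 * of 16x32 pixels and larger take the 64-bit VARIANCE_LARGE_WxH path.
 */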
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);

#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                         \
uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src,     \
                                                 int32_t src_stride,     \
                                                 int32_t xoffset,        \
                                                 int32_t yoffset,        \
                                                 const uint8_t *ref,     \
                                                 int32_t ref_stride,     \
                                                 uint32_t *sse) {        \
  int32_t diff;                                                          \
  uint32_t var;                                                          \
  const uint8_t *h_filter = bilinear_filters_msa[xoffset];               \
  const uint8_t *v_filter = bilinear_filters_msa[yoffset];               \
                                                                         \
  if (yoffset) {                                                         \
    if (xoffset) {                                                       \
      *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride,      \
                                                   ref, ref_stride,      \
                                                   h_filter, v_filter,   \
                                                   ht, &diff);           \
    } else {                                                             \
      *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride,       \
                                                  ref, ref_stride,       \
                                                  v_filter, ht, &diff);  \
    }                                                                    \
                                                                         \
    var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                          \
  } else {                                                               \
    if (xoffset) {                                                       \
      *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride,       \
                                                  ref, ref_stride,       \
                                                  h_filter, ht, &diff);  \
                                                                         \
      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                        \
    } else {                                                             \
      var = vpx_variance##wd##x##ht##_msa(src, src_stride,               \
                                          ref, ref_stride, sse);         \
    }                                                                    \
  }                                                                      \
                                                                         \
  return var;                                                            \
}

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);

#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                        \
uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa(                       \
  const uint8_t *src_ptr, int32_t src_stride,                               \
  int32_t xoffset, int32_t yoffset,                                         \
  const uint8_t *ref_ptr, int32_t ref_stride,                               \
  uint32_t *sse, const uint8_t *sec_pred) {                                 \
  int32_t diff;                                                             \
  const uint8_t *h_filter = bilinear_filters_msa[xoffset];                  \
  const uint8_t *v_filter = bilinear_filters_msa[yoffset];                  \
                                                                            \
  if (yoffset) {                                                            \
    if (xoffset) {                                                          \
      *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(src_ptr, src_stride, \
                                                       ref_ptr, ref_stride, \
                                                       sec_pred, h_filter,  \
                                                       v_filter, ht, &diff);\
    } else {                                                                \
      *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(src_ptr, src_stride,  \
                                                      ref_ptr, ref_stride,  \
                                                      sec_pred, v_filter,   \
                                                      ht, &diff);           \
    }                                                                       \
  } else {                                                                  \
    if (xoffset) {                                                          \
      *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(src_ptr, src_stride,  \
                                                      ref_ptr, ref_stride,  \
                                                      sec_pred, h_filter,   \
                                                      ht, &diff);           \
    } else {                                                                \
      *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride,              \
                                          ref_ptr, ref_stride,              \
                                          sec_pred, ht, &diff);             \
    }                                                                       \
  }                                                                         \
                                                                            \
  return VARIANCE_##wd##Wx##ht##H(*sse, diff);                              \
}
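/* Avg-variance instantiations for 4x4 through 32x32. 32x64 and the 64-wide
 * sizes are written out separately below because their zero-offset path
 * calls the fixed-height avg_sse_diff_{32x64,64x32,64x64}_msa helpers
 * rather than a height-parameterized one.
 */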
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);

uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
                                             int32_t src_stride,
                                             int32_t xoffset,
                                             int32_t yoffset,
                                             const uint8_t *ref_ptr,
                                             int32_t ref_stride,
                                             uint32_t *sse,
                                             const uint8_t *sec_pred) {
  int32_t diff;
  const uint8_t *h_filter = bilinear_filters_msa[xoffset];
  const uint8_t *v_filter = bilinear_filters_msa[yoffset];

  if (yoffset) {
    if (xoffset) {
      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(src_ptr, src_stride,
                                                   ref_ptr, ref_stride,
                                                   sec_pred, h_filter,
                                                   v_filter, 64, &diff);
    } else {
      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride,
                                                  ref_ptr, ref_stride,
                                                  sec_pred, v_filter,
                                                  64, &diff);
    }
  } else {
    if (xoffset) {
      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride,
                                                  ref_ptr, ref_stride,
                                                  sec_pred, h_filter,
                                                  64, &diff);
    } else {
      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
                                    sec_pred, &diff);
    }
  }

  return VARIANCE_32Wx64H(*sse, diff);
}

#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                        \
uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(const uint8_t *src_ptr,   \
                                                 int32_t src_stride,       \
                                                 int32_t xoffset,          \
                                                 int32_t yoffset,          \
                                                 const uint8_t *ref_ptr,   \
                                                 int32_t ref_stride,       \
                                                 uint32_t *sse,            \
                                                 const uint8_t *sec_pred) {\
  int32_t diff;                                                            \
  const uint8_t *h_filter = bilinear_filters_msa[xoffset];                 \
  const uint8_t *v_filter = bilinear_filters_msa[yoffset];                 \
                                                                           \
  if (yoffset) {                                                           \
    if (xoffset) {                                                         \
      *sse = sub_pixel_avg_sse_diff_64width_hv_msa(src_ptr, src_stride,    \
                                                   ref_ptr, ref_stride,    \
                                                   sec_pred, h_filter,     \
                                                   v_filter, ht, &diff);   \
    } else {                                                               \
      *sse = sub_pixel_avg_sse_diff_64width_v_msa(src_ptr, src_stride,     \
                                                  ref_ptr, ref_stride,     \
                                                  sec_pred, v_filter,      \
                                                  ht, &diff);              \
    }                                                                      \
  } else {                                                                 \
    if (xoffset) {                                                         \
      *sse = sub_pixel_avg_sse_diff_64width_h_msa(src_ptr, src_stride,     \
                                                  ref_ptr, ref_stride,     \
                                                  sec_pred, h_filter,      \
                                                  ht, &diff);              \
    } else {                                                               \
      *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride,               \
                                        ref_ptr, ref_stride,               \
                                        sec_pred, &diff);                  \
    }                                                                      \
  }                                                                        \
                                                                           \
  return VARIANCE_64Wx##ht##H(*sse, diff);                                 \
}

VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);