1/* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <assert.h> 12#include "./vpx_dsp_rtcd.h" 13#include "vpx_dsp/mips/vpx_convolve_msa.h" 14 15static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, 16 int32_t src_stride, uint8_t *dst, 17 int32_t dst_stride, 18 int8_t *filter) { 19 uint32_t tp0, tp1, tp2, tp3; 20 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; 21 v16u8 dst0 = { 0 }, res; 22 v16u8 mask0, mask1, mask2, mask3; 23 v8i16 filt, res0, res1; 24 25 mask0 = LD_UB(&mc_filt_mask_arr[16]); 26 src -= 3; 27 28 /* rearranging filter */ 29 filt = LD_SH(filter); 30 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 31 32 mask1 = mask0 + 2; 33 mask2 = mask0 + 4; 34 mask3 = mask0 + 6; 35 36 LD_SB4(src, src_stride, src0, src1, src2, src3); 37 XORI_B4_128_SB(src0, src1, src2, src3); 38 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, 39 filt0, filt1, filt2, filt3, res0, res1); 40 LW4(dst, dst_stride, tp0, tp1, tp2, tp3); 41 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 42 SRARI_H2_SH(res0, res1, FILTER_BITS); 43 SAT_SH2_SH(res0, res1, 7); 44 res = PCKEV_XORI128_UB(res0, res1); 45 res = (v16u8)__msa_aver_u_b(res, dst0); 46 ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); 47} 48 49static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, 50 int32_t src_stride, uint8_t *dst, 51 int32_t dst_stride, 52 int8_t *filter) { 53 uint32_t tp0, tp1, tp2, tp3; 54 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; 55 v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3; 56 v16u8 dst0 = { 0 }, dst1 = { 0 }; 57 v8i16 filt, vec0, vec1, vec2, vec3; 58 59 mask0 = LD_UB(&mc_filt_mask_arr[16]); 60 src -= 3; 61 62 /* rearranging filter */ 63 filt = LD_SH(filter); 64 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 65 66 mask1 = mask0 + 2; 67 mask2 = mask0 + 4; 68 mask3 = mask0 + 6; 69 70 LD_SB4(src, src_stride, src0, src1, src2, src3); 71 XORI_B4_128_SB(src0, src1, src2, src3); 72 src += (4 * src_stride); 73 LW4(dst, dst_stride, tp0, tp1, tp2, tp3); 74 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 75 LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); 76 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); 77 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, 78 filt0, filt1, filt2, filt3, vec0, vec1); 79 LD_SB4(src, src_stride, src0, src1, src2, src3); 80 XORI_B4_128_SB(src0, src1, src2, src3); 81 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, 82 filt0, filt1, filt2, filt3, vec2, vec3); 83 SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS); 84 SAT_SH4_SH(vec0, vec1, vec2, vec3, 7); 85 PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2, 86 res3); 87 ILVR_D2_UB(res1, res0, res3, res2, res0, res2); 88 XORI_B2_128_UB(res0, res2); 89 AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2); 90 ST4x8_UB(res0, res2, dst, dst_stride); 91} 92 93static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, 94 int32_t src_stride, uint8_t *dst, 95 int32_t dst_stride, int8_t *filter, 96 int32_t height) { 97 if (4 == height) { 98 common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); 99 } else if (8 == height) { 100 common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); 101 } 102} 103 104static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, 105 int32_t src_stride, uint8_t *dst, 106 int32_t dst_stride, int8_t *filter, 107 int32_t height) { 108 int32_t loop_cnt; 109 int64_t tp0, tp1, tp2, tp3; 110 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; 111 v16u8 mask0, mask1, mask2, mask3, dst0 = { 0 }, dst1 = { 0 }; 112 v8i16 filt, out0, out1, out2, out3; 113 114 mask0 = LD_UB(&mc_filt_mask_arr[0]); 115 src -= 3; 116 117 /* rearranging filter */ 118 filt = LD_SH(filter); 119 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 120 121 mask1 = mask0 + 2; 122 mask2 = mask0 + 4; 123 mask3 = mask0 + 6; 124 125 for (loop_cnt = (height >> 2); loop_cnt--;) { 126 LD_SB4(src, src_stride, src0, src1, src2, src3); 127 XORI_B4_128_SB(src0, src1, src2, src3); 128 src += (4 * src_stride); 129 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, 130 mask3, filt0, filt1, filt2, filt3, out0, out1, 131 out2, out3); 132 LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 133 INSERT_D2_UB(tp0, tp1, dst0); 134 INSERT_D2_UB(tp2, tp3, dst1); 135 SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); 136 SAT_SH4_SH(out0, out1, out2, out3, 7); 137 CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst, 138 dst_stride); 139 dst += (4 * dst_stride); 140 } 141} 142 143static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, 144 int32_t src_stride, uint8_t *dst, 145 int32_t dst_stride, 146 int8_t *filter, int32_t height) { 147 int32_t loop_cnt; 148 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; 149 v16u8 mask0, mask1, mask2, mask3, dst0, dst1; 150 v8i16 filt, out0, out1, out2, out3; 151 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 152 v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 153 154 mask0 = LD_UB(&mc_filt_mask_arr[0]); 155 src -= 3; 156 157 /* rearranging filter */ 158 filt = LD_SH(filter); 159 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 160 161 mask1 = mask0 + 2; 162 mask2 = mask0 + 4; 163 mask3 = mask0 + 6; 164 165 for (loop_cnt = height >> 1; loop_cnt--;) { 166 LD_SB2(src, src_stride, src0, src2); 167 LD_SB2(src + 8, src_stride, src1, src3); 168 src += (2 * src_stride); 169 170 XORI_B4_128_SB(src0, src1, src2, src3); 171 VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); 172 VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13); 173 VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, 174 vec14); 175 VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, 176 vec15); 177 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, 178 vec2, vec3); 179 DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, 180 vec9, vec10, vec11); 181 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1, 182 vec2, vec3); 183 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, 184 vec9, vec10, vec11); 185 ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, 186 out2, out3); 187 LD_UB2(dst, dst_stride, dst0, dst1); 188 SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); 189 SAT_SH4_SH(out0, out1, out2, out3, 7); 190 PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst); 191 dst += dst_stride; 192 PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst); 193 dst += dst_stride; 194 } 195} 196 197static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, 198 int32_t src_stride, uint8_t *dst, 199 int32_t dst_stride, 200 int8_t *filter, int32_t height) { 201 uint32_t loop_cnt; 202 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; 203 v16u8 dst1, dst2, mask0, mask1, mask2, mask3; 204 v8i16 filt, out0, out1, out2, out3; 205 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 206 v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 207 208 mask0 = LD_UB(&mc_filt_mask_arr[0]); 209 src -= 3; 210 211 /* rearranging filter */ 212 filt = LD_SH(filter); 213 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 214 215 mask1 = mask0 + 2; 216 mask2 = mask0 + 4; 217 mask3 = mask0 + 6; 218 219 for (loop_cnt = height; loop_cnt--;) { 220 src0 = LD_SB(src); 221 src2 = LD_SB(src + 16); 222 src3 = LD_SB(src + 24); 223 src1 = __msa_sldi_b(src2, src0, 8); 224 src += src_stride; 225 226 XORI_B4_128_SB(src0, src1, src2, src3); 227 VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); 228 VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13); 229 VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, 230 vec14); 231 VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, 232 vec15); 233 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, 234 vec2, vec3); 235 DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, 236 vec9, vec10, vec11); 237 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1, 238 vec2, vec3); 239 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, 240 vec9, vec10, vec11); 241 ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, 242 out2, out3); 243 SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); 244 SAT_SH4_SH(out0, out1, out2, out3, 7); 245 LD_UB2(dst, 16, dst1, dst2); 246 PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst); 247 PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16); 248 dst += dst_stride; 249 } 250} 251 252static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, 253 int32_t src_stride, uint8_t *dst, 254 int32_t dst_stride, 255 int8_t *filter, int32_t height) { 256 uint32_t loop_cnt, cnt; 257 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; 258 v16u8 dst1, dst2, mask0, mask1, mask2, mask3; 259 v8i16 filt, out0, out1, out2, out3; 260 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 261 v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; 262 263 mask0 = LD_UB(&mc_filt_mask_arr[0]); 264 src -= 3; 265 266 /* rearranging filter */ 267 filt = LD_SH(filter); 268 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 269 270 mask1 = mask0 + 2; 271 mask2 = mask0 + 4; 272 mask3 = mask0 + 6; 273 274 for (loop_cnt = height; loop_cnt--;) { 275 for (cnt = 0; cnt < 2; ++cnt) { 276 src0 = LD_SB(&src[cnt << 5]); 277 src2 = LD_SB(&src[16 + (cnt << 5)]); 278 src3 = LD_SB(&src[24 + (cnt << 5)]); 279 src1 = __msa_sldi_b(src2, src0, 8); 280 281 XORI_B4_128_SB(src0, src1, src2, src3); 282 VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, 283 vec12); 284 VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, 285 vec13); 286 VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, 287 vec14); 288 VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, 289 vec15); 290 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, 291 vec1, vec2, vec3); 292 DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, 293 vec9, vec10, vec11); 294 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, 295 vec1, vec2, vec3); 296 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, 297 vec9, vec10, vec11); 298 ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, 299 out2, out3); 300 SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); 301 SAT_SH4_SH(out0, out1, out2, out3, 7); 302 LD_UB2(&dst[cnt << 5], 16, dst1, dst2); 303 PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]); 304 PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]); 305 } 306 307 src += src_stride; 308 dst += dst_stride; 309 } 310} 311 312static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, 313 int32_t src_stride, uint8_t *dst, 314 int32_t dst_stride, 315 int8_t *filter) { 316 uint32_t tp0, tp1, tp2, tp3; 317 v16i8 src0, src1, src2, src3, mask; 318 v16u8 filt0, dst0 = { 0 }, vec0, vec1, res; 319 v8u16 vec2, vec3, filt; 320 321 mask = LD_SB(&mc_filt_mask_arr[16]); 322 323 /* rearranging filter */ 324 filt = LD_UH(filter); 325 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); 326 327 LD_SB4(src, src_stride, src0, src1, src2, src3); 328 LW4(dst, dst_stride, tp0, tp1, tp2, tp3); 329 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 330 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); 331 DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); 332 SRARI_H2_UH(vec2, vec3, FILTER_BITS); 333 res = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); 334 res = (v16u8)__msa_aver_u_b(res, dst0); 335 ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); 336} 337 338static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, 339 int32_t src_stride, uint8_t *dst, 340 int32_t dst_stride, 341 int8_t *filter) { 342 uint32_t tp0, tp1, tp2, tp3; 343 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 344 v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; 345 v16u8 dst0 = { 0 }, dst1 = { 0 }; 346 v8u16 vec4, vec5, vec6, vec7, filt; 347 348 mask = LD_SB(&mc_filt_mask_arr[16]); 349 350 /* rearranging filter */ 351 filt = LD_UH(filter); 352 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); 353 354 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 355 LW4(dst, dst_stride, tp0, tp1, tp2, tp3); 356 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 357 LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); 358 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); 359 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); 360 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); 361 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, 362 vec6, vec7); 363 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); 364 PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, 365 res3); 366 ILVR_D2_UB(res1, res0, res3, res2, res0, res2); 367 AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2); 368 ST4x8_UB(res0, res2, dst, dst_stride); 369} 370 371static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, 372 int32_t src_stride, uint8_t *dst, 373 int32_t dst_stride, int8_t *filter, 374 int32_t height) { 375 if (4 == height) { 376 common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); 377 } else if (8 == height) { 378 common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); 379 } 380} 381 382static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, 383 int32_t src_stride, uint8_t *dst, 384 int32_t dst_stride, 385 int8_t *filter) { 386 int64_t tp0, tp1, tp2, tp3; 387 v16i8 src0, src1, src2, src3, mask; 388 v16u8 filt0, dst0 = { 0 }, dst1 = { 0 }; 389 v8u16 vec0, vec1, vec2, vec3, filt; 390 391 mask = LD_SB(&mc_filt_mask_arr[0]); 392 393 /* rearranging filter */ 394 filt = LD_UH(filter); 395 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); 396 397 LD_SB4(src, src_stride, src0, src1, src2, src3); 398 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 399 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 400 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, 401 vec2, vec3); 402 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); 403 LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 404 INSERT_D2_UB(tp0, tp1, dst0); 405 INSERT_D2_UB(tp2, tp3, dst1); 406 PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); 407} 408 409static void common_hz_2t_and_aver_dst_8x8mult_msa( 410 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 411 int8_t *filter, int32_t height) { 412 int64_t tp0, tp1, tp2, tp3; 413 v16i8 src0, src1, src2, src3, mask; 414 v16u8 filt0, dst0 = { 0 }, dst1 = { 0 }; 415 v8u16 vec0, vec1, vec2, vec3, filt; 416 417 mask = LD_SB(&mc_filt_mask_arr[0]); 418 419 /* rearranging filter */ 420 filt = LD_UH(filter); 421 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); 422 423 LD_SB4(src, src_stride, src0, src1, src2, src3); 424 src += (4 * src_stride); 425 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 426 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 427 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, 428 vec2, vec3); 429 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); 430 LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 431 INSERT_D2_UB(tp0, tp1, dst0); 432 INSERT_D2_UB(tp2, tp3, dst1); 433 LD_SB4(src, src_stride, src0, src1, src2, src3); 434 src += (4 * src_stride); 435 PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); 436 dst += (4 * dst_stride); 437 438 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 439 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 440 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, 441 vec2, vec3); 442 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); 443 LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 444 INSERT_D2_UB(tp0, tp1, dst0); 445 INSERT_D2_UB(tp2, tp3, dst1); 446 PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); 447 dst += (4 * dst_stride); 448 449 if (16 == height) { 450 LD_SB4(src, src_stride, src0, src1, src2, src3); 451 src += (4 * src_stride); 452 453 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 454 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 455 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, 456 vec2, vec3); 457 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); 458 LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 459 INSERT_D2_UB(tp0, tp1, dst0); 460 INSERT_D2_UB(tp2, tp3, dst1); 461 LD_SB4(src, src_stride, src0, src1, src2, src3); 462 PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); 463 dst += (4 * dst_stride); 464 465 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); 466 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); 467 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, 468 vec2, vec3); 469 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); 470 LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 471 INSERT_D2_UB(tp0, tp1, dst0); 472 INSERT_D2_UB(tp2, tp3, dst1); 473 PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); 474 } 475} 476 477static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, 478 int32_t src_stride, uint8_t *dst, 479 int32_t dst_stride, int8_t *filter, 480 int32_t height) { 481 if (4 == height) { 482 common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); 483 } else { 484 common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, 485 filter, height); 486 } 487} 488 489static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, 490 int32_t src_stride, uint8_t *dst, 491 int32_t dst_stride, 492 int8_t *filter, int32_t height) { 493 uint32_t loop_cnt; 494 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 495 v16u8 filt0, dst0, dst1, dst2, dst3; 496 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 497 v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; 498 499 mask = LD_SB(&mc_filt_mask_arr[0]); 500 501 /* rearranging filter */ 502 filt = LD_UH(filter); 503 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); 504 505 LD_SB4(src, src_stride, src0, src2, src4, src6); 506 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 507 src += (4 * src_stride); 508 509 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); 510 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); 511 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); 512 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); 513 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, 514 res2, res3); 515 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, 516 res6, res7); 517 SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); 518 SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); 519 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 520 PCKEV_AVG_ST_UB(res1, res0, dst0, dst); 521 dst += dst_stride; 522 PCKEV_AVG_ST_UB(res3, res2, dst1, dst); 523 dst += dst_stride; 524 PCKEV_AVG_ST_UB(res5, res4, dst2, dst); 525 dst += dst_stride; 526 PCKEV_AVG_ST_UB(res7, res6, dst3, dst); 527 dst += dst_stride; 528 529 for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { 530 LD_SB4(src, src_stride, src0, src2, src4, src6); 531 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 532 src += (4 * src_stride); 533 534 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); 535 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); 536 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); 537 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); 538 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, 539 res2, res3); 540 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, 541 res6, res7); 542 SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); 543 SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); 544 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 545 PCKEV_AVG_ST_UB(res1, res0, dst0, dst); 546 dst += dst_stride; 547 PCKEV_AVG_ST_UB(res3, res2, dst1, dst); 548 dst += dst_stride; 549 PCKEV_AVG_ST_UB(res5, res4, dst2, dst); 550 dst += dst_stride; 551 PCKEV_AVG_ST_UB(res7, res6, dst3, dst); 552 dst += dst_stride; 553 } 554} 555 556static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, 557 int32_t src_stride, uint8_t *dst, 558 int32_t dst_stride, 559 int8_t *filter, int32_t height) { 560 uint32_t loop_cnt; 561 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 562 v16u8 filt0, dst0, dst1, dst2, dst3; 563 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 564 v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; 565 566 mask = LD_SB(&mc_filt_mask_arr[0]); 567 568 /* rearranging filter */ 569 filt = LD_UH(filter); 570 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); 571 572 for (loop_cnt = (height >> 1); loop_cnt--;) { 573 src0 = LD_SB(src); 574 src2 = LD_SB(src + 16); 575 src3 = LD_SB(src + 24); 576 src1 = __msa_sldi_b(src2, src0, 8); 577 src += src_stride; 578 src4 = LD_SB(src); 579 src6 = LD_SB(src + 16); 580 src7 = LD_SB(src + 24); 581 src5 = __msa_sldi_b(src6, src4, 8); 582 src += src_stride; 583 584 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); 585 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); 586 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); 587 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); 588 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, 589 res2, res3); 590 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, 591 res6, res7); 592 SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); 593 SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); 594 LD_UB2(dst, 16, dst0, dst1); 595 PCKEV_AVG_ST_UB(res1, res0, dst0, dst); 596 PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16)); 597 dst += dst_stride; 598 LD_UB2(dst, 16, dst2, dst3); 599 PCKEV_AVG_ST_UB(res5, res4, dst2, dst); 600 PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16)); 601 dst += dst_stride; 602 } 603} 604 605static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, 606 int32_t src_stride, uint8_t *dst, 607 int32_t dst_stride, 608 int8_t *filter, int32_t height) { 609 uint32_t loop_cnt; 610 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 611 v16u8 filt0, dst0, dst1, dst2, dst3; 612 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 613 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; 614 615 mask = LD_SB(&mc_filt_mask_arr[0]); 616 617 /* rearranging filter */ 618 filt = LD_UH(filter); 619 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); 620 621 for (loop_cnt = height; loop_cnt--;) { 622 LD_SB4(src, 16, src0, src2, src4, src6); 623 src7 = LD_SB(src + 56); 624 SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); 625 src += src_stride; 626 627 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); 628 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); 629 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); 630 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); 631 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, 632 out2, out3); 633 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, 634 out6, out7); 635 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); 636 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); 637 LD_UB4(dst, 16, dst0, dst1, dst2, dst3); 638 PCKEV_AVG_ST_UB(out1, out0, dst0, dst); 639 PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16); 640 PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32); 641 PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48); 642 dst += dst_stride; 643 } 644} 645 646void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, 647 uint8_t *dst, ptrdiff_t dst_stride, 648 const InterpKernel *filter, int x0_q4, 649 int x_step_q4, int y0_q4, int y_step_q4, int w, 650 int h) { 651 const int16_t *const filter_x = filter[x0_q4]; 652 int8_t cnt, filt_hor[8]; 653 654 assert(x_step_q4 == 16); 655 assert(((const int32_t *)filter_x)[1] != 0x800000); 656 657 for (cnt = 0; cnt < 8; ++cnt) { 658 filt_hor[cnt] = filter_x[cnt]; 659 } 660 661 if (((const int32_t *)filter_x)[0] == 0) { 662 switch (w) { 663 case 4: 664 common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, 665 (int32_t)dst_stride, &filt_hor[3], h); 666 break; 667 case 8: 668 common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, 669 (int32_t)dst_stride, &filt_hor[3], h); 670 break; 671 case 16: 672 common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, 673 (int32_t)dst_stride, &filt_hor[3], h); 674 break; 675 case 32: 676 common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, 677 (int32_t)dst_stride, &filt_hor[3], h); 678 break; 679 case 64: 680 common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, 681 (int32_t)dst_stride, &filt_hor[3], h); 682 break; 683 default: 684 vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, 685 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); 686 break; 687 } 688 } else { 689 switch (w) { 690 case 4: 691 common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, 692 (int32_t)dst_stride, filt_hor, h); 693 break; 694 case 8: 695 common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, 696 (int32_t)dst_stride, filt_hor, h); 697 break; 698 case 16: 699 common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, 700 (int32_t)dst_stride, filt_hor, h); 701 break; 702 case 32: 703 common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, 704 (int32_t)dst_stride, filt_hor, h); 705 break; 706 case 64: 707 common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, 708 (int32_t)dst_stride, filt_hor, h); 709 break; 710 default: 711 vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, 712 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); 713 break; 714 } 715 } 716} 717