/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"

/*
 * MIPS MSA implementation of the horizontal 8-tap and 2-tap (bilinear)
 * convolution used by vpx_convolve8_horiz. Each common_hz_* helper filters
 * a fixed output width; the 8t variants apply all eight filter taps, the
 * 2t variants splat a single 16-bit pair of taps.
 *
 * NOTE(review): the HORIZ_*_FILT, PCKEV_*, LD_*/ST_* and shuffle macros are
 * defined in vpx_dsp/mips/vpx_convolve_msa.h / macros_msa.h; comments below
 * describe their use as visible here, not their internals.
 */

/* 8-tap horizontal filter of a 4x4 block.
 * src/dst: pixel rows with the given strides; filter: 8 signed taps. */
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 mask0, mask1, mask2, mask3, out;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v8i16 filt, out0, out1;

  /* mc_filt_mask_arr[16] is the shuffle mask used by the 4-wide paths
     (the 8-wide paths use index 0 below). */
  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  /* Step back 3 pixels so the 8-tap window spans src[-3..+4] around each
     output pixel. */
  src -= 3;

  /* rearranging filter: load 8 taps and splat pairs into byte vectors */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  /* Derived shuffle masks select successive byte pairs of the window. */
  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  /* XOR with 128 biases unsigned pixels into the signed byte range for the
     signed dot products; PCKEV_XORI128_UB undoes it on output. */
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1);
  /* Round-shift by FILTER_BITS, then saturate to 8-bit signed range. */
  SRARI_H2_SH(out0, out1, FILTER_BITS);
  SAT_SH2_SH(out0, out1, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

/* 8-tap horizontal filter of a 4x8 block (two 4x4 passes over 8 rows). */
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 filt0, filt1, filt2, filt3;
  v16i8 src0, src1, src2, src3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3; /* center the 8-tap window */

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* First 4 rows. */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  src += (4 * src_stride);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1);
  /* Second 4 rows. */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out2, out3);
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);
  out = PCKEV_XORI128_UB(out2, out3);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

/* Dispatch for 4-wide 8-tap filtering; only heights 4 and 8 are handled
   (other heights fall through as a no-op — callers only pass 4 or 8). */
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

/* 8-tap horizontal filter of an 8x4 block. */
static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  /* 8-wide shuffle mask variant. */
  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1, out2,
                             out3);
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  tmp0 = PCKEV_XORI128_UB(out0, out1);
  tmp1 = PCKEV_XORI128_UB(out2, out3);
  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
}

/* 8-tap horizontal filter, 8-wide, for heights that are multiples of 4. */
static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* 4 rows per iteration. */
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}

/* Dispatch for 8-wide 8-tap filtering by height. */
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}

/* 8-tap horizontal filter, 16-wide; processes 2 rows per iteration
   (each row split into two 8-wide halves loaded at src and src + 8). */
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    /* src0/src1 = left/right half of row 0; src2/src3 = row 1. */
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 8, src_stride, src1, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (2 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    dst += dst_stride;
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst);
    dst += dst_stride;
  }
}

/* 8-tap horizontal filter, 32-wide; 2 rows per iteration. The middle
   16-byte chunk (src1) is synthesized from src0/src2 with an 8-byte
   element shift rather than a separate unaligned load. */
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    /* Row 0: load 0/16/24-byte offsets, derive the 8-byte offset chunk. */
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);

    /* Row 1 loads are issued before row 0 stores to overlap latency. */
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;
  }
}

/* 8-tap horizontal filter, 64-wide; one row per iteration, processed as
   two 32-wide halves (offsets 0..31 and 32..63). */
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    /* Left half of the row (output bytes 0..31). */
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);

    /* Right half of the row (output bytes 32..63). */
    src0 = LD_SB(src + 32);
    src2 = LD_SB(src + 48);
    src3 = LD_SB(src + 56);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst + 32);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 48);
    dst += dst_stride;
  }
}

/* 2-tap (bilinear) horizontal filter of a 4x4 block.
 * filter points at one 16-bit pair of taps (see vpx_convolve8_horiz_msa),
 * splatted across the vector as filt0. */
static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter: splat the first halfword (two packed taps) */
  filt = LD_UH(filter);
  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  /* Unsigned byte dot product against the packed tap pair. */
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

/* 2-tap horizontal filter of a 4x8 block. */
static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 vec0, vec1, vec2, vec3, filt0;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16i8 res0, res1, res2, res3;
  v8u16 vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
              res2, res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

/* Dispatch for 4-wide 2-tap filtering; only heights 4 and 8 are handled. */
static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

/* 2-tap horizontal filter of an 8x4 block. */
static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
  ST8x4_UB(src0, src1, dst, dst_stride);
}

/* 2-tap horizontal filter, 8-wide, unrolled for height 8 with an extra
   unrolled pass when height == 16 (the only other height used here). */
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask, out0, out1;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

  /* Rows 0-3. */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);

  /* Rows 4-7 loaded before the rows 0-3 store to overlap latency. */
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    /* Rows 8-15, same pattern. */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    /* Rows 12-15 land 4 rows past the last store above. */
    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
  }
}

/* Dispatch for 8-wide 2-tap filtering by height. */
static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}

/* 2-tap horizontal filter, 16-wide; 4 rows per iteration, with the first
   iteration peeled off ahead of the loop (loop_cnt = (height >> 2) - 1). */
static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  loop_cnt = (height >> 2) - 1;

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

  /* Peeled first 4 rows: even regs = left halves, odd regs = right halves. */
  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
              out2, out3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
              out6, out7);
  SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
  SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
  PCKEV_ST_SB(out0, out1, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out2, out3, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out4, out5, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out6, out7, dst);
  dst += dst_stride;

  /* Remaining groups of 4 rows. */
  for (; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out2, out3, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out6, out7, dst);
    dst += dst_stride;
  }
}

/* 2-tap horizontal filter, 32-wide; 2 rows per iteration, with the
   mid-row chunks derived via an 8-byte element shift as in the 8t path. */
static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

  for (loop_cnt = height >> 1; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    src4 = LD_SB(src);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src5 = __msa_sldi_b(src6, src4, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    PCKEV_ST_SB(out6, out7, dst + 16);
    dst += dst_stride;
  }
}

/* 2-tap horizontal filter, 64-wide; one row per iteration. */
static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

  for (loop_cnt = height; loop_cnt--;) {
    /* Load at 0/16/32/48/56, then derive the 8/24/40-byte chunks with
       three element shifts in one macro. */
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src4 = LD_SB(src + 32);
    src6 = LD_SB(src + 48);
    src7 = LD_SB(src + 56);
    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    PCKEV_ST_SB(out4, out5, dst + 32);
    PCKEV_ST_SB(out6, out7, dst + 48);
    dst += dst_stride;
  }
}

/* Public entry point: horizontal-only convolve8 (vpx_dsp_rtcd dispatch).
 * Requires x_step_q4 == 16 (no horizontal scaling). Widths 4..64 are
 * handled by the MSA kernels above; anything else falls back to the C
 * reference implementation.
 *
 * NOTE(review): the (const int32_t *) casts reinterpret pairs of adjacent
 * 16-bit taps as one 32-bit word — word [0] covers taps 0-1 and word [1]
 * taps 2-3, assuming little-endian layout. [0] == 0 (taps 0 and 1 zero)
 * selects the 2-tap path, which reads the halfword at filt_hor[3]. */
void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  int8_t cnt, filt_hor[8];

  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  /* Narrow the 16-bit taps to 8 bits for the MSA kernels. */
  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
  }

  if (((const int32_t *)filter_x)[0] == 0) {
    /* Leading taps are zero: bilinear (2-tap) filtering. */
    switch (w) {
      case 4:
        common_hz_2t_4w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_8w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_16w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      case 32:
        common_hz_2t_32w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      case 64:
        common_hz_2t_64w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
        break;
    }
  } else {
    /* Full 8-tap filtering. */
    switch (w) {
      case 4:
        common_hz_8t_4w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 8:
        common_hz_8t_8w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 16:
        common_hz_8t_16w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      case 32:
        common_hz_8t_32w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      case 64:
        common_hz_8t_64w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
        break;
    }
  }
}