/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vpx_ports/mem.h"
#include "vp8/common/filter.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"

/*
 * MIPS MSA (SIMD) implementations of the VP8 bilinear (2-tap) sub-pel
 * predictors.  Each public vp8_bilinear_predict* entry point dispatches on
 * (xoffset, yoffset) to a horizontal-only, vertical-only, combined, or
 * plain-copy path.  The LD_*/ST_*/VSHF_*/DOTP_*/SRARI_*/PCKEV_* helpers are
 * project macros from vp8_macros_msa.h wrapping __msa_* builtins; their
 * exact semantics are defined there (not visible in this file).
 */

/* 2-tap filter coefficients for sub-pel offsets 1..7 (index = offset - 1).
 * Each pair sums to 128, i.e. 1 << 7 — presumably 1 << VP8_FILTER_SHIFT;
 * confirm against vp8/common/filter.h. */
DECLARE_ALIGNED(16, static const int8_t, vp8_bilinear_filters_msa[7][2]) = {
  { 112, 16 }, { 96, 32 }, { 80, 48 }, { 64, 64 },
  { 48, 80 }, { 32, 96 }, { 16, 112 }
};

/* Shuffle masks used with VSHF to pair each pixel with its right neighbor
 * for the horizontal 2-tap dot product.  Offset [0] is used by the 8/16-wide
 * paths (pairs within one register); offsets [16] and [32] are used by the
 * 4-wide paths, where indices >= 16 select bytes from a second source
 * register so two rows can be filtered per vector. */
static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
  /* 8 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  /* 4 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
  /* 4 width cases */
  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

/* Horizontal 2-tap filter, 4x4 output: loads 4 rows, filters two rows per
 * vector via the 4-wide shuffle mask, rounds by VP8_FILTER_SHIFT, stores. */
static void common_hz_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  /* Broadcast the (tap0, tap1) byte pair to every halfword lane so DOTP can
   * multiply-accumulate adjacent pixel pairs. */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, VP8_FILTER_SHIFT);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

/* Horizontal 2-tap filter, 4x8 output: same scheme as 4x4 but processes
 * 8 rows (two rows packed per vector, four vectors total). */
static void common_hz_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16u8 vec0, vec1, vec2, vec3, filt0;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16i8 res0, res1, res2, res3;
  v8u16 vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

/* 4-wide horizontal dispatcher.  Only heights 4 and 8 are handled; any
 * other height writes nothing. */
static void common_hz_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

/* Horizontal 2-tap filter, 8x4 output: one row per vector, using the
 * single-register (8-wide) shuffle mask. */
static void common_hz_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
  /* src0/src1 are dead by now and are reused as pack destinations. */
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
  ST8x4_UB(src0, src1, dst, dst_stride);
}

/* Horizontal 2-tap filter, 8-wide, heights 8 or 16.  Two unrolled 4-row
 * passes, plus two more when height == 16.  Loads for the next group are
 * interleaved ahead of the stores of the current one (software pipelining). */
static void common_hz_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask, out0, out1;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    /* dst was not advanced after the previous store; offset explicitly. */
    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
  }
}

/* 8-wide horizontal dispatcher: 8x4 special case, otherwise the 8xN
 * (multiple-of-8) kernel. */
static void common_hz_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}

/* Horizontal 2-tap filter, 16-wide.  Each row is processed as two 8-pixel
 * halves (src+0 and src+8); the first 4-row group is peeled out of the loop,
 * hence loop_cnt = (height >> 2) - 1.  Height is assumed to be a multiple
 * of 4 and at least 8 — TODO confirm against callers. */
static void common_hz_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  loop_cnt = (height >> 2) - 1;

  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* Even-numbered regs hold the left 8 pixels of each row, odd the right 8. */
  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
              out2, out3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
              out6, out7);
  SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
  SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
  PCKEV_ST_SB(out0, out1, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out2, out3, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out4, out5, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out6, out7, dst);
  dst += dst_stride;

  for (; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(out0, out1, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out2, out3, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out6, out7, dst);
    dst += dst_stride;
  }
}

/* Vertical 2-tap filter, 4x4 output.  Needs height+1 source rows (5 loads);
 * adjacent rows are byte-interleaved so DOTP computes
 * tap0*row[i] + tap1*row[i+1] per pixel. */
static void common_vt_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4;
  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
  v16u8 filt0;
  v8i16 filt;
  v8u16 tmp0, tmp1;

  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  /* srcXY_r = interleave of rows Y and X; then pack two row-pairs per
   * register (4-wide rows are only 4 bytes each). */
  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
  src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
}

/* Vertical 2-tap filter, 4x8 output: same interleave scheme over 9 source
 * rows (8 output rows need rows 0..8). */
static void common_vt_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 filt;

  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);

  src8 = LD_SB(src);
  src += src_stride;

  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
             src76_r, src87_r);
  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
             src76_r, src2110, src4332, src6554, src8776);
  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
              tmp0, tmp1, tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

/* 4-wide vertical dispatcher.  Only heights 4 and 8 are handled. */
static void common_vt_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  if (4 == height) {
    common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

/* Vertical 2-tap filter, 8x4 output: interleave each pair of adjacent rows
 * and dot-product against the broadcast filter taps. */
static void common_vt_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
  v16i8 out0, out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
              tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
}

/* Vertical 2-tap filter, 8-wide, height a multiple of 8.  Carries the last
 * row of each 8-row group (src0 = src8) into the next iteration so only
 * 8 new rows are loaded per pass. */
static void common_vt_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v16i8 out0, out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 3); loop_cnt--;) {
    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
    src += (8 * src_stride);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
               vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Last loaded row seeds the next group. */
    src0 = src8;
  }
}

/* 8-wide vertical dispatcher: 8x4 special case, otherwise 8xN. */
static void common_vt_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  if (4 == height) {
    common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}

/* Vertical 2-tap filter, 16-wide, height a multiple of 4.  Each row pair is
 * interleaved twice (ILVR for the low 8 bytes, ILVL for the high 8) to cover
 * all 16 pixels; src0 = src4 carries the last row to the next iteration. */
static void common_vt_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp0, tmp1, dst);
    dst += dst_stride;

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp2, tmp3, dst);
    dst += dst_stride;

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp0, tmp1, dst);
    dst += dst_stride;

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp2, tmp3, dst);
    dst += dst_stride;

    src0 = src4;
  }
}

/* Combined horizontal+vertical 2-tap filter, 4x4.  Horizontally filters
 * height+1 rows (intermediate kept at 16-bit via HORIZ_2TAP_FILT_UH), then
 * applies the vertical taps to adjacent intermediate rows. */
static void common_hv_2ht_2vt_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;

  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
  /* Build the odd intermediate rows by shifting/packing the even ones
   * (two 4-wide rows live in each hz_out register). */
  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);

  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

/* Combined horizontal+vertical 2-tap filter, 4x8: same scheme as the 4x4
 * variant over 9 source rows. */
static void common_hv_2ht_2vt_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
  v16i8 res0, res1, res2, res3;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);
  src8 = LD_SB(src);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, VP8_FILTER_SHIFT);
  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
             hz_out3, hz_out5, 8);
  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);

  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
              vec5, vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

/* 4-wide combined dispatcher.  Only heights 4 and 8 are handled. */
static void common_hv_2ht_2vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  } else if (8 == height) {
    common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  }
}

/* Combined horizontal+vertical 2-tap filter, 8x4.  hz_out0/hz_out1
 * ping-pong as the rolling pair of horizontally-filtered rows fed to the
 * vertical taps. */
static void common_hv_2ht_2vt_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);
  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp0 = __msa_dotp_u_h(vec0, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp1 = __msa_dotp_u_h(vec1, filt_vt);

  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp2 = __msa_dotp_u_h(vec2, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp3 = __msa_dotp_u_h(vec3, filt_vt);

  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
}

/* Combined horizontal+vertical 2-tap filter, 8-wide, height a multiple
 * of 8.  The first horizontally-filtered row is computed before the loop;
 * each iteration produces 8 output rows with the same hz_out0/hz_out1
 * ping-pong as the 8x4 variant. */
static void common_hv_2ht_2vt_8x8mult_msa(
    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
    int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
  v8i16 filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);
  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  src0 = LD_SB(src);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);

  for (loop_cnt = (height >> 3); loop_cnt--;) {
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
    /* Prefetch the next 4 source rows before finishing this group. */
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp4 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp3, tmp4, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp5 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp6 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp7 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp8 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}

/* 8-wide combined dispatcher: 8x4 special case, otherwise 8xN. */
static void common_hv_2ht_2vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  } else {
    common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
                                  filter_horiz, filter_vert, height);
  }
}

/* Combined horizontal+vertical 2-tap filter, 16-wide, height a multiple
 * of 4.  Each 16-pixel row is filtered as two 8-pixel halves; the pair
 * (hz_out0, hz_out2) / (hz_out1, hz_out3) alternates between "previous row"
 * and "current row" roles across iterations. */
static void common_hv_2ht_2vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
  v8i16 filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);
  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  LD_SB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;
  }
}

/* Public 4x4 bilinear predictor.  Dispatches on the sub-pel offsets:
 * both, vertical-only, horizontal-only, or straight copy.  Note the
 * [offset - 1] table lookup is out of bounds when the offset is 0, but the
 * resulting pointer is never dereferenced on that path. */
void vp8_bilinear_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 int32_t xoffset, int32_t yoffset,
                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
  const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
  const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];

  if (yoffset) {
    if (xoffset) {
      common_hv_2ht_2vt_4w_msa(src, src_stride, dst, dst_stride, h_filter,
                               v_filter, 4);
    } else {
      common_vt_2t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
    }
  } else {
    if (xoffset) {
      common_hz_2t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
    } else {
      /* Integer-pel position: plain 4x4 word copy. */
      uint32_t tp0, tp1, tp2, tp3;

      LW4(src, src_stride, tp0, tp1, tp2, tp3);
      SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
    }
  }
}

/* Public 8x4 bilinear predictor; same dispatch scheme as the 4x4 variant. */
void vp8_bilinear_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 int32_t xoffset, int32_t yoffset,
                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
  const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
  const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];

  if (yoffset) {
    if (xoffset) {
      common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride, h_filter,
                               v_filter, 4);
    } else {
      common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
    }
  } else {
    if (xoffset) {
      common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
    } else {
      vp8_copy_mem8x4(src, src_stride, dst, dst_stride);
    }
  }
}

/* Public 8x8 bilinear predictor; same dispatch scheme as the 4x4 variant. */
void vp8_bilinear_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 int32_t xoffset, int32_t yoffset,
                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
  const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
  const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];

  if (yoffset) {
    if (xoffset) {
      common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride, h_filter,
                               v_filter, 8);
    } else {
      common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8);
    }
  } else {
    if (xoffset) {
      common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8);
    } else {
      vp8_copy_mem8x8(src, src_stride, dst, dst_stride);
    }
  }
}

/* Public 16x16 bilinear predictor; same dispatch scheme as the 4x4
 * variant. */
void vp8_bilinear_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                   int32_t xoffset, int32_t yoffset,
                                   uint8_t *RESTRICT dst, int32_t dst_stride) {
  const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
  const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];

  if (yoffset) {
    if (xoffset) {
      common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, h_filter,
                                v_filter, 16);
    } else {
      common_vt_2t_16w_msa(src, src_stride, dst, dst_stride, v_filter, 16);
    }
  } else {
    if (xoffset) {
      common_hz_2t_16w_msa(src, src_stride, dst, dst_stride, h_filter, 16);
    } else {
      vp8_copy_mem16x16(src, src_stride, dst, dst_stride);
    }
  }
}