vpx_convolve8_avg_msa.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"

/* 4-pixel-wide 2D convolution: 8-tap horizontal then 8-tap vertical filter,
 * with the rounded result averaged against the existing dst pixels
 * (the "avg" predictor). Processes 4 output rows per loop iteration;
 * height is assumed to be a multiple of 4 (height >> 2 loop).
 * Two 4-wide rows are packed per vector, so horizontal filtering consumes
 * source rows in pairs (src0/src1, src2/src3, ...). */
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;

  /* shuffle masks for the 4-wide horizontal filter variant */
  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  /* back up 3 columns and 3 rows so the 8-tap window is centered */
  src -= (3 + 3 * src_stride);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* prime the vertical filter with the first 7 horizontally-filtered rows */
  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  /* XOR with 128: convert unsigned pixels to signed for dotp intrinsics */
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  /* derive the odd intermediate rows (1, 3) by shifting the paired results */
  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  /* interleave consecutive rows to form vertical-tap operand pairs */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    /* pack the four 4-byte dst rows into two vectors for averaging */
    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);

    /* round-shift, clamp to 8-bit signed range, pack, undo the +128 bias */
    SRARI_H2_SH(res0, res1, FILTER_BITS);
    SAT_SH2_SH(res0, res1, 7);
    PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
    XORI_B2_128_UB(tmp0, tmp1);
    /* average filtered output with existing dst, then store 4x4 */
    AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
    ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* slide the vertical-filter window forward by 4 rows */
    hz_out5 = hz_out9;
    vec0 = vec2;
    vec1 = vec3;
    vec2 = vec4;
  }
}

/* 8-pixel-wide 2D convolution: 8-tap horizontal then 8-tap vertical filter,
 * averaged with dst. One full 8-wide row per horizontal-filter call;
 * processes 4 output rows per loop iteration (height multiple of 4). */
static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

  /* shuffle masks for the 8-wide horizontal filter variant */
  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= (3 + 3 * src_stride);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* prime the vertical filter with 7 horizontally-filtered rows */
  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  /* interleaved row pairs: out0..3 feed even output rows, out4..7 odd rows */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    /* round, saturate, and pack+average+store 4 rows of 8 pixels */
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3,
                            dst, dst_stride);
    dst += (4 * dst_stride);

    /* slide the vertical window forward by 4 rows */
    hz_out6 = hz_out10;
    out0 = out2;
    out1 = out3;
    out2 = out8;
    out4 = out6;
    out5 = out7;
    out6 = out9;
  }
}

/* 16-wide 8-tap HxV avg: two 8-wide column passes over the full height. */
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 2; multiple8_cnt--;) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
                                          filter_horiz, filter_vert, height);
    src += 8;
    dst += 8;
  }
}

/* 32-wide 8-tap HxV avg: four 8-wide column passes. */
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 4; multiple8_cnt--;) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
                                          filter_horiz, filter_vert, height);
    src += 8;
    dst += 8;
  }
}

/* 64-wide 8-tap HxV avg: eight 8-wide column passes. */
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 8; multiple8_cnt--;) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
                                          filter_horiz, filter_vert, height);
    src += 8;
    dst += 8;
  }
}

/* 4x4 2D convolution with 2-tap (bilinear) horizontal and vertical filters,
 * averaged with dst. filter_horiz/filter_vert point at the single
 * significant tap pair (see vpx_convolve8_avg_msa, which passes &filt[3]). */
static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 dst0, dst1, dst2, dst3, res0, res1;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);

  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* 5 source rows produce 4 vertically-filtered output rows */
  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  /* reconstruct the odd rows from the packed even-pair results */
  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);

  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
  /* vertical 2-tap filter as an unsigned dot product, then round */
  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

/* 4x8 2-tap HxV convolution averaged with dst (eight output rows). */
static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* 9 source rows for 8 output rows */
  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);
  src8 = LD_SB(src);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
  /* derive odd intermediate rows 1, 3, 5 by shifting; row 7 via pckod */
  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
             hz_out3, hz_out5, 8);
  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);

  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
             dst4, dst6);
  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
              tmp0, tmp1, tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
              res2, res3);
  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
              res2, res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

/* 4-wide 2-tap HxV avg dispatcher; only heights 4 and 8 are handled
 * (other heights fall through silently — callers only pass 4 or 8). */
static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert);
  } else if (8 == height) {
    common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert);
  }
}

/* 8x4 2-tap HxV convolution averaged with dst. */
static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  /* hz_out0/hz_out1 ping-pong as the 2-row vertical filter window */
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp0 = __msa_dotp_u_h(vec0, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp1 = __msa_dotp_u_h(vec1, filt_vt);

  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp2 = __msa_dotp_u_h(vec2, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp3 = __msa_dotp_u_h(vec3, filt_vt);

  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
                     dst, dst_stride);
}

/* 8-wide 2-tap HxV avg for heights that are a multiple of 8
 * (loop actually advances 4 rows per iteration; height >> 2 iterations). */
static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride,
                                                       int8_t *filter_horiz,
                                                       int8_t *filter_vert,
                                                       int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* prime the vertical window with the first horizontally-filtered row */
  src0 = LD_SB(src);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
                       dst, dst_stride);
    dst += (4 * dst_stride);
  }
}

/* 8-wide 2-tap HxV avg dispatcher: height 4 uses the fixed 8x4 kernel,
 * anything else the 8x8-multiple loop. */
static void common_hv_2ht_2vt_and_aver_dst_8w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert);
  } else {
    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
                                               filter_horiz, filter_vert,
                                               height);
  }
}

/* 16-wide 2-tap HxV avg: each 16-pixel row is handled as two 8-pixel
 * halves (src and src + 8); 4 output rows per loop iteration. */
static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* prime the vertical window: left half -> hz_out0, right -> hz_out2 */
  LD_SB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
    dst += dst_stride;

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
    dst += dst_stride;
  }
}

/* 32-wide 2-tap HxV avg: two 16-wide column passes. */
static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 2; multiple8_cnt--;) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert, height);
    src += 16;
    dst += 16;
  }
}

/* 64-wide 2-tap HxV avg: four 16-wide column passes. */
static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 4; multiple8_cnt--;) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert, height);
    src += 16;
    dst += 16;
  }
}

/* Public entry point: 8-tap 2D convolution averaged with dst (RTCD hook).
 * Only the non-scaling case (x_step_q4 == y_step_q4 == 16) is supported.
 * Dispatch: if the first pair of taps of both filters is zero the filter
 * is treated as 2-tap bilinear (only taps 3 and 4 used, hence &filt[3]);
 * if only one filter is bilinear, or w is unsupported, fall back to the
 * C reference vpx_convolve8_avg_c. The int32_t casts on the filter
 * pointers compare two 16-bit taps at a time.
 * NOTE(review): the 0x800000 asserts appear to reject a degenerate
 * tap pattern in taps 2/3 (assumes little-endian layout) — confirm
 * against the generic convolve filter contract. */
void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h) {
  int8_t cnt, filt_hor[8], filt_ver[8];

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  /* narrow the 16-bit taps to int8 for the MSA kernels */
  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
    filt_ver[cnt] = filter_y[cnt];
  }

  if (((const int32_t *)filter_x)[0] == 0 &&
      ((const int32_t *)filter_y)[0] == 0) {
    /* both filters effectively 2-tap: use the bilinear kernels */
    switch (w) {
      case 4:
        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              &filt_hor[3], &filt_ver[3], h);
        break;
      case 8:
        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              &filt_hor[3], &filt_ver[3], h);
        break;
      case 16:
        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 32:
        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 64:
        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      default:
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4, filter_y, y_step_q4,
                            w, h);
        break;
    }
  } else if (((const int32_t *)filter_x)[0] == 0 ||
             ((const int32_t *)filter_y)[0] == 0) {
    /* mixed 2-tap/8-tap combination: no MSA kernel, use the C path */
    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4, filter_y, y_step_q4,
                        w, h);
  } else {
    /* full 8-tap in both directions */
    switch (w) {
      case 4:
        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              filt_hor, filt_ver, h);
        break;
      case 8:
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              filt_hor, filt_ver, h);
        break;
      case 16:
        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               filt_hor, filt_ver, h);
        break;
      case 32:
        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               filt_hor, filt_ver, h);
        break;
      case 64:
        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               filt_hor, filt_ver, h);
        break;
      default:
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4, filter_y, y_step_q4,
                            w, h);
        break;
    }
  }
}