/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"

/* 8-tap horizontal filter followed by an 8-tap vertical filter on the
 * intermediate rows, with the rounded result averaged into dst
 * (vpx_convolve8_avg), for a 4-pixel-wide column. src is rewound by three
 * rows and three columns so the filters are centred on the output pixel;
 * each loop iteration produces four output rows. */
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= (3 + 3 * src_stride);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);

    SRARI_H2_SH(res0, res1, FILTER_BITS);
    SAT_SH2_SH(res0, res1, 7);
    PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
    XORI_B2_128_UB(tmp0, tmp1);
    AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
    ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
    dst += (4 * dst_stride);

    hz_out5 = hz_out9;
    vec0 = vec2;
    vec1 = vec3;
    vec2 = vec4;
  }
}
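
/* As above, but for an 8-pixel-wide column: seven input rows are filtered
 * horizontally to prime the 8-tap vertical filter, then each loop iteration
 * filters four more rows, averages the rounded results with dst and stores
 * them. The 16/32/64-wide variants that follow run this routine over 2, 4 or
 * 8 adjacent 8-column strips. */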
static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= (3 + 3 * src_stride);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3,
                            dst, dst_stride);
    dst += (4 * dst_stride);

    hz_out6 = hz_out10;
    out0 = out2;
    out1 = out3;
    out2 = out8;
    out4 = out6;
    out5 = out7;
    out6 = out9;
  }
}

static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 2; multiple8_cnt--;) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
                                          filter_horiz, filter_vert, height);
    src += 8;
    dst += 8;
  }
}

static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 4; multiple8_cnt--;) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
                                          filter_horiz, filter_vert, height);
    src += 8;
    dst += 8;
  }
}

static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 8; multiple8_cnt--;) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
                                          filter_horiz, filter_vert, height);
    src += 8;
    dst += 8;
  }
}
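
/* The _2ht_2vt_ routines below are the bilinear counterparts: the horizontal
 * and vertical filters each have only two non-zero taps, so the filtering is
 * done with unsigned byte dot products and one rounding shift per direction,
 * again averaging the rounded result with the existing dst pixels. */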
static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 dst0, dst1, dst2, dst3, res0, res1;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);

  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);

  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);
  src8 = LD_SB(src);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
             hz_out3, hz_out5, 8);
  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);

  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
             dst6);
  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
              tmp1, tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2,
              res3);
  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert);
  } else if (8 == height) {
    common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert);
  }
}
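
/* Bilinear 8-wide variants: the 8x4 routine handles a single group of four
 * rows, the 8x8mult routine loops over the height four rows at a time, and
 * common_hv_2ht_2vt_and_aver_dst_8w_msa picks between them based on height. */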
static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp0 = __msa_dotp_u_h(vec0, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp1 = __msa_dotp_u_h(vec1, filt_vt);

  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp2 = __msa_dotp_u_h(vec2, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp3 = __msa_dotp_u_h(vec3, filt_vt);

  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
                     dst_stride);
}

static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  src0 = LD_SB(src);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
                       dst_stride);
    dst += (4 * dst_stride);
  }
}

static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert);
  } else {
    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
        src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
  }
}
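
/* Bilinear 16-wide variant: each row is filtered as two 8-pixel halves, four
 * rows per loop iteration. The 32- and 64-wide variants below run it over 2
 * or 4 adjacent 16-column strips. */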
static void common_hv_2ht_2vt_and_aver_dst_16w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  LD_SB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
    dst += dst_stride;

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
    dst += dst_stride;
  }
}

static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 2; multiple8_cnt--;) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert, height);
    src += 16;
    dst += 16;
  }
}

static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 4; multiple8_cnt--;) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert, height);
    src += 16;
    dst += 16;
  }
}
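
/* Top-level MSA entry point for vpx_convolve8_avg. Both step_q4 parameters
 * must be 16 (no scaling), and the 16-bit filter taps are narrowed to int8_t
 * before dispatching to the width-specific helpers above. */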
void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  int8_t cnt, filt_hor[8], filt_ver[8];

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
    filt_ver[cnt] = filter_y[cnt];
  }
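
  /* ((const int32_t *)filter_x)[0] packs the first two 16-bit taps; a zero
   * value indicates a bilinear filter (only taps 3 and 4 non-zero), so the
   * 2-tap helpers are called with &filt_hor[3] / &filt_ver[3]. If only one
   * of the two filters is bilinear, fall back to the C implementation. */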
  if (((const int32_t *)filter_x)[0] == 0 &&
      ((const int32_t *)filter_y)[0] == 0) {
    switch (w) {
      case 4:
        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride,
                                              &filt_hor[3], &filt_ver[3], h);
        break;
      case 8:
        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride,
                                              &filt_hor[3], &filt_ver[3], h);
        break;
      case 16:
        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 32:
        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 64:
        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      default:
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
                            x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  } else if (((const int32_t *)filter_x)[0] == 0 ||
             ((const int32_t *)filter_y)[0] == 0) {
    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
  } else {
    switch (w) {
      case 4:
        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, filt_hor,
                                              filt_ver, h);
        break;
      case 8:
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, filt_hor,
                                              filt_ver, h);
        break;
      case 16:
        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride, filt_hor,
                                               filt_ver, h);
        break;
      case 32:
        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride, filt_hor,
                                               filt_ver, h);
        break;
      case 64:
        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride, filt_hor,
                                               filt_ver, h);
        break;
      default:
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
                            x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }
}