/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"

static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
  v16i8 src10998, filt0, filt1, filt2, filt3;
  v16u8 out;
  v8i16 filt, out10, out32;

  src -= (3 * src_stride);

  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
             src54_r, src21_r);
  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
             src4332, src6554);
  XORI_B3_128_SB(src2110, src4332, src6554);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    src += (4 * src_stride);

    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
    XORI_B2_128_SB(src8776, src10998);
    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
                                filt1, filt2, filt3);
    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
                                filt1, filt2, filt3);
    SRARI_H2_SH(out10, out32, FILTER_BITS);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    src2110 = src6554;
    src4332 = src8776;
    src6554 = src10998;
    src6 = src10;
  }
}

static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
  v16u8 tmp0, tmp1;
  v8i16 filt, out0_r, out1_r, out2_r, out3_r;

  src -= (3 * src_stride);

  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);
  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
             src54_r, src21_r);
  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
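    /* Each FILT_8TAP_DPADD_S_H() call (see vpx_convolve_msa.h) evaluates the
     * 8-tap filter for one output row: four signed-byte dot products over the
     * interleaved row pairs, accumulated into 16-bit sums. */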
    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
    tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);

    src10_r = src54_r;
    src32_r = src76_r;
    src54_r = src98_r;
    src21_r = src65_r;
    src43_r = src87_r;
    src65_r = src109_r;
    src6 = src10;
  }
}

static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 filt0, filt1, filt2, filt3;
  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
  v16u8 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

  src -= (3 * src_stride);

  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);
  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
             src54_r, src21_r);
  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
             src54_l, src21_l);
  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
               src87_l, src98_l, src109_l);
    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);
    out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
                                 filt1, filt2, filt3);
    out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
                                 filt1, filt2, filt3);
    out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
                                 filt1, filt2, filt3);
    out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
                                 filt1, filt2, filt3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
    SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
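    /* Pack the even (low) bytes of each left/right half back into 16-byte
     * rows, then XOR with 128 to return from the signed to the unsigned
     * pixel domain. */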
    PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
                tmp0, tmp1, tmp2, tmp3);
    XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    dst += (4 * dst_stride);

    src10_r = src54_r;
    src32_r = src76_r;
    src54_r = src98_r;
    src21_r = src65_r;
    src43_r = src87_r;
    src65_r = src109_r;
    src10_l = src54_l;
    src32_l = src76_l;
    src54_l = src98_l;
    src21_l = src65_l;
    src43_l = src87_l;
    src65_l = src109_l;
    src6 = src10;
  }
}

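/* 8-tap vertical filter for widths that are multiples of 16: the outer loop
 * filters one 16-pixel-wide column per pass, using the same scheme as
 * common_vt_8t_16w_msa(). */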
static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter, int32_t height,
                                      int32_t width) {
  const uint8_t *src_tmp;
  uint8_t *dst_tmp;
  uint32_t loop_cnt, cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 filt0, filt1, filt2, filt3;
  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
  v16u8 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

  src -= (3 * src_stride);

  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  for (cnt = (width >> 4); cnt--;) {
    src_tmp = src;
    dst_tmp = dst;

    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src_tmp += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
               src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
               src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
      LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
      XORI_B4_128_SB(src7, src8, src9, src10);
      src_tmp += (4 * src_stride);
      ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                 src87_r, src98_r, src109_r);
      ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                 src87_l, src98_l, src109_l);
      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
      SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
      SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
      SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
      SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
      PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                  out3_r, tmp0, tmp1, tmp2, tmp3);
      XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
      ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
      dst_tmp += (4 * dst_stride);

      src10_r = src54_r;
      src32_r = src76_r;
      src54_r = src98_r;
      src21_r = src65_r;
      src43_r = src87_r;
      src65_r = src109_r;
      src10_l = src54_l;
      src32_l = src76_l;
      src54_l = src98_l;
      src21_l = src65_l;
      src43_l = src87_l;
      src65_l = src109_l;
      src6 = src10;
    }

    src += 16;
    dst += 16;
  }
}

static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                            32);
}

static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                            64);
}

static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4;
  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
  v16u8 filt0;
  v8i16 filt;
  v8u16 tmp0, tmp1;

  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
}

static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 filt;

  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);

  src8 = LD_SB(src);
  src += src_stride;

  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
             src76_r, src87_r);
  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
             src76_r, src2110, src4332, src6554, src8776);
  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
              tmp0, tmp1, tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

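/* 2-tap (bilinear) vertical filter for an 8x4 block: five input rows form
 * four vertical row pairs, each filtered with the splatted tap pair. */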
static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
  v16i8 out0, out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* rearranging filter_y */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
              tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
}

static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v16i8 out0, out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* rearranging filter_y */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 3); loop_cnt--;) {
    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
    src += (8 * src_stride);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
               vec2, vec3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
               vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0,
                tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0,
                tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    src0 = src8;
  }
}

static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}

static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* rearranging filter_y */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst);
    dst += dst_stride;

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
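    /* vec2/vec3 hold the right/left byte interleaves of rows 1-2; each
     * DOTP_UB2_UH applies the splatted bilinear tap pair, giving 16-bit
     * sums for one 16-pixel row. */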
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst);
    dst += dst_stride;

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst);
    dst += dst_stride;

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst);
    dst += dst_stride;

    src0 = src4;
  }
}

static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* rearranging filter_y */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  src0 = LD_UB(src);
  src5 = LD_UB(src + 16);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);

    LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
    src += (4 * src_stride);

    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);

    ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
    ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst + 16);

    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);

    ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
    ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
    dst += (4 * dst_stride);

    src0 = src4;
    src5 = src9;
  }
}

static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 filt;

  /* rearranging filter_y */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_UB4(src, 16, src0, src3, src6, src9);
  src += src_stride;

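  /* src0/src3/src6/src9 carry the previous row of each 16-byte column across
   * iterations; two output rows are produced per pass. */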
  for (loop_cnt = (height >> 1); loop_cnt--;) {
    LD_UB2(src, src_stride, src1, src2);
    LD_UB2(src + 16, src_stride, src4, src5);
    LD_UB2(src + 32, src_stride, src7, src8);
    LD_UB2(src + 48, src_stride, src10, src11);
    src += (2 * src_stride);

    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst);

    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);

    ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
    ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
    PCKEV_ST_SB(tmp4, tmp5, dst + 16);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
    PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);

    ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
    ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_ST_SB(tmp0, tmp1, dst + 32);

    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);

    ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
    ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
    PCKEV_ST_SB(tmp4, tmp5, dst + 48);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
    PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
    dst += (2 * dst_stride);

    src0 = src2;
    src3 = src5;
    src6 = src8;
    src9 = src11;
  }
}

void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *filter, int x0_q4,
                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  const int16_t *const filter_y = filter[y0_q4];
  int8_t cnt, filt_ver[8];

  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  for (cnt = 8; cnt--;) {
    filt_ver[cnt] = filter_y[cnt];
  }

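  /* Kernels whose first two 16-bit taps are zero are the bilinear ones; for
   * those, only taps 3 and 4 are used, passed to the 2-tap paths as the byte
   * pair at &filt_ver[3]. */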
  if (((const int32_t *)filter_y)[0] == 0) {
    switch (w) {
      case 4:
        common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_ver[3], h);
        break;
      case 8:
        common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_ver[3], h);
        break;
      case 16:
        common_vt_2t_16w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, &filt_ver[3], h);
        break;
      case 32:
        common_vt_2t_32w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, &filt_ver[3], h);
        break;
      case 64:
        common_vt_2t_64w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, &filt_ver[3], h);
        break;
      default:
        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                             x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_ver, h);
        break;
      case 8:
        common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_ver, h);
        break;
      case 16:
        common_vt_8t_16w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, filt_ver, h);
        break;
      case 32:
        common_vt_8t_32w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, filt_ver, h);
        break;
      case 64:
        common_vt_8t_64w_msa(src, (int32_t)src_stride, dst,
                             (int32_t)dst_stride, filt_ver, h);
        break;
      default:
        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                             x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}