1da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* 2da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * 4da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 5da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 6da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 7da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 8da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 9da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian */ 10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include <assert.h> 12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h" 13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/mips/vpx_convolve_msa.h" 14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 157bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_8ht_8vt_and_aver_dst_4w_msa( 167bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 177bc9febe8749e98a3812a0dc4380ceae75c29450Johann int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 18da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 19df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t tp0, tp1, tp2, tp3; 20da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 21df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 dst0 = { 0 }, mask0, mask1, mask2, mask3, res; 22da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; 23da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 24da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4; 25da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; 26da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 27da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask0 = LD_UB(&mc_filt_mask_arr[16]); 28da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src -= (3 + 3 * src_stride); 29da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 30da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rearranging filter */ 31da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_horiz); 32da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); 33da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 34da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask1 = mask0 + 2; 35da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask2 = mask0 + 4; 36da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask3 = mask0 + 6; 37da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (7 * src_stride); 41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 42da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, 43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, 45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 46da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, 47da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 48da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, 49da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 50da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); 51da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_vert); 53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); 54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 56da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); 57da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src, src_stride, src7, src8, src9, src10); 60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian XORI_B4_128_SB(src7, src8, src9, src10); 61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 63df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LW4(dst, dst_stride, tp0, tp1, tp2, tp3); 64df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 657bc9febe8749e98a3812a0dc4380ceae75c29450Johann hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, 667bc9febe8749e98a3812a0dc4380ceae75c29450Johann filt_hz1, filt_hz2, filt_hz3); 67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); 68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); 69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1, 70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt2, filt_vt3); 71da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 727bc9febe8749e98a3812a0dc4380ceae75c29450Johann hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, 737bc9febe8749e98a3812a0dc4380ceae75c29450Johann filt_hz1, filt_hz2, filt_hz3); 74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); 75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); 76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, 77da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt2, filt_vt3); 78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 79da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_SH(res0, res1, FILTER_BITS); 80da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SAT_SH2_SH(res0, res1, 7); 81df37111358d02836cb29bbcb9c6e4c95dff90a16Johann res = PCKEV_XORI128_UB(res0, res1); 82df37111358d02836cb29bbcb9c6e4c95dff90a16Johann res = (v16u8)__msa_aver_u_b(res, dst0); 83df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); 84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out5 = hz_out9; 87da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = vec2; 88da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = vec3; 89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = vec4; 90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 92da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 937bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_8ht_8vt_and_aver_dst_8w_msa( 947bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 957bc9febe8749e98a3812a0dc4380ceae75c29450Johann int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 96da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 97df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint64_t tp0, tp1, tp2, tp3; 98da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; 100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; 101df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 dst0 = { 0 }, dst1 = { 0 }, mask0, mask1, mask2, mask3; 102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; 104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; 105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask0 = LD_UB(&mc_filt_mask_arr[0]); 107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src -= (3 + 3 * src_stride); 108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rearranging filter */ 110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_horiz); 111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); 112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask1 = mask0 + 2; 114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask2 = mask0 + 4; 115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask3 = mask0 + 6; 116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (7 * src_stride); 119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, 122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, 124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, 126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, 128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, 130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, 132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, 134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_vert); 137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); 138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); 141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); 142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src, src_stride, src7, src8, src9, src10); 145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian XORI_B4_128_SB(src7, src8, src9, src10); 146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 148df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 149df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(tp0, tp1, dst0); 150df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(tp2, tp3, dst1); 151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1527bc9febe8749e98a3812a0dc4380ceae75c29450Johann hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, 1537bc9febe8749e98a3812a0dc4380ceae75c29450Johann filt_hz1, filt_hz2, filt_hz3); 154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); 155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, 156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt2, filt_vt3); 157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1587bc9febe8749e98a3812a0dc4380ceae75c29450Johann hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, 1597bc9febe8749e98a3812a0dc4380ceae75c29450Johann filt_hz1, filt_hz2, filt_hz3); 160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); 161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, 162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt2, filt_vt3); 163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1647bc9febe8749e98a3812a0dc4380ceae75c29450Johann hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, 1657bc9febe8749e98a3812a0dc4380ceae75c29450Johann filt_hz1, filt_hz2, filt_hz3); 166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); 167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, 168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt2, filt_vt3); 169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, 171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz0, filt_hz1, filt_hz2, filt_hz3); 172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); 173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, 174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt2, filt_vt3); 175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); 177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 178df37111358d02836cb29bbcb9c6e4c95dff90a16Johann CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, 1797bc9febe8749e98a3812a0dc4380ceae75c29450Johann dst_stride); 180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out6 = hz_out10; 183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out0 = out2; 184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out1 = out3; 185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out2 = out8; 186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out4 = out6; 187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out5 = out7; 188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out6 = out9; 189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1927bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_8ht_8vt_and_aver_dst_16w_msa( 1937bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 1947bc9febe8749e98a3812a0dc4380ceae75c29450Johann int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t multiple8_cnt; 196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (multiple8_cnt = 2; multiple8_cnt--;) { 197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, 198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_horiz, filter_vert, height); 199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 8; 200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 8; 201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2047bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_8ht_8vt_and_aver_dst_32w_msa( 2057bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 2067bc9febe8749e98a3812a0dc4380ceae75c29450Johann int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t multiple8_cnt; 208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (multiple8_cnt = 4; multiple8_cnt--;) { 209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, 210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_horiz, filter_vert, height); 211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 8; 212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 8; 213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2167bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_8ht_8vt_and_aver_dst_64w_msa( 2177bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 2187bc9febe8749e98a3812a0dc4380ceae75c29450Johann int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t multiple8_cnt; 220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (multiple8_cnt = 8; multiple8_cnt--;) { 221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, 222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_horiz, filter_vert, height); 223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 8; 224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 8; 225da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2287bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_2ht_2vt_and_aver_dst_4x4_msa( 2297bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 2307bc9febe8749e98a3812a0dc4380ceae75c29450Johann int8_t *filter_horiz, int8_t *filter_vert) { 231df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t tp0, tp1, tp2, tp3; 232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, mask; 233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt_hz, filt_vt, vec0, vec1; 234df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 dst0 = { 0 }, out; 235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt; 236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask = LD_SB(&mc_filt_mask_arr[16]); 238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rearranging filter */ 240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_UH(filter_horiz); 241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); 242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_UH(filter_vert); 244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); 245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); 249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); 250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); 252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); 253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 255df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LW4(dst, dst_stride, tp0, tp1, tp2, tp3); 256df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 259df37111358d02836cb29bbcb9c6e4c95dff90a16Johann out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 260df37111358d02836cb29bbcb9c6e4c95dff90a16Johann out = __msa_aver_u_b(out, dst0); 261df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); 262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 2647bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_2ht_2vt_and_aver_dst_4x8_msa( 2657bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 2667bc9febe8749e98a3812a0dc4380ceae75c29450Johann int8_t *filter_horiz, int8_t *filter_vert) { 267df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t tp0, tp1, tp2, tp3; 268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; 269df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1; 270df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 dst0 = { 0 }, dst1 = { 0 }; 271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; 273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 filt; 274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask = LD_SB(&mc_filt_mask_arr[16]); 276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rearranging filter */ 278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_horiz); 279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_splati_h(filt, 0); 280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_vert); 282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_splati_h(filt, 0); 283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (8 * src_stride); 286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src8 = LD_SB(src); 287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); 289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); 290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); 291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); 292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); 293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, 294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3, hz_out5, 8); 295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); 296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 297df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LW4(dst, dst_stride, tp0, tp1, tp2, tp3); 298df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); 299df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); 300df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); 301da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 302da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); 3037bc9febe8749e98a3812a0dc4380ceae75c29450Johann DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0, 3047bc9febe8749e98a3812a0dc4380ceae75c29450Johann tmp1, tmp2, tmp3); 305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); 306df37111358d02836cb29bbcb9c6e4c95dff90a16Johann PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1); 307df37111358d02836cb29bbcb9c6e4c95dff90a16Johann AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); 308df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ST4x8_UB(res0, res1, dst, dst_stride); 309da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3117bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_2ht_2vt_and_aver_dst_4w_msa( 3127bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 3137bc9febe8749e98a3812a0dc4380ceae75c29450Johann int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 314da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (4 == height) { 315da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, 316da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_horiz, filter_vert); 317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else if (8 == height) { 318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, 319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_horiz, filter_vert); 320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3237bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_2ht_2vt_and_aver_dst_8x4_msa( 3247bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 3257bc9febe8749e98a3812a0dc4380ceae75c29450Johann int8_t *filter_horiz, int8_t *filter_vert) { 326df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint64_t tp0, tp1, tp2, tp3; 327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, mask; 328df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 filt_hz, filt_vt, dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3; 329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; 330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 filt; 331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask = LD_SB(&mc_filt_mask_arr[0]); 333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 334da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rearranging filter */ 335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_horiz); 336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_splati_h(filt, 0); 337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_vert); 339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_splati_h(filt, 0); 340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (5 * src_stride); 343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 344df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 345df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(tp0, tp1, dst0); 346df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(tp2, tp3, dst1); 347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = __msa_dotp_u_h(vec0, filt_vt); 351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); 353da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = __msa_dotp_u_h(vec1, filt_vt); 355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); 357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = __msa_dotp_u_h(vec2, filt_vt); 359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = __msa_dotp_u_h(vec3, filt_vt); 363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); 365df37111358d02836cb29bbcb9c6e4c95dff90a16Johann PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); 366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 3687bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( 3697bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 3707bc9febe8749e98a3812a0dc4380ceae75c29450Johann int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 372df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint64_t tp0, tp1, tp2, tp3; 373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, mask; 374df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 filt_hz, filt_vt, vec0, dst0 = { 0 }, dst1 = { 0 }; 375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; 376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 filt; 377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask = LD_SB(&mc_filt_mask_arr[0]); 379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rearranging filter */ 381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_horiz); 382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_splati_h(filt, 0); 383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_vert); 385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_splati_h(filt, 0); 386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = LD_SB(src); 388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src, src_stride, src1, src2, src3, src4); 394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = __msa_dotp_u_h(vec0, filt_vt); 399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); 401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = __msa_dotp_u_h(vec0, filt_vt); 403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); 407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = __msa_dotp_u_h(vec0, filt_vt); 409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 410da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = __msa_dotp_u_h(vec0, filt_vt); 413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 415df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD4(dst, dst_stride, tp0, tp1, tp2, tp3); 416df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(tp0, tp1, dst0); 417df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(tp2, tp3, dst1); 418df37111358d02836cb29bbcb9c6e4c95dff90a16Johann PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); 419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 4237bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_2ht_2vt_and_aver_dst_8w_msa( 4247bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 4257bc9febe8749e98a3812a0dc4380ceae75c29450Johann int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (4 == height) { 427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, 428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_horiz, filter_vert); 429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 4307bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( 4317bc9febe8749e98a3812a0dc4380ceae75c29450Johann src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); 432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 4357bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_2ht_2vt_and_aver_dst_16w_msa( 4367bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 4377bc9febe8749e98a3812a0dc4380ceae75c29450Johann int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; 441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; 442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 filt; 443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask = LD_SB(&mc_filt_mask_arr[0]); 445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rearranging filter */ 447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_horiz); 448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_splati_h(filt, 0); 449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 450da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_vert); 451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_splati_h(filt, 0); 452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB2(src, 8, src0, src1); 454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src, src_stride, src0, src2, src4, src6); 461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); 471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += dst_stride; 472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); 474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); 475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 477da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst); 479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += dst_stride; 480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); 483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); 487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += dst_stride; 488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); 490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); 491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst); 495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += dst_stride; 496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 4997bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_2ht_2vt_and_aver_dst_32w_msa( 5007bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 5017bc9febe8749e98a3812a0dc4380ceae75c29450Johann int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t multiple8_cnt; 503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (multiple8_cnt = 2; multiple8_cnt--;) { 504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, 505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_horiz, filter_vert, height); 506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 5117bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_2ht_2vt_and_aver_dst_64w_msa( 5127bc9febe8749e98a3812a0dc4380ceae75c29450Johann const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 5137bc9febe8749e98a3812a0dc4380ceae75c29450Johann int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { 514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t multiple8_cnt; 515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (multiple8_cnt = 4; multiple8_cnt--;) { 516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, 517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_horiz, filter_vert, height); 518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, 524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint8_t *dst, ptrdiff_t dst_stride, 525df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const InterpKernel *filter, int x0_q4, int x_step_q4, 526df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int y0_q4, int y_step_q4, int w, int h) { 527df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *const filter_x = filter[x0_q4]; 528df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *const filter_y = filter[y0_q4]; 529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t cnt, filt_hor[8], filt_ver[8]; 530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian assert(x_step_q4 == 16); 532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian assert(y_step_q4 == 16); 533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian assert(((const int32_t *)filter_x)[1] != 0x800000); 534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian assert(((const int32_t *)filter_y)[1] != 0x800000); 535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (cnt = 0; cnt < 8; ++cnt) { 537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hor[cnt] = filter_x[cnt]; 538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_ver[cnt] = filter_y[cnt]; 539da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (((const int32_t *)filter_x)[0] == 0 && 542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ((const int32_t *)filter_y)[0] == 0) { 543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian switch (w) { 544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 4: 5457bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, 5467bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, &filt_hor[3], 5477bc9febe8749e98a3812a0dc4380ceae75c29450Johann &filt_ver[3], h); 548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 8: 5507bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, 5517bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, &filt_hor[3], 5527bc9febe8749e98a3812a0dc4380ceae75c29450Johann &filt_ver[3], h); 553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 16: 5557bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, 5567bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, 557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian &filt_hor[3], &filt_ver[3], h); 558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 32: 5607bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, 5617bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, 562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian &filt_hor[3], &filt_ver[3], h); 563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 64: 5657bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, 5667bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, 567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian &filt_hor[3], &filt_ver[3], h); 568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian default: 570df37111358d02836cb29bbcb9c6e4c95dff90a16Johann vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, 571df37111358d02836cb29bbcb9c6e4c95dff90a16Johann x_step_q4, y0_q4, y_step_q4, w, h); 572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else if (((const int32_t *)filter_x)[0] == 0 || 575da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ((const int32_t *)filter_y)[0] == 0) { 576df37111358d02836cb29bbcb9c6e4c95dff90a16Johann vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, 577df37111358d02836cb29bbcb9c6e4c95dff90a16Johann x_step_q4, y0_q4, y_step_q4, w, h); 578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian switch (w) { 580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 4: 5817bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, 5827bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, filt_hor, 5837bc9febe8749e98a3812a0dc4380ceae75c29450Johann filt_ver, h); 584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 8: 5867bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, 5877bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, filt_hor, 5887bc9febe8749e98a3812a0dc4380ceae75c29450Johann filt_ver, h); 589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 590da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 16: 5917bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, 5927bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, filt_hor, 5937bc9febe8749e98a3812a0dc4380ceae75c29450Johann filt_ver, h); 594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 32: 5967bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, 5977bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, filt_hor, 5987bc9febe8749e98a3812a0dc4380ceae75c29450Johann filt_ver, h); 599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 64: 6017bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, 6027bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, filt_hor, 6037bc9febe8749e98a3812a0dc4380ceae75c29450Johann filt_ver, h); 604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian default: 606df37111358d02836cb29bbcb9c6e4c95dff90a16Johann vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, 607df37111358d02836cb29bbcb9c6e4c95dff90a16Johann x_step_q4, y0_q4, y_step_q4, w, h); 608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 612