1da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/* 2da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * 4da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 5da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 6da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 7da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 8da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 9da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian */ 10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include <assert.h> 12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h" 13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/mips/vpx_convolve_msa.h" 14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 15da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianconst uint8_t mc_filt_mask_arr[16 * 3] = { 16da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 8 width cases */ 17da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 18da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 width cases */ 19da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, 20da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* 4 width cases */ 21da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 22da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}; 23da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 24da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, 25da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint8_t *dst, int32_t dst_stride, 26da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_horiz, int8_t *filter_vert, 27da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t height) { 28da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 29da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 30da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; 31da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 mask0, mask1, mask2, mask3, out; 32da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 33da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4; 34da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; 35da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 36da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask0 = LD_UB(&mc_filt_mask_arr[16]); 37da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src -= (3 + 3 * src_stride); 38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rearranging filter */ 40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_horiz); 41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); 42da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask1 = mask0 + 2; 44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask2 = mask0 + 4; 45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask3 = mask0 + 6; 46da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 47da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 48da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 49da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (7 * src_stride); 50da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 51da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, 52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, 54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, 56da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 57da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, 58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); 60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_vert); 62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); 63da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 64da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 65da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); 66da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src, src_stride, src7, src8, src9, src10); 69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian XORI_B4_128_SB(src7, src8, src9, src10); 70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 71da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 727bc9febe8749e98a3812a0dc4380ceae75c29450Johann hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, 737bc9febe8749e98a3812a0dc4380ceae75c29450Johann filt_hz1, filt_hz2, filt_hz3); 74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); 75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); 76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, 77da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt2, filt_vt3); 78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 797bc9febe8749e98a3812a0dc4380ceae75c29450Johann hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, 807bc9febe8749e98a3812a0dc4380ceae75c29450Johann filt_hz1, filt_hz2, filt_hz3); 81da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); 82da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); 83da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1, 84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt2, filt_vt3); 85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_SH(tmp0, tmp1, FILTER_BITS); 86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SAT_SH2_SH(tmp0, tmp1, 7); 87da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out = PCKEV_XORI128_UB(tmp0, tmp1); 88da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); 89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out5 = hz_out9; 92da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out0 = out2; 93da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out1 = out3; 94da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out2 = out4; 95da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 96da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 97da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 98da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, 99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint8_t *dst, int32_t dst_stride, 100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_horiz, int8_t *filter_vert, 101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t height) { 102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; 105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 mask0, mask1, mask2, mask3, vec0, vec1; 106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; 107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; 109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; 110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask0 = LD_UB(&mc_filt_mask_arr[0]); 112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src -= (3 + 3 * src_stride); 113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rearranging filter */ 115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_horiz); 116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); 117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask1 = mask0 + 2; 119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask2 = mask0 + 4; 120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask3 = mask0 + 6; 121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); 123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (7 * src_stride); 124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); 126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, 127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, 129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, 131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, 133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, 135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, 137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, 139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz1, filt_hz2, filt_hz3); 140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_vert); 142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); 143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); 145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); 146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); 147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src, src_stride, src7, src8, src9, src10); 150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian XORI_B4_128_SB(src7, src8, src9, src10); 153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1547bc9febe8749e98a3812a0dc4380ceae75c29450Johann hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, 1557bc9febe8749e98a3812a0dc4380ceae75c29450Johann filt_hz1, filt_hz2, filt_hz3); 156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); 157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, 158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt2, filt_vt3); 159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1607bc9febe8749e98a3812a0dc4380ceae75c29450Johann hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, 1617bc9febe8749e98a3812a0dc4380ceae75c29450Johann filt_hz1, filt_hz2, filt_hz3); 162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); 163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, 164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt2, filt_vt3); 165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 1667bc9febe8749e98a3812a0dc4380ceae75c29450Johann hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, 1677bc9febe8749e98a3812a0dc4380ceae75c29450Johann filt_hz1, filt_hz2, filt_hz3); 168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); 169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, 170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt2, filt_vt3); 171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, 173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz0, filt_hz1, filt_hz2, filt_hz3); 174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); 175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, 176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt2, filt_vt3); 177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); 178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); 179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = PCKEV_XORI128_UB(tmp0, tmp1); 180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = PCKEV_XORI128_UB(tmp2, tmp3); 181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST8x4_UB(vec0, vec1, dst, dst_stride); 182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out6 = hz_out10; 185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out0 = out2; 186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out1 = out3; 187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out2 = out8; 188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out4 = out6; 189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out5 = out7; 190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian out6 = out9; 191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride, 195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint8_t *dst, int32_t dst_stride, 196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_horiz, int8_t *filter_vert, 197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t height) { 198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t multiple8_cnt; 199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (multiple8_cnt = 2; multiple8_cnt--;) { 200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, 201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_vert, height); 202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 8; 203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 8; 204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride, 208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint8_t *dst, int32_t dst_stride, 209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_horiz, int8_t *filter_vert, 210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t height) { 211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t multiple8_cnt; 212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (multiple8_cnt = 4; multiple8_cnt--;) { 213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, 214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_vert, height); 215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 8; 216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 8; 217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride, 221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint8_t *dst, int32_t dst_stride, 222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_horiz, int8_t *filter_vert, 223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t height) { 224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t multiple8_cnt; 225da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (multiple8_cnt = 8; multiple8_cnt--;) { 226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, 227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_vert, height); 228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 8; 229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 8; 230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, 234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint8_t *dst, int32_t dst_stride, 235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_horiz, 236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_vert) { 237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, mask; 238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1; 239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1; 240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask = LD_SB(&mc_filt_mask_arr[16]); 242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rearranging filter */ 244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_UH(filter_horiz); 245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); 246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_UH(filter_vert); 248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); 249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); 252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); 253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); 255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); 256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); 259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); 261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); 262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, 265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint8_t *dst, int32_t dst_stride, 266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_horiz, 267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_vert) { 268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; 269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 res0, res1, res2, res3; 270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; 271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; 272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt; 273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask = LD_SB(&mc_filt_mask_arr[16]); 275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rearranging filter */ 277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_UH(filter_horiz); 278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); 279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_UH(filter_vert); 281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); 282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (8 * src_stride); 285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src8 = LD_SB(src); 286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); 288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); 289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); 290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); 291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); 292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, 293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3, hz_out5, 8); 294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); 295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); 2987bc9febe8749e98a3812a0dc4380ceae75c29450Johann DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4, 2997bc9febe8749e98a3812a0dc4380ceae75c29450Johann vec5, vec6, vec7); 300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); 3017bc9febe8749e98a3812a0dc4380ceae75c29450Johann PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, 3027bc9febe8749e98a3812a0dc4380ceae75c29450Johann res3); 303da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); 304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); 306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 308da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride, 309da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint8_t *dst, int32_t dst_stride, 310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_horiz, int8_t *filter_vert, 311da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t height) { 312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (4 == height) { 313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz, 314da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_vert); 315da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else if (8 == height) { 316da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz, 317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_vert); 318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, 322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint8_t *dst, int32_t dst_stride, 323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_horiz, 324da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_vert) { 325da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, mask, out0, out1; 326da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; 327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; 328da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 filt; 329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask = LD_SB(&mc_filt_mask_arr[0]); 331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rearranging filter */ 333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_horiz); 334da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_splati_h(filt, 0); 335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_vert); 337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_splati_h(filt, 0); 338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp0 = __msa_dotp_u_h(vec0, filt_vt); 345da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 346da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); 347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = __msa_dotp_u_h(vec1, filt_vt); 349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); 351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = __msa_dotp_u_h(vec2, filt_vt); 353da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = __msa_dotp_u_h(vec3, filt_vt); 357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); 359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); 360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST8x4_UB(out0, out1, dst, dst_stride); 361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, 3647bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t src_stride, uint8_t *dst, 365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t dst_stride, 366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_horiz, 3677bc9febe8749e98a3812a0dc4380ceae75c29450Johann int8_t *filter_vert, int32_t height) { 368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, mask, out0, out1; 370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt_hz, filt_vt, vec0; 371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; 372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 filt; 373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask = LD_SB(&mc_filt_mask_arr[0]); 375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rearranging filter */ 377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_horiz); 378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_splati_h(filt, 0); 379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_vert); 381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_splati_h(filt, 0); 382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src0 = LD_SB(src); 384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 3); loop_cnt--;) { 389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src, src_stride, src1, src2, src3, src4); 390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp1 = __msa_dotp_u_h(vec0, filt_vt); 395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); 397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp2 = __msa_dotp_u_h(vec0, filt_vt); 399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); 401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); 403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp3 = __msa_dotp_u_h(vec0, filt_vt); 405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src, src_stride, src1, src2, src3, src4); 408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 410da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp4 = __msa_dotp_u_h(vec0, filt_vt); 411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp3, tmp4, FILTER_BITS); 413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); 414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST8x4_UB(out0, out1, dst, dst_stride); 415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp5 = __msa_dotp_u_h(vec0, filt_vt); 420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); 422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 423da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp6 = __msa_dotp_u_h(vec0, filt_vt); 424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); 426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); 427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp7 = __msa_dotp_u_h(vec0, filt_vt); 428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); 431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian tmp8 = __msa_dotp_u_h(vec0, filt_vt); 432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS); 434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); 435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ST8x4_UB(out0, out1, dst, dst_stride); 436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += (4 * dst_stride); 437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride, 441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint8_t *dst, int32_t dst_stride, 442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_horiz, int8_t *filter_vert, 443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t height) { 444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (4 == height) { 445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz, 446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_vert); 447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride, 449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_horiz, filter_vert, height); 450da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride, 454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint8_t *dst, int32_t dst_stride, 455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_horiz, int8_t *filter_vert, 456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t height) { 457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint32_t loop_cnt; 458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; 459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v16u8 filt_hz, filt_vt, vec0, vec1; 460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; 461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian v8i16 filt; 462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian mask = LD_SB(&mc_filt_mask_arr[0]); 464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian /* rearranging filter */ 466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_horiz); 467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hz = (v16u8)__msa_splati_h(filt, 0); 468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt = LD_SH(filter_vert); 470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_vt = (v16u8)__msa_splati_h(filt, 0); 471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB2(src, 8, src0, src1); 473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += src_stride; 474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 477da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (loop_cnt = (height >> 2); loop_cnt--;) { 479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src, src_stride, src0, src2, src4, src6); 480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian LD_SB4(src + 8, src_stride, src1, src3, src5, src7); 481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += (4 * src_stride); 482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); 484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); 485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); 487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); 488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_ST_SB(tmp1, tmp2, dst); 489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += dst_stride; 490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); 492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); 493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); 495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); 496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_ST_SB(tmp1, tmp2, dst); 497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += dst_stride; 498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); 500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); 501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); 502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); 503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); 504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_ST_SB(tmp1, tmp2, dst); 505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += dst_stride; 506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); 508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); 509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); 510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); 511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); 512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian PCKEV_ST_SB(tmp1, tmp2, dst); 513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += dst_stride; 514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride, 518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint8_t *dst, int32_t dst_stride, 519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_horiz, int8_t *filter_vert, 520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t height) { 521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t multiple8_cnt; 522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (multiple8_cnt = 2; multiple8_cnt--;) { 523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz, 524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_vert, height); 525da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, 531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian uint8_t *dst, int32_t dst_stride, 532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t *filter_horiz, int8_t *filter_vert, 533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t height) { 534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int32_t multiple8_cnt; 535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (multiple8_cnt = 4; multiple8_cnt--;) { 536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz, 537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filter_vert, height); 538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian src += 16; 539da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian dst += 16; 540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 5437bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, 544df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ptrdiff_t dst_stride, const InterpKernel *filter, 545df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int x0_q4, int32_t x_step_q4, int y0_q4, 5467bc9febe8749e98a3812a0dc4380ceae75c29450Johann int32_t y_step_q4, int32_t w, int32_t h) { 547df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *const filter_x = filter[x0_q4]; 548df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *const filter_y = filter[y0_q4]; 549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian int8_t cnt, filt_hor[8], filt_ver[8]; 550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian assert(x_step_q4 == 16); 552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian assert(y_step_q4 == 16); 553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian assert(((const int32_t *)filter_x)[1] != 0x800000); 554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian assert(((const int32_t *)filter_y)[1] != 0x800000); 555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian for (cnt = 0; cnt < 8; ++cnt) { 557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_hor[cnt] = filter_x[cnt]; 558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian filt_ver[cnt] = filter_y[cnt]; 559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian 561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian if (((const int32_t *)filter_x)[0] == 0 && 562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ((const int32_t *)filter_y)[0] == 0) { 563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian switch (w) { 564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 4: 5657bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst, 5667bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, &filt_hor[3], 5677bc9febe8749e98a3812a0dc4380ceae75c29450Johann &filt_ver[3], (int32_t)h); 568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 8: 5707bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst, 5717bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, &filt_hor[3], 5727bc9febe8749e98a3812a0dc4380ceae75c29450Johann &filt_ver[3], (int32_t)h); 573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 16: 5757bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst, 5767bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, &filt_hor[3], 5777bc9febe8749e98a3812a0dc4380ceae75c29450Johann &filt_ver[3], (int32_t)h); 578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 32: 5807bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst, 5817bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, &filt_hor[3], 5827bc9febe8749e98a3812a0dc4380ceae75c29450Johann &filt_ver[3], (int32_t)h); 583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 64: 5857bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst, 5867bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, &filt_hor[3], 5877bc9febe8749e98a3812a0dc4380ceae75c29450Johann &filt_ver[3], (int32_t)h); 588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian default: 590df37111358d02836cb29bbcb9c6e4c95dff90a16Johann vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, 591df37111358d02836cb29bbcb9c6e4c95dff90a16Johann x_step_q4, y0_q4, y_step_q4, w, h); 592da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else if (((const int32_t *)filter_x)[0] == 0 || 595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian ((const int32_t *)filter_y)[0] == 0) { 596df37111358d02836cb29bbcb9c6e4c95dff90a16Johann vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, 597df37111358d02836cb29bbcb9c6e4c95dff90a16Johann y0_q4, y_step_q4, w, h); 598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } else { 599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian switch (w) { 600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 4: 6017bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst, 6027bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, filt_hor, filt_ver, 6037bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)h); 604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 8: 6067bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst, 6077bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, filt_hor, filt_ver, 6087bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)h); 609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 16: 6117bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst, 6127bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, filt_hor, filt_ver, 6137bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)h); 614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 32: 6167bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst, 6177bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, filt_hor, filt_ver, 6187bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)h); 619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian case 64: 6217bc9febe8749e98a3812a0dc4380ceae75c29450Johann common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst, 6227bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)dst_stride, filt_hor, filt_ver, 6237bc9febe8749e98a3812a0dc4380ceae75c29450Johann (int32_t)h); 624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian default: 626df37111358d02836cb29bbcb9c6e4c95dff90a16Johann vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, 627df37111358d02836cb29bbcb9c6e4c95dff90a16Johann x_step_q4, y0_q4, y_step_q4, w, h); 628da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian break; 629da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian } 631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian} 632df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 633df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void filter_horiz_w4_msa(const uint8_t *src_x, ptrdiff_t src_pitch, 634df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint8_t *dst, const int16_t *x_filter) { 635df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint64_t srcd0, srcd1, srcd2, srcd3; 636df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t res; 637df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 src0 = { 0 }, src1 = { 0 }, dst0; 638df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 out0, out1; 639df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 shf1 = { 0, 8, 16, 24, 4, 12, 20, 28, 1, 9, 17, 25, 5, 13, 21, 29 }; 640df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 shf2 = shf1 + 2; 641df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 }; 642df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 filt_shf1 = filt_shf0 + 2; 643df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 filt_shf2 = filt_shf0 + 4; 644df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 filt_shf3 = filt_shf0 + 6; 645df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v8i16 filt, src0_h, src1_h, src2_h, src3_h, filt0, filt1, filt2, filt3; 646df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 647df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); 648df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(srcd0, srcd1, src0); 649df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(srcd2, srcd3, src1); 650df37111358d02836cb29bbcb9c6e4c95dff90a16Johann VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1); 651df37111358d02836cb29bbcb9c6e4c95dff90a16Johann XORI_B2_128_SB(out0, out1); 652df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out0, src0_h, src1_h); 653df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out1, src2_h, src3_h); 654df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 655df37111358d02836cb29bbcb9c6e4c95dff90a16Johann filt = LD_SH(x_filter); 656df37111358d02836cb29bbcb9c6e4c95dff90a16Johann VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1); 657df37111358d02836cb29bbcb9c6e4c95dff90a16Johann VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3); 658df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 659df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h *= filt0; 660df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h += src1_h * filt1; 661df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h += src2_h * filt2; 662df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h += src3_h * filt3; 663df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 664df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8); 665df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 666df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h = __msa_adds_s_h(src0_h, src1_h); 667df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h = __msa_srari_h(src0_h, FILTER_BITS); 668df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h = __msa_sat_s_h(src0_h, 7); 669df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst0 = PCKEV_XORI128_UB(src0_h, src0_h); 670df37111358d02836cb29bbcb9c6e4c95dff90a16Johann res = __msa_copy_u_w((v4i32)dst0, 0); 671df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SW(res, dst); 672df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 673df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 674df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void filter_horiz_w8_msa(const uint8_t *src_x, ptrdiff_t src_pitch, 675df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint8_t *dst, const int16_t *x_filter) { 676df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint64_t srcd0, srcd1, srcd2, srcd3; 677df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; 678df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 tmp0, tmp1, tmp2, tmp3, dst0; 679df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 out0, out1, out2, out3; 680df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; 681df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 shf2 = shf1 + 4; 682df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; 683df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; 684df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 685df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); 686df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(srcd0, srcd1, src0); 687df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(srcd2, srcd3, src1); 688df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); 689df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(srcd0, srcd1, src2); 690df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(srcd2, srcd3, src3); 691df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 692df37111358d02836cb29bbcb9c6e4c95dff90a16Johann filt = LD_SH(x_filter); 693df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 694df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); 695df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 696df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // transpose 697df37111358d02836cb29bbcb9c6e4c95dff90a16Johann VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1); 698df37111358d02836cb29bbcb9c6e4c95dff90a16Johann VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3); 699df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ILVRL_W2_SB(tmp2, tmp0, out0, out1); 700df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ILVRL_W2_SB(tmp3, tmp1, out2, out3); 701df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 702df37111358d02836cb29bbcb9c6e4c95dff90a16Johann XORI_B4_128_SB(out0, out1, out2, out3); 703df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out0, src0_h, src1_h); 704df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out1, src2_h, src3_h); 705df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out2, src4_h, src5_h); 706df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out3, src6_h, src7_h); 707df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 708df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h *= filt0; 709df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src4_h *= filt4; 710df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h += src1_h * filt1; 711df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src4_h += src5_h * filt5; 712df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h += src2_h * filt2; 713df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src4_h += src6_h * filt6; 714df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h += src3_h * filt3; 715df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src4_h += src7_h * filt7; 716df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 717df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h = __msa_adds_s_h(src0_h, src4_h); 718df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h = __msa_srari_h(src0_h, FILTER_BITS); 719df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h = __msa_sat_s_h(src0_h, 7); 720df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst0 = PCKEV_XORI128_UB(src0_h, src0_h); 721df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ST8x1_UB(dst0, dst); 722df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 723df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 724df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void filter_horiz_w16_msa(const uint8_t *src_x, ptrdiff_t src_pitch, 725df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint8_t *dst, const int16_t *x_filter) { 726df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint64_t srcd0, srcd1, srcd2, srcd3; 727df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; 728df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 src4 = { 0 }, src5 = { 0 }, src6 = { 0 }, src7 = { 0 }; 729df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 tmp0, tmp1, tmp2, tmp3, dst0; 730df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 out0, out1, out2, out3, out4, out5, out6, out7; 731df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; 732df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 shf2 = shf1 + 4; 733df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; 734df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; 735df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v8i16 dst0_h, dst1_h, dst2_h, dst3_h; 736df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 737df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); 738df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(srcd0, srcd1, src0); 739df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(srcd2, srcd3, src1); 740df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); 741df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(srcd0, srcd1, src2); 742df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(srcd2, srcd3, src3); 743df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD4(src_x + 8 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); 744df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(srcd0, srcd1, src4); 745df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(srcd2, srcd3, src5); 746df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD4(src_x + 12 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); 747df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(srcd0, srcd1, src6); 748df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_UB(srcd2, srcd3, src7); 749df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 750df37111358d02836cb29bbcb9c6e4c95dff90a16Johann filt = LD_SH(x_filter); 751df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 752df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); 753df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 754df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // transpose 755df37111358d02836cb29bbcb9c6e4c95dff90a16Johann VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1); 756df37111358d02836cb29bbcb9c6e4c95dff90a16Johann VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3); 757df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ILVRL_W2_SB(tmp2, tmp0, out0, out1); 758df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ILVRL_W2_SB(tmp3, tmp1, out2, out3); 759df37111358d02836cb29bbcb9c6e4c95dff90a16Johann XORI_B4_128_SB(out0, out1, out2, out3); 760df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 761df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out0, src0_h, src1_h); 762df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out1, src2_h, src3_h); 763df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out2, src4_h, src5_h); 764df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out3, src6_h, src7_h); 765df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 766df37111358d02836cb29bbcb9c6e4c95dff90a16Johann VSHF_B2_UB(src4, src5, src4, src5, shf1, shf2, tmp0, tmp1); 767df37111358d02836cb29bbcb9c6e4c95dff90a16Johann VSHF_B2_UB(src6, src7, src6, src7, shf1, shf2, tmp2, tmp3); 768df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ILVRL_W2_SB(tmp2, tmp0, out4, out5); 769df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ILVRL_W2_SB(tmp3, tmp1, out6, out7); 770df37111358d02836cb29bbcb9c6e4c95dff90a16Johann XORI_B4_128_SB(out4, out5, out6, out7); 771df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 772df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst0_h = src0_h * filt0; 773df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst1_h = src4_h * filt4; 774df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst0_h += src1_h * filt1; 775df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst1_h += src5_h * filt5; 776df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst0_h += src2_h * filt2; 777df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst1_h += src6_h * filt6; 778df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst0_h += src3_h * filt3; 779df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst1_h += src7_h * filt7; 780df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 781df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out4, src0_h, src1_h); 782df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out5, src2_h, src3_h); 783df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out6, src4_h, src5_h); 784df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out7, src6_h, src7_h); 785df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 786df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst2_h = src0_h * filt0; 787df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst3_h = src4_h * filt4; 788df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst2_h += src1_h * filt1; 789df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst3_h += src5_h * filt5; 790df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst2_h += src2_h * filt2; 791df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst3_h += src6_h * filt6; 792df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst2_h += src3_h * filt3; 793df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst3_h += src7_h * filt7; 794df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 795df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ADDS_SH2_SH(dst0_h, dst1_h, dst2_h, dst3_h, dst0_h, dst2_h); 796df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SRARI_H2_SH(dst0_h, dst2_h, FILTER_BITS); 797df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SAT_SH2_SH(dst0_h, dst2_h, 7); 798df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst0 = PCKEV_XORI128_UB(dst0_h, dst2_h); 799df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ST_UB(dst0, dst); 800df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 801df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 802df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void transpose4x4_to_dst(const uint8_t *src, uint8_t *dst, 803df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ptrdiff_t dst_stride) { 804df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 in0; 805df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 out0 = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; 806df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 807df37111358d02836cb29bbcb9c6e4c95dff90a16Johann in0 = LD_UB(src); 808df37111358d02836cb29bbcb9c6e4c95dff90a16Johann out0 = __msa_vshf_b(out0, (v16i8)in0, (v16i8)in0); 809df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride); 810df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 811df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 812df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void transpose8x8_to_dst(const uint8_t *src, uint8_t *dst, 813df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ptrdiff_t dst_stride) { 814df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3; 815df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; 816df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 shf2 = shf1 + 4; 817df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 818df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD_UB4(src, 16, in0, in1, in2, in3); 819df37111358d02836cb29bbcb9c6e4c95dff90a16Johann VSHF_B2_UB(in0, in1, in0, in1, shf1, shf2, tmp0, tmp1); 820df37111358d02836cb29bbcb9c6e4c95dff90a16Johann VSHF_B2_UB(in2, in3, in2, in3, shf1, shf2, tmp2, tmp3); 821df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ILVRL_W2_UB(tmp2, tmp0, out0, out1); 822df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ILVRL_W2_UB(tmp3, tmp1, out2, out3); 823df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ST8x4_UB(out0, out1, dst, dst_stride); 824df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ST8x4_UB(out2, out3, dst + 4 * dst_stride, dst_stride); 825df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 826df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 827df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void transpose16x16_to_dst(const uint8_t *src, uint8_t *dst, 828df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ptrdiff_t dst_stride) { 829df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12; 830df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7, out8; 831df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 out9, out10, out11, out12, out13, out14, out15; 832df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 833df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD_UB8(src, 16, in0, in1, in2, in3, in4, in5, in6, in7); 834df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD_UB8(src + 16 * 8, 16, in8, in9, in10, in11, in12, in13, in14, in15); 835df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 836df37111358d02836cb29bbcb9c6e4c95dff90a16Johann TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, 837df37111358d02836cb29bbcb9c6e4c95dff90a16Johann in11, in12, in13, in14, in15, out0, out1, out2, out3, 838df37111358d02836cb29bbcb9c6e4c95dff90a16Johann out4, out5, out6, out7); 839df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ST_UB8(out0, out1, out2, out3, out4, out5, out6, out7, dst, dst_stride); 840df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst += 8 * dst_stride; 841df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 842df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SLDI_B4_0_UB(in0, in1, in2, in3, in0, in1, in2, in3, 8); 843df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SLDI_B4_0_UB(in4, in5, in6, in7, in4, in5, in6, in7, 8); 844df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SLDI_B4_0_UB(in8, in9, in10, in11, in8, in9, in10, in11, 8); 845df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SLDI_B4_0_UB(in12, in13, in14, in15, in12, in13, in14, in15, 8); 846df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 847df37111358d02836cb29bbcb9c6e4c95dff90a16Johann TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, 848df37111358d02836cb29bbcb9c6e4c95dff90a16Johann in11, in12, in13, in14, in15, out8, out9, out10, out11, 849df37111358d02836cb29bbcb9c6e4c95dff90a16Johann out12, out13, out14, out15); 850df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ST_UB8(out8, out9, out10, out11, out12, out13, out14, out15, dst, dst_stride); 851df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 852df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 853df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, 854df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint8_t *dst, ptrdiff_t dst_stride, 855df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const InterpKernel *x_filters, int x0_q4, 856df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int x_step_q4, int h) { 857df37111358d02836cb29bbcb9c6e4c95dff90a16Johann DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); 858df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int y, z, i; 859df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src -= SUBPEL_TAPS / 2 - 1; 860df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 861df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (y = 0; y < h; y += 4) { 862df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int x_q4 = x0_q4; 863df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (z = 0; z < 4; ++z) { 864df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; 865df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; 866df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 867df37111358d02836cb29bbcb9c6e4c95dff90a16Johann if (x_q4 & SUBPEL_MASK) { 868df37111358d02836cb29bbcb9c6e4c95dff90a16Johann filter_horiz_w4_msa(src_x, src_stride, temp + (z * 4), x_filter); 869df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } else { 870df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (i = 0; i < 4; ++i) { 871df37111358d02836cb29bbcb9c6e4c95dff90a16Johann temp[z * 4 + i] = src_x[i * src_stride + 3]; 872df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 873df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 874df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 875df37111358d02836cb29bbcb9c6e4c95dff90a16Johann x_q4 += x_step_q4; 876df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 877df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 878df37111358d02836cb29bbcb9c6e4c95dff90a16Johann transpose4x4_to_dst(temp, dst, dst_stride); 879df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 880df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src += src_stride * 4; 881df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst += dst_stride * 4; 882df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 883df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 884df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 885df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, 886df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint8_t *dst, ptrdiff_t dst_stride, 887df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const InterpKernel *x_filters, int x0_q4, 888df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int x_step_q4, int h) { 889df37111358d02836cb29bbcb9c6e4c95dff90a16Johann DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); 890df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int y, z, i; 891df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src -= SUBPEL_TAPS / 2 - 1; 892df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 893df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // This function processes 8x8 areas. The intermediate height is not always 894df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // a multiple of 8, so force it to be a multiple of 8 here. 895df37111358d02836cb29bbcb9c6e4c95dff90a16Johann y = h + (8 - (h & 0x7)); 896df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 897df37111358d02836cb29bbcb9c6e4c95dff90a16Johann do { 898df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int x_q4 = x0_q4; 899df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (z = 0; z < 8; ++z) { 900df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; 901df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; 902df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 903df37111358d02836cb29bbcb9c6e4c95dff90a16Johann if (x_q4 & SUBPEL_MASK) { 904df37111358d02836cb29bbcb9c6e4c95dff90a16Johann filter_horiz_w8_msa(src_x, src_stride, temp + (z * 8), x_filter); 905df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } else { 906df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (i = 0; i < 8; ++i) { 907df37111358d02836cb29bbcb9c6e4c95dff90a16Johann temp[z * 8 + i] = src_x[3 + i * src_stride]; 908df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 909df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 910df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 911df37111358d02836cb29bbcb9c6e4c95dff90a16Johann x_q4 += x_step_q4; 912df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 913df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 914df37111358d02836cb29bbcb9c6e4c95dff90a16Johann transpose8x8_to_dst(temp, dst, dst_stride); 915df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 916df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src += src_stride * 8; 917df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst += dst_stride * 8; 918df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } while (y -= 8); 919df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 920df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 921df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void scaledconvolve_horiz_mul16(const uint8_t *src, ptrdiff_t src_stride, 922df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint8_t *dst, ptrdiff_t dst_stride, 923df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const InterpKernel *x_filters, int x0_q4, 924df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int x_step_q4, int w, int h) { 925df37111358d02836cb29bbcb9c6e4c95dff90a16Johann DECLARE_ALIGNED(16, uint8_t, temp[16 * 16]); 926df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int x, y, z, i; 927df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 928df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src -= SUBPEL_TAPS / 2 - 1; 929df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 930df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // This function processes 16x16 areas. The intermediate height is not always 931df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // a multiple of 16, so force it to be a multiple of 8 here. 932df37111358d02836cb29bbcb9c6e4c95dff90a16Johann y = h + (16 - (h & 0xF)); 933df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 934df37111358d02836cb29bbcb9c6e4c95dff90a16Johann do { 935df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int x_q4 = x0_q4; 936df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (x = 0; x < w; x += 16) { 937df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (z = 0; z < 16; ++z) { 938df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; 939df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; 940df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 941df37111358d02836cb29bbcb9c6e4c95dff90a16Johann if (x_q4 & SUBPEL_MASK) { 942df37111358d02836cb29bbcb9c6e4c95dff90a16Johann filter_horiz_w16_msa(src_x, src_stride, temp + (z * 16), x_filter); 943df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } else { 944df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (i = 0; i < 16; ++i) { 945df37111358d02836cb29bbcb9c6e4c95dff90a16Johann temp[z * 16 + i] = src_x[3 + i * src_stride]; 946df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 947df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 948df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 949df37111358d02836cb29bbcb9c6e4c95dff90a16Johann x_q4 += x_step_q4; 950df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 951df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 952df37111358d02836cb29bbcb9c6e4c95dff90a16Johann transpose16x16_to_dst(temp, dst + x, dst_stride); 953df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 954df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 955df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src += src_stride * 16; 956df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst += dst_stride * 16; 957df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } while (y -= 16); 958df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 959df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 960df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void filter_vert_w4_msa(const uint8_t *src_y, ptrdiff_t src_pitch, 961df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint8_t *dst, const int16_t *y_filter) { 962df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t srcw0, srcw1, srcw2, srcw3, srcw4, srcw5, srcw6, srcw7; 963df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t res; 964df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 src0 = { 0 }, src1 = { 0 }, dst0; 965df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 out0, out1; 966df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 shf1 = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 }; 967df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 shf2 = shf1 + 8; 968df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 }; 969df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 filt_shf1 = filt_shf0 + 2; 970df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 filt_shf2 = filt_shf0 + 4; 971df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 filt_shf3 = filt_shf0 + 6; 972df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v8i16 filt, src0_h, src1_h, src2_h, src3_h; 973df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v8i16 filt0, filt1, filt2, filt3; 974df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 975df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LW4(src_y, src_pitch, srcw0, srcw1, srcw2, srcw3); 976df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LW4(src_y + 4 * src_pitch, src_pitch, srcw4, srcw5, srcw6, srcw7); 977df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_W4_UB(srcw0, srcw1, srcw2, srcw3, src0); 978df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_W4_UB(srcw4, srcw5, srcw6, srcw7, src1); 979df37111358d02836cb29bbcb9c6e4c95dff90a16Johann VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1); 980df37111358d02836cb29bbcb9c6e4c95dff90a16Johann XORI_B2_128_SB(out0, out1); 981df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out0, src0_h, src1_h); 982df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(out1, src2_h, src3_h); 983df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 984df37111358d02836cb29bbcb9c6e4c95dff90a16Johann filt = LD_SH(y_filter); 985df37111358d02836cb29bbcb9c6e4c95dff90a16Johann VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1); 986df37111358d02836cb29bbcb9c6e4c95dff90a16Johann VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3); 987df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 988df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h *= filt0; 989df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h += src1_h * filt1; 990df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h += src2_h * filt2; 991df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h += src3_h * filt3; 992df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 993df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8); 994df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 995df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h = __msa_adds_s_h(src0_h, src1_h); 996df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h = __msa_srari_h(src0_h, FILTER_BITS); 997df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h = __msa_sat_s_h(src0_h, 7); 998df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst0 = PCKEV_XORI128_UB(src0_h, src0_h); 999df37111358d02836cb29bbcb9c6e4c95dff90a16Johann res = __msa_copy_u_w((v4i32)dst0, 0); 1000df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SW(res, dst); 1001df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 1002df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1003df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void filter_vert_w8_msa(const uint8_t *src_y, ptrdiff_t src_pitch, 1004df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint8_t *dst, const int16_t *y_filter) { 1005df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint64_t srcd0, srcd1, srcd2, srcd3; 1006df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 dst0; 1007df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; 1008df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; 1009df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; 1010df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1011df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD4(src_y, src_pitch, srcd0, srcd1, srcd2, srcd3); 1012df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_SB(srcd0, srcd1, src0); 1013df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_SB(srcd2, srcd3, src1); 1014df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD4(src_y + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); 1015df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_SB(srcd0, srcd1, src2); 1016df37111358d02836cb29bbcb9c6e4c95dff90a16Johann INSERT_D2_SB(srcd2, srcd3, src3); 1017df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1018df37111358d02836cb29bbcb9c6e4c95dff90a16Johann filt = LD_SH(y_filter); 1019df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1020df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); 1021df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1022df37111358d02836cb29bbcb9c6e4c95dff90a16Johann XORI_B4_128_SB(src0, src1, src2, src3); 1023df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(src0, src0_h, src1_h); 1024df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(src1, src2_h, src3_h); 1025df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(src2, src4_h, src5_h); 1026df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(src3, src6_h, src7_h); 1027df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1028df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h *= filt0; 1029df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src4_h *= filt4; 1030df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h += src1_h * filt1; 1031df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src4_h += src5_h * filt5; 1032df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h += src2_h * filt2; 1033df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src4_h += src6_h * filt6; 1034df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h += src3_h * filt3; 1035df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src4_h += src7_h * filt7; 1036df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1037df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h = __msa_adds_s_h(src0_h, src4_h); 1038df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h = __msa_srari_h(src0_h, FILTER_BITS); 1039df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h = __msa_sat_s_h(src0_h, 7); 1040df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst0 = PCKEV_XORI128_UB(src0_h, src0_h); 1041df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ST8x1_UB(dst0, dst); 1042df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 1043df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1044df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void filter_vert_mul_w16_msa(const uint8_t *src_y, ptrdiff_t src_pitch, 1045df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint8_t *dst, const int16_t *y_filter, 1046df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int w) { 1047df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int x; 1048df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16u8 dst0; 1049df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 1050df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; 1051df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v8i16 src8_h, src9_h, src10_h, src11_h, src12_h, src13_h, src14_h, src15_h; 1052df37111358d02836cb29bbcb9c6e4c95dff90a16Johann v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; 1053df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1054df37111358d02836cb29bbcb9c6e4c95dff90a16Johann filt = LD_SH(y_filter); 1055df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); 1056df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); 1057df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1058df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (x = 0; x < w; x += 16) { 1059df37111358d02836cb29bbcb9c6e4c95dff90a16Johann LD_SB8(src_y, src_pitch, src0, src1, src2, src3, src4, src5, src6, src7); 1060df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src_y += 16; 1061df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1062df37111358d02836cb29bbcb9c6e4c95dff90a16Johann XORI_B4_128_SB(src0, src1, src2, src3); 1063df37111358d02836cb29bbcb9c6e4c95dff90a16Johann XORI_B4_128_SB(src4, src5, src6, src7); 1064df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(src0, src0_h, src1_h); 1065df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(src1, src2_h, src3_h); 1066df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(src2, src4_h, src5_h); 1067df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(src3, src6_h, src7_h); 1068df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(src4, src8_h, src9_h); 1069df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(src5, src10_h, src11_h); 1070df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(src6, src12_h, src13_h); 1071df37111358d02836cb29bbcb9c6e4c95dff90a16Johann UNPCK_SB_SH(src7, src14_h, src15_h); 1072df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1073df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h *= filt0; 1074df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src1_h *= filt0; 1075df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src8_h *= filt4; 1076df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src9_h *= filt4; 1077df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h += src2_h * filt1; 1078df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src1_h += src3_h * filt1; 1079df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src8_h += src10_h * filt5; 1080df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src9_h += src11_h * filt5; 1081df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h += src4_h * filt2; 1082df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src1_h += src5_h * filt2; 1083df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src8_h += src12_h * filt6; 1084df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src9_h += src13_h * filt6; 1085df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src0_h += src6_h * filt3; 1086df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src1_h += src7_h * filt3; 1087df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src8_h += src14_h * filt7; 1088df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src9_h += src15_h * filt7; 1089df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1090df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ADDS_SH2_SH(src0_h, src8_h, src1_h, src9_h, src0_h, src1_h); 1091df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SRARI_H2_SH(src0_h, src1_h, FILTER_BITS); 1092df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SAT_SH2_SH(src0_h, src1_h, 7); 1093df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst0 = PCKEV_XORI128_UB(src0_h, src1_h); 1094df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ST_UB(dst0, dst); 1095df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst += 16; 1096df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 1097df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 1098df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1099df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, 1100df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint8_t *dst, ptrdiff_t dst_stride, 1101df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const InterpKernel *y_filters, int y0_q4, 1102df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int y_step_q4, int h) { 1103df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int y; 1104df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int y_q4 = y0_q4; 1105df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1106df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src -= src_stride * (SUBPEL_TAPS / 2 - 1); 1107df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1108df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (y = 0; y < h; ++y) { 1109df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 1110df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 1111df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1112df37111358d02836cb29bbcb9c6e4c95dff90a16Johann if (y_q4 & SUBPEL_MASK) { 1113df37111358d02836cb29bbcb9c6e4c95dff90a16Johann filter_vert_w4_msa(src_y, src_stride, &dst[y * dst_stride], y_filter); 1114df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } else { 1115df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint32_t srcd = LW(src_y + 3 * src_stride); 1116df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SW(srcd, dst + y * dst_stride); 1117df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 1118df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1119df37111358d02836cb29bbcb9c6e4c95dff90a16Johann y_q4 += y_step_q4; 1120df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 1121df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 1122df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1123df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, 1124df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint8_t *dst, ptrdiff_t dst_stride, 1125df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const InterpKernel *y_filters, int y0_q4, 1126df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int y_step_q4, int h) { 1127df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int y; 1128df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int y_q4 = y0_q4; 1129df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1130df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src -= src_stride * (SUBPEL_TAPS / 2 - 1); 1131df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1132df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (y = 0; y < h; ++y) { 1133df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 1134df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 1135df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1136df37111358d02836cb29bbcb9c6e4c95dff90a16Johann if (y_q4 & SUBPEL_MASK) { 1137df37111358d02836cb29bbcb9c6e4c95dff90a16Johann filter_vert_w8_msa(src_y, src_stride, &dst[y * dst_stride], y_filter); 1138df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } else { 1139df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint64_t srcd = LD(src_y + 3 * src_stride); 1140df37111358d02836cb29bbcb9c6e4c95dff90a16Johann SD(srcd, dst + y * dst_stride); 1141df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 1142df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1143df37111358d02836cb29bbcb9c6e4c95dff90a16Johann y_q4 += y_step_q4; 1144df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 1145df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 1146df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1147df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic void scaledconvolve_vert_mul16(const uint8_t *src, ptrdiff_t src_stride, 1148df37111358d02836cb29bbcb9c6e4c95dff90a16Johann uint8_t *dst, ptrdiff_t dst_stride, 1149df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const InterpKernel *y_filters, int y0_q4, 1150df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int y_step_q4, int w, int h) { 1151df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int x, y; 1152df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int y_q4 = y0_q4; 1153df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src -= src_stride * (SUBPEL_TAPS / 2 - 1); 1154df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1155df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (y = 0; y < h; ++y) { 1156df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 1157df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 1158df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1159df37111358d02836cb29bbcb9c6e4c95dff90a16Johann if (y_q4 & SUBPEL_MASK) { 1160df37111358d02836cb29bbcb9c6e4c95dff90a16Johann filter_vert_mul_w16_msa(src_y, src_stride, &dst[y * dst_stride], y_filter, 1161df37111358d02836cb29bbcb9c6e4c95dff90a16Johann w); 1162df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } else { 1163df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (x = 0; x < w; ++x) { 1164df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst[x + y * dst_stride] = src_y[x + 3 * src_stride]; 1165df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 1166df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 1167df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1168df37111358d02836cb29bbcb9c6e4c95dff90a16Johann y_q4 += y_step_q4; 1169df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 1170df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 1171df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1172df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, 1173df37111358d02836cb29bbcb9c6e4c95dff90a16Johann ptrdiff_t dst_stride, const InterpKernel *filter, 1174df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, 1175df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int w, int h) { 1176df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // Note: Fixed size intermediate buffer, temp, places limits on parameters. 1177df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // 2d filtering proceeds in 2 steps: 1178df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // (1) Interpolate horizontally into an intermediate buffer, temp. 1179df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // (2) Interpolate temp vertically to derive the sub-pixel result. 1180df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // Deriving the maximum number of rows in the temp buffer (135): 1181df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). 1182df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // --Largest block size is 64x64 pixels. 1183df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the 1184df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // original frame (in 1/16th pixel units). 1185df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // --Must round-up because block may be located at sub-pixel position. 1186df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. 1187df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 1188df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // --Require an additional 8 rows for the horiz_w8 transpose tail. 1189df37111358d02836cb29bbcb9c6e4c95dff90a16Johann DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); 1190df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int intermediate_height = 1191df37111358d02836cb29bbcb9c6e4c95dff90a16Johann (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; 1192df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1193df37111358d02836cb29bbcb9c6e4c95dff90a16Johann assert(w <= 64); 1194df37111358d02836cb29bbcb9c6e4c95dff90a16Johann assert(h <= 64); 1195df37111358d02836cb29bbcb9c6e4c95dff90a16Johann assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); 1196df37111358d02836cb29bbcb9c6e4c95dff90a16Johann assert(x_step_q4 <= 64); 1197df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1198df37111358d02836cb29bbcb9c6e4c95dff90a16Johann if ((0 == x0_q4) && (16 == x_step_q4) && (0 == y0_q4) && (16 == y_step_q4)) { 1199df37111358d02836cb29bbcb9c6e4c95dff90a16Johann vpx_convolve_copy_msa(src, src_stride, dst, dst_stride, filter, x0_q4, 1200df37111358d02836cb29bbcb9c6e4c95dff90a16Johann x_step_q4, y0_q4, y_step_q4, w, h); 1201df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } else { 1202df37111358d02836cb29bbcb9c6e4c95dff90a16Johann if (w >= 16) { 1203df37111358d02836cb29bbcb9c6e4c95dff90a16Johann scaledconvolve_horiz_mul16(src - src_stride * (SUBPEL_TAPS / 2 - 1), 1204df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src_stride, temp, 64, filter, x0_q4, x_step_q4, 1205df37111358d02836cb29bbcb9c6e4c95dff90a16Johann w, intermediate_height); 1206df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } else if (w == 8) { 1207df37111358d02836cb29bbcb9c6e4c95dff90a16Johann scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), 1208df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src_stride, temp, 64, filter, x0_q4, x_step_q4, 1209df37111358d02836cb29bbcb9c6e4c95dff90a16Johann intermediate_height); 1210df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } else { 1211df37111358d02836cb29bbcb9c6e4c95dff90a16Johann scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), 1212df37111358d02836cb29bbcb9c6e4c95dff90a16Johann src_stride, temp, 64, filter, x0_q4, x_step_q4, 1213df37111358d02836cb29bbcb9c6e4c95dff90a16Johann intermediate_height); 1214df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 1215df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 1216df37111358d02836cb29bbcb9c6e4c95dff90a16Johann if (w >= 16) { 1217df37111358d02836cb29bbcb9c6e4c95dff90a16Johann scaledconvolve_vert_mul16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, 1218df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst_stride, filter, y0_q4, y_step_q4, w, h); 1219df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } else if (w == 8) { 1220df37111358d02836cb29bbcb9c6e4c95dff90a16Johann scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, 1221df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst_stride, filter, y0_q4, y_step_q4, h); 1222df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } else { 1223df37111358d02836cb29bbcb9c6e4c95dff90a16Johann scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, 1224df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dst_stride, filter, y0_q4, y_step_q4, h); 1225df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 1226df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 1227df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 1228