/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include <assert.h>
12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h"
13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/mips/vpx_convolve_msa.h"
14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
157bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_8ht_8vt_and_aver_dst_4w_msa(
167bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
177bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
18da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
19df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint32_t tp0, tp1, tp2, tp3;
20da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
21df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 dst0 = { 0 }, mask0, mask1, mask2, mask3, res;
22da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
23da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
24da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
25da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
26da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
27da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask0 = LD_UB(&mc_filt_mask_arr[16]);
28da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src -= (3 + 3 * src_stride);
29da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
30da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
31da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_horiz);
32da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
33da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
34da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask1 = mask0 + 2;
35da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask2 = mask0 + 4;
36da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask3 = mask0 + 6;
37da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += (7 * src_stride);
41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
42da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
46da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
47da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
48da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
49da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
50da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
51da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_vert);
53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
56da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
57da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src7, src8, src9, src10);
60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    XORI_B4_128_SB(src7, src8, src9, src10);
61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
63df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
64df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
657bc9febe8749e98a3812a0dc4380ceae75c29450Johann    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
667bc9febe8749e98a3812a0dc4380ceae75c29450Johann                              filt_hz1, filt_hz2, filt_hz3);
67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               filt_vt2, filt_vt3);
71da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
727bc9febe8749e98a3812a0dc4380ceae75c29450Johann    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
737bc9febe8749e98a3812a0dc4380ceae75c29450Johann                              filt_hz1, filt_hz2, filt_hz3);
74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
77da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               filt_vt2, filt_vt3);
78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
79da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_SH(res0, res1, FILTER_BITS);
80da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SAT_SH2_SH(res0, res1, 7);
81df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    res = PCKEV_XORI128_UB(res0, res1);
82df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    res = (v16u8)__msa_aver_u_b(res, dst0);
83df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out5 = hz_out9;
87da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = vec2;
88da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec1 = vec3;
89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec2 = vec4;
90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
92da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
937bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_8ht_8vt_and_aver_dst_8w_msa(
947bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
957bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
96da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
97df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint64_t tp0, tp1, tp2, tp3;
98da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
101df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 dst0 = { 0 }, dst1 = { 0 }, mask0, mask1, mask2, mask3;
102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask0 = LD_UB(&mc_filt_mask_arr[0]);
107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src -= (3 + 3 * src_stride);
108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_horiz);
111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask1 = mask0 + 2;
114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask2 = mask0 + 4;
115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask3 = mask0 + 6;
116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += (7 * src_stride);
119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            filt_hz1, filt_hz2, filt_hz3);
135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_vert);
137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src7, src8, src9, src10);
145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    XORI_B4_128_SB(src7, src8, src9, src10);
146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
148df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
149df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    INSERT_D2_UB(tp0, tp1, dst0);
150df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    INSERT_D2_UB(tp2, tp3, dst1);
151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1527bc9febe8749e98a3812a0dc4380ceae75c29450Johann    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
1537bc9febe8749e98a3812a0dc4380ceae75c29450Johann                              filt_hz1, filt_hz2, filt_hz3);
154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               filt_vt2, filt_vt3);
157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1587bc9febe8749e98a3812a0dc4380ceae75c29450Johann    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
1597bc9febe8749e98a3812a0dc4380ceae75c29450Johann                              filt_hz1, filt_hz2, filt_hz3);
160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               filt_vt2, filt_vt3);
163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1647bc9febe8749e98a3812a0dc4380ceae75c29450Johann    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
1657bc9febe8749e98a3812a0dc4380ceae75c29450Johann                              filt_hz1, filt_hz2, filt_hz3);
166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               filt_vt2, filt_vt3);
169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               filt_vt2, filt_vt3);
175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
178df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst,
1797bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            dst_stride);
180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out6 = hz_out10;
183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out0 = out2;
184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out1 = out3;
185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out2 = out8;
186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out4 = out6;
187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out5 = out7;
188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    out6 = out9;
189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/*
 * 16-pixel-wide variant: runs the 8-wide kernel on two side-by-side
 * 8-column strips (column offsets 0 and 8). All other arguments are passed
 * through to the 8-wide kernel unchanged.
 */
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/*
 * 32-pixel-wide variant: runs the 8-wide kernel on four side-by-side
 * 8-column strips (column offsets 0, 8, 16 and 24). All other arguments are
 * passed through to the 8-wide kernel unchanged.
 */
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/*
 * 64-pixel-wide variant: runs the 8-wide kernel on eight side-by-side
 * 8-column strips (column offsets 0, 8, ..., 56). All other arguments are
 * passed through to the 8-wide kernel unchanged.
 */
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2287bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
2297bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2307bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int8_t *filter_horiz, int8_t *filter_vert) {
231df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint32_t tp0, tp1, tp2, tp3;
232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, mask;
233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt_hz, filt_vt, vec0, vec1;
234df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 dst0 = { 0 }, out;
235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[16]);
238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_UH(filter_horiz);
241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_UH(filter_vert);
244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
255df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
256df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
259df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
260df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out = __msa_aver_u_b(out, dst0);
261df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2647bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
2657bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2667bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int8_t *filter_horiz, int8_t *filter_vert) {
267df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint32_t tp0, tp1, tp2, tp3;
268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
269df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
270df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 dst0 = { 0 }, dst1 = { 0 };
271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt;
274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[16]);
276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_horiz);
279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_splati_h(filt, 0);
280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_vert);
282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_splati_h(filt, 0);
283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += (8 * src_stride);
286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src8 = LD_SB(src);
287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian             hz_out3, hz_out5, 8);
295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
297df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
298df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
299df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
300df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
301da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
302da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
3037bc9febe8749e98a3812a0dc4380ceae75c29450Johann  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
3047bc9febe8749e98a3812a0dc4380ceae75c29450Johann              tmp1, tmp2, tmp3);
305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
306df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1);
307df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
308df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ST4x8_UB(res0, res1, dst, dst_stride);
309da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Width-4 dispatcher for the 2ht_2vt "and_aver_dst" kernels.
 *
 * A height of 4 is forwarded to the 4x4 kernel and a height of 8 to the 4x8
 * kernel.  Any other height matches no kernel: the function returns without
 * touching dst. */
static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  switch (height) {
    case 4:
      common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert);
      break;
    case 8:
      common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert);
      break;
    default:
      /* No kernel exists for other heights; nothing is written. */
      break;
  }
}
322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3237bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
3247bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
3257bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int8_t *filter_horiz, int8_t *filter_vert) {
326df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint64_t tp0, tp1, tp2, tp3;
327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, mask;
328df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 filt_hz, filt_vt, dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3;
329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt;
331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[0]);
333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
334da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_horiz);
336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_splati_h(filt, 0);
337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_vert);
339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_splati_h(filt, 0);
340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += (5 * src_stride);
343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
344df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
345df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(tp0, tp1, dst0);
346df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  INSERT_D2_UB(tp2, tp3, dst1);
347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
353da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
365df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
3687bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
3697bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
3707bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
372df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint64_t tp0, tp1, tp2, tp3;
373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, mask;
374df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v16u8 filt_hz, filt_vt, vec0, dst0 = { 0 }, dst1 = { 0 };
375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt;
377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[0]);
379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_horiz);
382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_splati_h(filt, 0);
383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_vert);
385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_splati_h(filt, 0);
386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src0 = LD_SB(src);
388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src1, src2, src3, src4);
394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
410da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
415df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
416df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    INSERT_D2_UB(tp0, tp1, dst0);
417df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    INSERT_D2_UB(tp2, tp3, dst1);
418df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Width-8 dispatcher for the 2ht_2vt "and_aver_dst" kernels.
 *
 * A height of exactly 4 uses the dedicated 8x4 kernel; every other height is
 * handed to the looping 8x8mult kernel together with the height. */
static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  if (4 != height) {
    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
        src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
    return;
  }

  common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
                                         filter_horiz, filter_vert);
}
434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
4357bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hv_2ht_2vt_and_aver_dst_16w_msa(
4367bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
4377bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt;
443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[0]);
445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_horiz);
448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_hz = (v16u8)__msa_splati_h(filt, 0);
449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
450da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter_vert);
451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt_vt = (v16u8)__msa_splati_h(filt, 0);
452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB2(src, 8, src0, src1);
454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += src_stride;
455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src2, src4, src6);
461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
477da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* 32-wide variant: runs the 16-wide kernel on two adjacent 16-column strips
 * (column offsets 0 and 16), each over the full height. */
static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride, dst + col,
                                           dst_stride, filter_horiz,
                                           filter_vert, height);
  }
}
510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* 64-wide variant: runs the 16-wide kernel on four adjacent 16-column strips
 * (column offsets 0, 16, 32 and 48), each over the full height. */
static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride, dst + col,
                                           dst_stride, filter_horiz,
                                           filter_vert, height);
  }
}
522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                           uint8_t *dst, ptrdiff_t dst_stride,
525df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                           const InterpKernel *filter, int x0_q4, int x_step_q4,
526df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                           int y0_q4, int y_step_q4, int w, int h) {
527df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  const int16_t *const filter_x = filter[x0_q4];
528df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  const int16_t *const filter_y = filter[y0_q4];
529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int8_t cnt, filt_hor[8], filt_ver[8];
530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  assert(x_step_q4 == 16);
532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  assert(y_step_q4 == 16);
533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  assert(((const int32_t *)filter_x)[1] != 0x800000);
534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  assert(((const int32_t *)filter_y)[1] != 0x800000);
535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (cnt = 0; cnt < 8; ++cnt) {
537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt_hor[cnt] = filter_x[cnt];
538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt_ver[cnt] = filter_y[cnt];
539da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (((const int32_t *)filter_x)[0] == 0 &&
542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      ((const int32_t *)filter_y)[0] == 0) {
543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    switch (w) {
544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 4:
5457bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
5467bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              (int32_t)dst_stride, &filt_hor[3],
5477bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              &filt_ver[3], h);
548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 8:
5507bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
5517bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              (int32_t)dst_stride, &filt_hor[3],
5527bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              &filt_ver[3], h);
553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 16:
5557bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
5567bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                               (int32_t)dst_stride,
557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                               &filt_hor[3], &filt_ver[3], h);
558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 32:
5607bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
5617bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                               (int32_t)dst_stride,
562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                               &filt_hor[3], &filt_ver[3], h);
563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 64:
5657bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
5667bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                               (int32_t)dst_stride,
567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                               &filt_hor[3], &filt_ver[3], h);
568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      default:
570df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
571df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                            x_step_q4, y0_q4, y_step_q4, w, h);
572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else if (((const int32_t *)filter_x)[0] == 0 ||
575da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian             ((const int32_t *)filter_y)[0] == 0) {
576df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
577df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                        x_step_q4, y0_q4, y_step_q4, w, h);
578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    switch (w) {
580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 4:
5817bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
5827bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              (int32_t)dst_stride, filt_hor,
5837bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              filt_ver, h);
584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 8:
5867bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
5877bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              (int32_t)dst_stride, filt_hor,
5887bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              filt_ver, h);
589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
590da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 16:
5917bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
5927bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                               (int32_t)dst_stride, filt_hor,
5937bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                               filt_ver, h);
594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 32:
5967bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
5977bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                               (int32_t)dst_stride, filt_hor,
5987bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                               filt_ver, h);
599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 64:
6017bc9febe8749e98a3812a0dc4380ceae75c29450Johann        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
6027bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                               (int32_t)dst_stride, filt_hor,
6037bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                               filt_ver, h);
604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      default:
606df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
607df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                            x_step_q4, y0_q4, y_step_q4, w, h);
608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
612