/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include <assert.h>
12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h"
13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/mips/vpx_convolve_msa.h"
14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
15da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
167bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int32_t src_stride, uint8_t *dst,
17da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int32_t dst_stride,
18da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int8_t *filter) {
19da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
20da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 dst0, dst1, dst2, dst3, res2, res3;
21da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 mask0, mask1, mask2, mask3;
22da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt, res0, res1;
23da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
24da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask0 = LD_UB(&mc_filt_mask_arr[16]);
25da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src -= 3;
26da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
27da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
28da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter);
29da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
30da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
31da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask1 = mask0 + 2;
32da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask2 = mask0 + 4;
33da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask3 = mask0 + 6;
34da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
35da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB4(src, src_stride, src0, src1, src2, src3);
36da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  XORI_B4_128_SB(src0, src1, src2, src3);
37da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                             filt0, filt1, filt2, filt3, res0, res1);
39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SRARI_H2_SH(res0, res1, FILTER_BITS);
41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SAT_SH2_SH(res0, res1, 7);
42da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  XORI_B2_128_UB(res2, res3);
45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
46da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
47da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
48da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
49da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
507bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int32_t src_stride, uint8_t *dst,
51da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int32_t dst_stride,
52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int8_t *filter) {
53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
56da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt, vec0, vec1, vec2, vec3;
57da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask0 = LD_UB(&mc_filt_mask_arr[16]);
59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src -= 3;
60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter);
63da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
64da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
65da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask1 = mask0 + 2;
66da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask2 = mask0 + 4;
67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask3 = mask0 + 6;
68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB4(src, src_stride, src0, src1, src2, src3);
70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  XORI_B4_128_SB(src0, src1, src2, src3);
71da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += (4 * src_stride);
72da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
73da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                             filt0, filt1, filt2, filt3, vec0, vec1);
75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB4(src, src_stride, src0, src1, src2, src3);
76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  XORI_B4_128_SB(src0, src1, src2, src3);
77da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                             filt0, filt1, filt2, filt3, vec2, vec3);
79da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
80da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
81da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2,
82da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian              res3);
83da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  XORI_B2_128_UB(res0, res2);
85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian             dst6);
87da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
88da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ST4x8_UB(res0, res2, dst, dst_stride);
90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Dispatches the 4-wide horizontal 8-tap filter-and-average to the
 * fixed-height kernel matching `height`. Only heights 4 and 8 are handled;
 * for any other value the function returns without touching dst.
 */
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  switch (height) {
    case 4:
      common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                        filter);
      break;
    case 8:
      common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                        filter);
      break;
    default:
      /* no kernel for this height */
      break;
  }
}
102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
1047bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                             int32_t src_stride, uint8_t *dst,
1057bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                             int32_t dst_stride, int8_t *filter,
106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                             int32_t height) {
107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int32_t loop_cnt;
108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt, out0, out1, out2, out3;
111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask0 = LD_UB(&mc_filt_mask_arr[0]);
113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src -= 3;
114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter);
117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask1 = mask0 + 2;
120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask2 = mask0 + 4;
121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask3 = mask0 + 6;
122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2); loop_cnt--;) {
124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src1, src2, src3);
125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    XORI_B4_128_SB(src0, src1, src2, src3);
126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               mask3, filt0, filt1, filt2, filt3, out0, out1,
129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                               out2, out3);
130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SAT_SH4_SH(out0, out1, out2, out3, 7);
1337bc9febe8749e98a3812a0dc4380ceae75c29450Johann    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
1347bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            dst_stride);
135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
1407bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int32_t src_stride, uint8_t *dst,
141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int32_t dst_stride,
1427bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int8_t *filter, int32_t height) {
143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int32_t loop_cnt;
144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt, out0, out1, out2, out3;
147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask0 = LD_UB(&mc_filt_mask_arr[0]);
151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src -= 3;
152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter);
155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask1 = mask0 + 2;
158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask2 = mask0 + 4;
159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask3 = mask0 + 6;
160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = height >> 1; loop_cnt--;) {
162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB2(src, src_stride, src0, src2);
163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB2(src + 8, src_stride, src1, src3);
164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (2 * src_stride);
165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    XORI_B4_128_SB(src0, src1, src2, src3);
167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian               vec14);
171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian               vec15);
173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                vec2, vec3);
175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                vec9, vec10, vec11);
177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                 vec2, vec3);
179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                 vec9, vec10, vec11);
181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                out2, out3);
183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(dst, dst_stride, dst0, dst1);
184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SAT_SH4_SH(out0, out1, out2, out3, 7);
186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
1947bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int32_t src_stride, uint8_t *dst,
195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int32_t dst_stride,
1967bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int8_t *filter, int32_t height) {
197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt, out0, out1, out2, out3;
201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask0 = LD_UB(&mc_filt_mask_arr[0]);
205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src -= 3;
206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter);
209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask1 = mask0 + 2;
212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask2 = mask0 + 4;
213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask3 = mask0 + 6;
214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = height; loop_cnt--;) {
216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src0 = LD_SB(src);
217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src2 = LD_SB(src + 16);
218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src3 = LD_SB(src + 24);
219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src1 = __msa_sldi_b(src2, src0, 8);
220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    XORI_B4_128_SB(src0, src1, src2, src3);
223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
225da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian               vec14);
227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian               vec15);
229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                vec2, vec3);
231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                vec9, vec10, vec11);
233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                 vec2, vec3);
235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                 vec9, vec10, vec11);
237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                out2, out3);
239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SAT_SH4_SH(out0, out1, out2, out3, 7);
241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(dst, 16, dst1, dst2);
242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
2497bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int32_t src_stride, uint8_t *dst,
250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int32_t dst_stride,
2517bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int8_t *filter, int32_t height) {
252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt, cnt;
253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 filt, out0, out1, out2, out3;
256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask0 = LD_UB(&mc_filt_mask_arr[0]);
260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src -= 3;
261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_SH(filter);
264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask1 = mask0 + 2;
267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask2 = mask0 + 4;
268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask3 = mask0 + 6;
269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = height; loop_cnt--;) {
271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (cnt = 0; cnt < 2; ++cnt) {
272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      src0 = LD_SB(&src[cnt << 5]);
273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      src2 = LD_SB(&src[16 + (cnt << 5)]);
274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      src3 = LD_SB(&src[24 + (cnt << 5)]);
275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      src1 = __msa_sldi_b(src2, src0, 8);
276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      XORI_B4_128_SB(src0, src1, src2, src3);
278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                 vec12);
280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                 vec13);
282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                 vec14);
284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                 vec15);
286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                  vec1, vec2, vec3);
288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                  vec9, vec10, vec11);
290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                   vec1, vec2, vec3);
292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                   vec9, vec10, vec11);
294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                  out2, out3);
296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      SAT_SH4_SH(out0, out1, out2, out3, 7);
298da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
299da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
301da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
302da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
303da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
308da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
3097bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int32_t src_stride, uint8_t *dst,
310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int32_t dst_stride,
311da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int8_t *filter) {
312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, mask;
313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
314da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 vec2, vec3, filt;
315da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
316da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[16]);
317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_UH(filter);
320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB4(src, src_stride, src0, src1, src2, src3);
323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
324da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
325da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
326da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
328da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
3347bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int32_t src_stride, uint8_t *dst,
335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int32_t dst_stride,
336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int8_t *filter) {
337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 vec4, vec5, vec6, vec7, filt;
341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[16]);
343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
345da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_UH(filter);
346da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
353da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian              vec6, vec7);
354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian              res3);
357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian             dst6);
359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian              res3);
361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  dst += (4 * dst_stride);
363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
3677bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                             int32_t src_stride, uint8_t *dst,
3687bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                             int32_t dst_stride, int8_t *filter,
369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                             int32_t height) {
370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (4 == height) {
371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else if (8 == height) {
373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
3787bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int32_t src_stride, uint8_t *dst,
379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int32_t dst_stride,
380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int8_t *filter) {
381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, mask;
382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt0, dst0, dst1, dst2, dst3;
383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 vec0, vec1, vec2, vec3, filt;
384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[0]);
386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_UH(filter);
389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB4(src, src_stride, src0, src1, src2, src3);
392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian              vec2, vec3);
396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3987bc9febe8749e98a3812a0dc4380ceae75c29450Johann  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
3997bc9febe8749e98a3812a0dc4380ceae75c29450Johann                     dst_stride);
400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
4027bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void common_hz_2t_and_aver_dst_8x8mult_msa(
4037bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
4047bc9febe8749e98a3812a0dc4380ceae75c29450Johann    int8_t *filter, int32_t height) {
405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, mask;
406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt0, dst0, dst1, dst2, dst3;
407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 vec0, vec1, vec2, vec3, filt;
408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[0]);
410da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_UH(filter);
413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB4(src, src_stride, src0, src1, src2, src3);
416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += (4 * src_stride);
417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian              vec2, vec3);
421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
423da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB4(src, src_stride, src0, src1, src2, src3);
424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += (4 * src_stride);
4257bc9febe8749e98a3812a0dc4380ceae75c29450Johann  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
4267bc9febe8749e98a3812a0dc4380ceae75c29450Johann                     dst_stride);
427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  dst += (4 * dst_stride);
428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian              vec2, vec3);
433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
4357bc9febe8749e98a3812a0dc4380ceae75c29450Johann  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
4367bc9febe8749e98a3812a0dc4380ceae75c29450Johann                     dst_stride);
437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  dst += (4 * dst_stride);
438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (16 == height) {
440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src1, src2, src3);
441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                vec2, vec3);
447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src1, src2, src3);
4507bc9febe8749e98a3812a0dc4380ceae75c29450Johann    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
4517bc9febe8749e98a3812a0dc4380ceae75c29450Johann                       dst_stride);
452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                vec2, vec3);
458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
4607bc9febe8749e98a3812a0dc4380ceae75c29450Johann    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
4617bc9febe8749e98a3812a0dc4380ceae75c29450Johann                       dst_stride);
462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
4667bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                             int32_t src_stride, uint8_t *dst,
4677bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                             int32_t dst_stride, int8_t *filter,
468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                             int32_t height) {
469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (4 == height) {
470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                          filter, height);
474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
477da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
4787bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int32_t src_stride, uint8_t *dst,
479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int32_t dst_stride,
4807bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int8_t *filter, int32_t height) {
481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt0, dst0, dst1, dst2, dst3;
484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[0]);
488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_UH(filter);
491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB4(src, src_stride, src0, src2, src4, src6);
494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  src += (4 * src_stride);
496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian              res2, res3);
503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian              res6, res7);
505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  dst += dst_stride;
510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  dst += dst_stride;
512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  dst += dst_stride;
514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  dst += dst_stride;
516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src2, src4, src6);
519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
525da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                res2, res3);
528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                res6, res7);
530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
539da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
5457bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int32_t src_stride, uint8_t *dst,
546da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int32_t dst_stride,
5477bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int8_t *filter, int32_t height) {
548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt0, dst0, dst1, dst2, dst3;
551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[0]);
555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_UH(filter);
558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = (height >> 1); loop_cnt--;) {
561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src0 = LD_SB(src);
562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src2 = LD_SB(src + 16);
563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src3 = LD_SB(src + 24);
564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src1 = __msa_sldi_b(src2, src0, 8);
565da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
566da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src4 = LD_SB(src);
567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src6 = LD_SB(src + 16);
568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src7 = LD_SB(src + 24);
569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src5 = __msa_sldi_b(src6, src4, 8);
570da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
571da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
575da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                res2, res3);
578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                res6, res7);
580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
582da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(dst, 16, dst0, dst1);
583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
586da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB2(dst, 16, dst2, dst3);
587da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
590da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
591da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
592da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
5947bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int32_t src_stride, uint8_t *dst,
595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                              int32_t dst_stride,
5967bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                              int8_t *filter, int32_t height) {
597da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 filt0, dst0, dst1, dst2, dst3;
600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
601da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
602da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
603da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  mask = LD_SB(&mc_filt_mask_arr[0]);
604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* rearranging filter */
606da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt = LD_UH(filter);
607da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = height; loop_cnt--;) {
610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, 16, src0, src2, src4, src6);
611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src7 = LD_SB(src + 56);
612da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
613da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
616da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
617da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
618da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                out2, out3);
621da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
622da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                out6, out7);
623da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
626da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
627da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
628da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
629da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
632da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
633da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/*
 * Horizontal convolution with averaging into dst, dispatched on the block
 * width to the MSA helpers of this file.
 *
 *   src, src_stride  source pixels and the step between source rows
 *   dst, dst_stride  destination pixels (updated in place) and their row step
 *   filter_x         the 8 taps of the horizontal kernel
 *   x_step_q4        must be 16 (asserted)
 *   filter_y,
 *   y_step_q4        not used here; only forwarded to the C fallback
 *   w, h             block width and height
 *
 * Widths 4, 8, 16, 32 and 64 take an MSA path; every other width is handed to
 * vpx_convolve8_avg_horiz_c() with the original arguments.
 */
void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  /*
   * The 2-tap helpers are given &filt_hor[3] and fetch their taps with a full
   * 16-byte vector load from that address (the LD_UH(filter) load in
   * common_hz_2t_and_aver_dst_64w_msa), so an 8-byte array would be read 11
   * bytes past its end.  Pad the buffer to 3 + 16 bytes and zero it so that
   * such a load stays inside the object; only the leading taps of the loaded
   * vector are ever used, so results are unaffected.  The 8-tap helpers are
   * presumably loaded the same way from filt_hor itself and are covered too.
   */
  int8_t filt_hor[3 + 16] = { 0 };
  int cnt;

  /* Only a step of 16 is supported (presumably 1.0 in Q4 fixed point, i.e. no
   * scaling). */
  assert(x_step_q4 == 16);
  /* Taps 2 and 3, read as one 32-bit word, must not equal 0x800000; on a
   * little-endian target that is tap 2 == 0 together with tap 3 == 128.
   * NOTE(review): presumably this excludes the pass-through kernel - confirm. */
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  /* Narrow the eight 16-bit taps to 8 bits for the vector helpers. */
  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = (int8_t)filter_x[cnt];
  }

  if (((const int32_t *)filter_x)[0] == 0) {
    /* Taps 0 and 1 are both zero: use the 2-tap helpers, which take the tap
     * pair starting at index 3. */
    switch (w) {
      case 4:
        common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 32:
        common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 64:
        common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                                  x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  } else {
    /* General case: all eight taps go to the 8-tap helpers. */
    switch (w) {
      case 4:
        common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_hor, h);
        break;
      case 8:
        common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_hor, h);
        break;
      case 16:
        common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      case 32:
        common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      case 64:
        common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                                  x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }
}
704