1da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/*
2da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *
4da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *  Use of this source code is governed by a BSD-style license
5da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *  that can be found in the LICENSE file in the root of the source
6da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *  tree. An additional intellectual property rights grant can be found
7da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *  in the file PATENTS.  All contributing project authors may
8da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *  be found in the AUTHORS file in the root of the source tree.
9da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian */
10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vp8_rtcd.h"
12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_ports/mem.h"
13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vp8/common/filter.h"
14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vp8/common/mips/msa/vp8_macros_msa.h"
15da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
16da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh VenkatasubramanianDECLARE_ALIGNED(16, static const int8_t, vp8_bilinear_filters_msa[7][2]) =
17da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
18da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    { 112, 16 },
19da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    { 96, 32 },
20da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    { 80, 48 },
21da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    { 64, 64 },
22da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    { 48, 80 },
23da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    { 32, 96 },
24da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    { 16, 112 }
25da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian};
26da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
/* Byte-shuffle control vectors (three of them, 16 bytes each) that the
 * horizontal filters in this file load with LD_SB. */
static const uint8_t vp8_mc_filt_mask_arr[16 * 3] =
{
    /* byte offset 0: loaded by the 8- and 16-pixel-wide horizontal filters */
    0, 1, 1, 2, 2, 3, 3, 4,
    4, 5, 5, 6, 6, 7, 7, 8,
    /* byte offset 16: loaded by the 4-pixel-wide horizontal filters */
    0, 1, 1, 2, 2, 3, 3, 4,
    16, 17, 17, 18, 18, 19, 19, 20,
    /* byte offset 32: a further 4-pixel-wide pattern */
    8, 9, 9, 10, 10, 11, 11, 12,
    24, 25, 25, 26, 26, 27, 27, 28
};
36da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
37da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 uint8_t *RESTRICT dst, int32_t dst_stride,
39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 const int8_t *filter)
40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 src0, src1, src2, src3, mask;
42da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 filt0, vec0, vec1, res0, res1;
43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 vec2, vec3, filt;
44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
46da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
47da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_UH(filter);
48da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
49da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
50da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src1, src2, src3);
51da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(vec2, vec3, VP8_FILTER_SHIFT);
54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
56da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
57da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 uint8_t *RESTRICT dst, int32_t dst_stride,
60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 const int8_t *filter)
61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 vec0, vec1, vec2, vec3, filt0;
63da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
64da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 res0, res1, res2, res3;
65da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 vec4, vec5, vec6, vec7, filt;
66da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_UH(filter);
70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
71da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
72da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
73da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                vec4, vec5, vec6, vec7);
77da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
79da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                res0, res1, res2, res3);
80da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
81da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
82da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
83da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                uint8_t *RESTRICT dst, int32_t dst_stride,
87da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                const int8_t *filter, int32_t height)
88da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (4 == height)
90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
92da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
93da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    else if (8 == height)
94da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
95da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
96da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
97da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
98da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 uint8_t *RESTRICT dst, int32_t dst_stride,
101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 const int8_t *filter)
102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 filt0;
104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 src0, src1, src2, src3, mask;
105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 vec0, vec1, vec2, vec3, filt;
106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_UH(filter);
110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src1, src2, src3);
113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                vec0, vec1, vec2, vec3);
117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST8x4_UB(src0, src1, dst, dst_stride);
120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     uint8_t *RESTRICT dst, int32_t dst_stride,
124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     const int8_t *filter, int32_t height)
125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 filt0;
127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 src0, src1, src2, src3, mask, out0, out1;
128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 vec0, vec1, vec2, vec3, filt;
129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_UH(filter);
133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src1, src2, src3);
136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                vec0, vec1, vec2, vec3);
142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src1, src2, src3);
145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST8x4_UB(out0, out1, dst, dst_stride);
149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                vec0, vec1, vec2, vec3);
155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST8x4_UB(out0, out1, dst, dst_stride);
158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (16 == height)
161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        LD_SB4(src, src_stride, src0, src1, src2, src3);
163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        src += (4 * src_stride);
164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                    vec0, vec1, vec2, vec3);
169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        LD_SB4(src, src_stride, src0, src1, src2, src3);
171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        src += (4 * src_stride);
172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ST8x4_UB(out0, out1, dst, dst_stride);
175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                    vec0, vec1, vec2, vec3);
180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                uint8_t *RESTRICT dst, int32_t dst_stride,
188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                const int8_t *filter, int32_t height)
189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (4 == height)
191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    else
195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hz_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 uint8_t *RESTRICT dst, int32_t dst_stride,
202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 const int8_t *filter, int32_t height)
203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    uint32_t loop_cnt;
205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    loop_cnt = (height >> 2) - 1;
212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_UH(filter);
214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, src_stride, src0, src2, src4, src6);
217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (4 * src_stride);
219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
225da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                out0, out1, out2, out3);
226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                out4, out5, out6, out7);
228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_ST_SB(out0, out1, dst);
231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_ST_SB(out2, out3, dst);
233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_ST_SB(out4, out5, dst);
235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
236da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_ST_SB(out6, out7, dst);
237da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += dst_stride;
238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (; loop_cnt--;)
240da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
241da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        LD_SB4(src, src_stride, src0, src2, src4, src6);
242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        src += (4 * src_stride);
244da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
245da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
248da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
249da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                    out0, out1, out2, out3);
251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
252da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                    out4, out5, out6, out7);
253da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_ST_SB(out0, out1, dst);
256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += dst_stride;
257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_ST_SB(out2, out3, dst);
258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += dst_stride;
259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_ST_SB(out4, out5, dst);
260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += dst_stride;
261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_ST_SB(out6, out7, dst);
262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += dst_stride;
263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
265da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
266da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_vt_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
267da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 uint8_t *RESTRICT dst, int32_t dst_stride,
268da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 const int8_t *filter)
269da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
270da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 src0, src1, src2, src3, src4;
271da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
272da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 filt0;
273da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8i16 filt;
274da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 tmp0, tmp1;
275da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
276da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_SH(filter);
277da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt0 = (v16u8)__msa_splati_h(filt, 0);
278da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
279da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (5 * src_stride);
281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
283da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian               src10_r, src21_r, src32_r, src43_r);
284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
286da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
287da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
288da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
289da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
291da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_vt_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
292da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 uint8_t *RESTRICT dst, int32_t dst_stride,
293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 const int8_t *filter)
294da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
295da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
296da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
297da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
298da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 tmp0, tmp1, tmp2, tmp3;
299da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 filt0;
300da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8i16 filt;
301da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
302da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_SH(filter);
303da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt0 = (v16u8)__msa_splati_h(filt, 0);
304da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
305da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
306da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (8 * src_stride);
307da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
308da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src8 = LD_SB(src);
309da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
310da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
311da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian               src32_r, src43_r);
313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
314da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian               src76_r, src87_r);
315da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
316da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian               src87_r, src76_r, src2110, src4332, src6554, src8776);
317da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
318da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                tmp0, tmp1, tmp2, tmp3);
319da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
320da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
321da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
322da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
323da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
324da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
325da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_vt_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
326da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                uint8_t *RESTRICT dst, int32_t dst_stride,
327da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                const int8_t *filter, int32_t height)
328da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
329da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (4 == height)
330da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
331da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
332da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
333da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    else if (8 == height)
334da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
336da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
337da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
338da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
339da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_vt_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
340da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 uint8_t *RESTRICT dst, int32_t dst_stride,
341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 const int8_t *filter)
342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 out0, out1;
345da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 tmp0, tmp1, tmp2, tmp3;
346da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8i16 filt;
347da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_SH(filter);
349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt0 = (v16u8)__msa_splati_h(filt, 0);
350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
352da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
353da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                tmp0, tmp1, tmp2, tmp3);
356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST8x4_UB(out0, out1, dst, dst_stride);
359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_vt_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
362da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     uint8_t *RESTRICT dst, int32_t dst_stride,
363da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     const int8_t *filter, int32_t height)
364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    uint32_t loop_cnt;
366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
367da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
368da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 out0, out1;
369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 tmp0, tmp1, tmp2, tmp3;
370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8i16 filt;
371da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
372da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_SH(filter);
373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt0 = (v16u8)__msa_splati_h(filt, 0);
374da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
375da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src0 = LD_UB(src);
376da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
377da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (loop_cnt = (height >> 3); loop_cnt--;)
379da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
380da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
381da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        src += (8 * src_stride);
382da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
383da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                   vec0, vec1, vec2, vec3);
385da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
386da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                   vec4, vec5, vec6, vec7);
387da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
388da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                    tmp0, tmp1, tmp2, tmp3);
389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ST8x4_UB(out0, out1, dst, dst_stride);
392da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += (4 * dst_stride);
393da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
394da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
395da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                    tmp0, tmp1, tmp2, tmp3);
396da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
397da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
398da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ST8x4_UB(out0, out1, dst, dst_stride);
399da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += (4 * dst_stride);
400da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
401da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        src0 = src8;
402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
403da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
404da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
405da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_vt_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
406da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                uint8_t *RESTRICT dst, int32_t dst_stride,
407da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                const int8_t *filter, int32_t height)
408da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
409da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (4 == height)
410da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
411da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
412da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
413da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    else
414da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
415da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 height);
417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
419da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
420da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_vt_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
421da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 uint8_t *RESTRICT dst, int32_t dst_stride,
422da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 const int8_t *filter, int32_t height)
423da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    uint32_t loop_cnt;
425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 src0, src1, src2, src3, src4;
426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
427da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 tmp0, tmp1, tmp2, tmp3;
428da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8i16 filt;
429da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
430da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_SH(filter);
431da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt0 = (v16u8)__msa_splati_h(filt, 0);
432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
433da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src0 = LD_UB(src);
434da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
435da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
436da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (loop_cnt = (height >> 2); loop_cnt--;)
437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        LD_UB4(src, src_stride, src1, src2, src3, src4);
439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        src += (4 * src_stride);
440da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
441da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
442da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
443da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_ST_SB(tmp0, tmp1, dst);
446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += dst_stride;
447da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
448da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
449da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
450da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
451da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
452da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_ST_SB(tmp2, tmp3, dst);
453da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += dst_stride;
454da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
455da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
456da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_ST_SB(tmp0, tmp1, dst);
458da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += dst_stride;
459da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
460da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
461da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_ST_SB(tmp2, tmp3, dst);
463da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += dst_stride;
464da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
465da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        src0 = src4;
466da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
467da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
468da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      uint8_t *RESTRICT dst, int32_t dst_stride,
471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      const int8_t *filter_horiz,
472da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      const int8_t *filter_vert)
473da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
474da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 src0, src1, src2, src3, src4, mask;
475da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
476da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
477da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
478da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
479da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
480da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_UH(filter_horiz);
481da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
482da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_UH(filter_vert);
483da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
484da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
485da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
486da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
487da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
488da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
489da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
490da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
491da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
492da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
493da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
494da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
495da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
496da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
497da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
498da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
499da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
500da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      uint8_t *RESTRICT dst, int32_t dst_stride,
501da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      const int8_t *filter_horiz,
502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      const int8_t *filter_vert)
503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
505da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 res0, res1, res2, res3;
506da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
508da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
509da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
510da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
512da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_UH(filter_horiz);
513da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
514da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_UH(filter_vert);
515da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
516da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
517da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
518da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (8 * src_stride);
519da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src8 = LD_SB(src);
520da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
521da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
522da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
523da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, VP8_FILTER_SHIFT);
524da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, VP8_FILTER_SHIFT);
525da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, VP8_FILTER_SHIFT);
526da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian               hz_out3, hz_out5, 8);
528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
532da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
533da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                vec4, vec5, vec6, vec7);
534da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
536da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                res0, res1, res2, res3);
537da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
538da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    dst += (4 * dst_stride);
539da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
540da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
542da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
543da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     uint8_t *RESTRICT dst, int32_t dst_stride,
544da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     const int8_t *filter_horiz,
545da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     const int8_t *filter_vert,
546da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     int32_t height)
547da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
548da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (4 == height)
549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
551da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                  filter_horiz, filter_vert);
552da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
553da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    else if (8 == height)
554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                  filter_horiz, filter_vert);
557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
559da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
560da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
561da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      uint8_t *RESTRICT dst, int32_t dst_stride,
562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      const int8_t *filter_horiz,
563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      const int8_t *filter_vert)
564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
565da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
566da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
567da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
568da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8i16 filt;
569da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
570da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
571da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
572da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_SH(filter_horiz);
573da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt_hz = (v16u8)__msa_splati_h(filt, 0);
574da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_SH(filter_vert);
575da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt_vt = (v16u8)__msa_splati_h(filt, 0);
576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
582da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
583da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
584da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
585da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
586da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
587da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
588da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
590da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
591da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
592da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
593da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
594da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
595da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
596da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
597da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
598da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST8x4_UB(out0, out1, dst, dst_stride);
599da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
600da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
601da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_8x8mult_msa(uint8_t *RESTRICT src,
602da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                          int32_t src_stride,
603da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                          uint8_t *RESTRICT dst,
604da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                          int32_t dst_stride,
605da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                          const int8_t *filter_horiz,
606da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                          const int8_t *filter_vert,
607da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                          int32_t height)
608da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
609da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    uint32_t loop_cnt;
610da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
611da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 filt_hz, filt_vt, vec0;
612da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
613da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8i16 filt;
614da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
616da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
617da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_SH(filter_horiz);
618da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt_hz = (v16u8)__msa_splati_h(filt, 0);
619da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_SH(filter_vert);
620da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt_vt = (v16u8)__msa_splati_h(filt, 0);
621da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
622da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src0 = LD_SB(src);
623da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
624da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
625da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
626da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
627da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (loop_cnt = (height >> 3); loop_cnt--;)
628da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
629da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        LD_SB4(src, src_stride, src1, src2, src3, src4);
630da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        src += (4 * src_stride);
631da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
632da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz,
633da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
634da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
635da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
636da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
637da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz,
638da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
639da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
640da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
641da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
642da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
643da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
644da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz,
645da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
646da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
647da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
648da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
649da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz,
650da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
651da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        LD_SB4(src, src_stride, src1, src2, src3, src4);
652da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        src += (4 * src_stride);
653da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
654da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        tmp4 = __msa_dotp_u_h(vec0, filt_vt);
655da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
656da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H2_UH(tmp3, tmp4, VP8_FILTER_SHIFT);
657da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
658da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ST8x4_UB(out0, out1, dst, dst_stride);
659da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += (4 * dst_stride);
660da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
661da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz,
662da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
663da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
664da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        tmp5 = __msa_dotp_u_h(vec0, filt_vt);
665da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
666da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz,
667da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
668da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
669da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        tmp6 = __msa_dotp_u_h(vec0, filt_vt);
670da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
671da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz,
672da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
673da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
674da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        tmp7 = __msa_dotp_u_h(vec0, filt_vt);
675da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
676da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz,
677da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
678da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
679da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        tmp8 = __msa_dotp_u_h(vec0, filt_vt);
680da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
681da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, VP8_FILTER_SHIFT);
682da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
683da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ST8x4_UB(out0, out1, dst, dst_stride);
684da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += (4 * dst_stride);
685da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
686da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
687da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
688da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
689da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     uint8_t *RESTRICT dst, int32_t dst_stride,
690da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     const int8_t *filter_horiz,
691da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     const int8_t *filter_vert,
692da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     int32_t height)
693da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
694da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (4 == height)
695da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
696da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
697da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                  filter_horiz, filter_vert);
698da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
699da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    else
700da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
701da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
702da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      filter_horiz, filter_vert, height);
703da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
704da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
705da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
706da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void common_hv_2ht_2vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
707da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      uint8_t *RESTRICT dst, int32_t dst_stride,
708da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      const int8_t *filter_horiz,
709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      const int8_t *filter_vert,
710da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      int32_t height)
711da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
712da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    uint32_t loop_cnt;
713da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
714da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v16u8 filt_hz, filt_vt, vec0, vec1;
715da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
716da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    v8i16 filt;
717da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
718da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
719da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
720da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    /* rearranging filter */
721da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_SH(filter_horiz);
722da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt_hz = (v16u8)__msa_splati_h(filt, 0);
723da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt = LD_SH(filter_vert);
724da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    filt_vt = (v16u8)__msa_splati_h(filt, 0);
725da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
726da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB2(src, 8, src0, src1);
727da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
728da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
729da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
730da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
731da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
732da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    for (loop_cnt = (height >> 2); loop_cnt--;)
733da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
734da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        LD_SB4(src, src_stride, src0, src2, src4, src6);
735da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
736da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        src += (4 * src_stride);
737da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
738da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz,
739da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
740da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz,
741da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
742da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
743da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
744da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
745da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_ST_SB(tmp1, tmp2, dst);
746da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += dst_stride;
747da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
748da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz,
749da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
750da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz,
751da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
752da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
753da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
754da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
755da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_ST_SB(tmp1, tmp2, dst);
756da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += dst_stride;
757da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
758da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz,
759da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
760da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz,
761da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
762da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
763da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
764da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
765da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_ST_SB(tmp1, tmp2, dst);
766da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += dst_stride;
767da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
768da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz,
769da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
770da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz,
771da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     VP8_FILTER_SHIFT);
772da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
773da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
774da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
775da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        PCKEV_ST_SB(tmp1, tmp2, dst);
776da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        dst += dst_stride;
777da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
778da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
779da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
780da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vp8_bilinear_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
781da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 int32_t xoffset, int32_t yoffset,
782da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 uint8_t *RESTRICT dst, int32_t dst_stride)
783da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
784da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
785da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
786da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
787da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (yoffset)
788da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
789da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        if (xoffset)
790da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
791da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            common_hv_2ht_2vt_4w_msa(src, src_stride, dst, dst_stride,
792da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     h_filter, v_filter, 4);
793da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
794da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        else
795da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
796da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            common_vt_2t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
797da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
798da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
799da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    else
800da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
801da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        if (xoffset)
802da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
803da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            common_hz_2t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
804da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
805da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        else
806da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
807da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            uint32_t tp0, tp1, tp2, tp3;
808da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
809da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            LW4(src, src_stride, tp0, tp1, tp2, tp3);
810da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
811da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
812da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
813da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
814da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
815da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vp8_bilinear_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
816da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 int32_t xoffset, int32_t yoffset,
817da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 uint8_t *RESTRICT dst, int32_t dst_stride)
818da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
819da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
820da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
821da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
822da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (yoffset)
823da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
824da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        if (xoffset)
825da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
826da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride,
827da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     h_filter, v_filter, 4);
828da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
829da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        else
830da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
831da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
832da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
833da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
834da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    else
835da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
836da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        if (xoffset)
837da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
838da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
839da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
840da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        else
841da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
842da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            vp8_copy_mem8x4(src, src_stride, dst, dst_stride);
843da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
844da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
845da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
846da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
847da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vp8_bilinear_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
848da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 int32_t xoffset, int32_t yoffset,
849da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 uint8_t *RESTRICT dst, int32_t dst_stride)
850da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
851da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
852da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
853da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
854da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (yoffset)
855da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
856da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        if (xoffset)
857da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
858da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride,
859da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                     h_filter, v_filter, 8);
860da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
861da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        else
862da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
863da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8);
864da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
865da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
866da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    else
867da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
868da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        if (xoffset)
869da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
870da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8);
871da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
872da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        else
873da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
874da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            vp8_copy_mem8x8(src, src_stride, dst, dst_stride);
875da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
876da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
877da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
878da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
879da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vp8_bilinear_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
880da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                   int32_t xoffset, int32_t yoffset,
881da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                   uint8_t *RESTRICT dst, int32_t dst_stride)
882da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian{
883da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
884da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
885da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
886da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    if (yoffset)
887da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
888da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        if (xoffset)
889da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
890da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride,
891da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                      h_filter, v_filter, 16);
892da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
893da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        else
894da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
895da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            common_vt_2t_16w_msa(src, src_stride, dst, dst_stride, v_filter,
896da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 16);
897da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
898da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
899da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    else
900da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    {
901da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        if (xoffset)
902da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
903da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            common_hz_2t_16w_msa(src, src_stride, dst, dst_stride, h_filter,
904da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                 16);
905da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
906da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        else
907da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        {
908da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian            vp8_copy_mem16x16(src, src_stride, dst, dst_stride);
909da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        }
910da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
911da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
912