1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "./vp8_rtcd.h"
12#include "vp8/common/mips/msa/vp8_macros_msa.h"
13
/* Transposes a 4x4 tile of 16-bit elements held in the low halves of four
 * v8i16 vectors (out0..out3 receive the transposed rows; outputs may alias
 * the inputs).  Implemented with the interleave (ILVR/ILVL) and doubleword
 * pack (PCKEV/PCKOD) helpers from vp8_macros_msa.h.
 * NOTE(review): when the inputs carry 8 valid lanes (the fdct8x4 path) this
 * transposes two side-by-side 4x4 tiles at once -- confirm against callers. */
#define TRANSPOSE4x4_H(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                   \
    v8i16 s0_m, s1_m, tp0_m, tp1_m, tp2_m, tp3_m;                   \
                                                                    \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                     \
    ILVRL_H2_SH(s1_m, s0_m, tp0_m, tp1_m);                          \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                     \
    ILVRL_H2_SH(s1_m, s0_m, tp2_m, tp3_m);                          \
    PCKEV_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out0, out2);            \
    PCKOD_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out1, out3);            \
}
25
/* Prepares two dot-product constant vectors for the DPADD steps below.
 * Splats lanes val0/val1/val2 of 'coeff' (SPLATI_H3_SH), then uses even-lane
 * interleaves (ILVEV) so that const1/const2 hold alternating pairs of those
 * multipliers, matching the {even,odd}-input layout produced by ILVR/ILVL
 * before each __msa_dpadd.  tmp0_m (the val0 splat) is consumed internally. */
#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2)    \
{                                                                   \
    v8i16 tmp0_m;                                                   \
                                                                    \
    SPLATI_H3_SH(coeff, val0, val1, val2, tmp0_m, const1, const2);  \
    ILVEV_H2_SH(tmp0_m, const1, const2, tmp0_m, const1, const2);    \
}
33
/* Per-lane predicate (GNU statement expression): yields a v8i16 with 1 in
 * every lane where in0 != 0 and 0 elsewhere.  __msa_ceqi_h sets a lane to
 * all-ones when it equals 0; XOR-ing with 255 flips the low 8 bits (and in
 * particular bit 0), so masking with 1 leaves exactly the inverted
 * equality bit, i.e. the "nonzero" flag. */
#define RET_1_IF_NZERO_H(in0)       \
({                                  \
    v8i16 tmp0_m;                   \
    v8i16 one_m = __msa_ldi_h(1);   \
                                    \
    tmp0_m = __msa_ceqi_h(in0, 0);  \
    tmp0_m = tmp0_m ^ 255;          \
    tmp0_m = one_m & tmp0_m;        \
                                    \
    tmp0_m;                         \
})
45
/* 32-bit-lane variant of RET_1_IF_NZERO_H: yields a v4i32 with 1 in every
 * lane where in0 != 0 and 0 elsewhere, using the same compare/flip/mask
 * bit trick on word lanes. */
#define RET_1_IF_NZERO_W(in0)       \
({                                  \
    v4i32 tmp0_m;                   \
    v4i32 one_m = __msa_ldi_w(1);   \
                                    \
    tmp0_m = __msa_ceqi_w(in0, 0);  \
    tmp0_m = tmp0_m ^ 255;          \
    tmp0_m = one_m & tmp0_m;        \
                                    \
    tmp0_m;                         \
})
57
/* Per-lane sign predicate: yields a v4i32 with 1 in every lane where in0 is
 * negative and 0 elsewhere.  __msa_clti_s_w produces an all-ones lane when
 * the signed compare (in0 < 0) holds, so masking with 1 keeps just bit 0. */
#define RET_1_IF_NEG_W(in0)           \
({                                    \
    v4i32 tmp0_m;                     \
                                      \
    v4i32 one_m = __msa_ldi_w(1);     \
    tmp0_m = __msa_clti_s_w(in0, 0);  \
    tmp0_m = one_m & tmp0_m;          \
                                      \
    tmp0_m;                           \
})
68
/* 4x4 forward DCT (MSA version).
 *
 * input : 16-bit residual block; LD_SH4 steps by pitch/2 elements, so
 *         'pitch' is presumably a byte stride between rows -- confirm at
 *         call sites.
 * output: 16 transform coefficients, written as two contiguous 8-lane
 *         stores.
 * pitch : row stride of the input block (see note above).
 */
void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch)
{
    v8i16 in0, in1, in2, in3;
    v8i16 temp0, temp1;
    v8i16 const0, const1;
    /* Lanes 0-3: DCT multipliers; lanes 4-7: fixed-point rounding offsets
     * consumed by the splats below. */
    v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
    v4i32 out0, out1, out2, out3;
    v8i16 zero = { 0 };

    /* Load one row per vector, then transpose so the first pass runs over
     * the original columns. */
    LD_SH4(input, pitch / 2, in0, in1, in2, in3);
    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

    /* First pass: butterfly sums/differences scaled by 8 (<< 3); the odd
     * outputs come from multiply-accumulates against coeff lanes 0-2 with a
     * rounding offset (lane 3 / widened lane 4), then >> 12. */
    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
    SLLI_4V(temp0, temp1, in1, in3, 3);
    in0 = temp0 + temp1;
    in2 = temp0 - temp1;
    SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
    temp0 = __msa_ilvr_h(in3, in1);
    in1 = __msa_splati_h(coeff, 3);       /* rounding offset for out0 */
    out0 = (v4i32)__msa_ilvev_h(zero, in1);
    coeff = __msa_ilvl_h(zero, coeff);    /* zero-extend lanes 4-7 to words */
    out1 = __msa_splati_w((v4i32)coeff, 0);
    DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);
    out0 >>= 12;
    out1 >>= 12;
    PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);
    /* Transpose back so the second pass runs over rows. */
    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

    /* Second pass: even outputs rounded with +7 then >> 4; odd outputs use
     * >> 16 fixed-point products plus a per-lane +1 where the corresponding
     * in3 lane was nonzero (RET_1_IF_NZERO_H) -- presumably mirroring the
     * scalar fdct's nonzero-input correction; verify against the C version. */
    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
    in0 = temp0 + temp1 + 7;
    in2 = temp0 - temp1 + 7;
    in0 >>= 4;
    in2 >>= 4;
    ILVR_H2_SW(zero, in0, zero, in2, out0, out2);
    temp1 = RET_1_IF_NZERO_H(in3);        /* 0/1 per lane: in3 != 0 */
    ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);
    SPLATI_W2_SW(coeff, 2, out3, out1);   /* rounding terms for this pass */
    out3 += out1;
    out1 = __msa_splati_w((v4i32)coeff, 1);
    DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);
    out1 >>= 16;
    out3 >>= 16;
    out1 += (v4i32)temp1;                 /* apply the nonzero correction */
    PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);
    ST_SH2(in0, in2, output, 8);
}
115
/* 8x4 forward DCT (MSA version): transforms two horizontally adjacent 4x4
 * blocks in one pass by using all 8 lanes of each v8i16.
 *
 * input : 16-bit residuals; rows are pitch/2 int16_t elements apart
 *         (pitch presumably a byte stride -- confirm at call sites).
 * output: 32 coefficients; one 4x4 block's worth goes to output[0..15] and
 *         the other to output[16..31], split by the even/odd doubleword
 *         packs at the end.
 * pitch : row stride of the input block.
 */
void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch)
{
    v8i16 in0, in1, in2, in3;
    v8i16 temp0, temp1, tmp0, tmp1;
    v8i16 const0, const1, const2;
    /* Lanes 0-3: DCT multipliers; lanes 4-7: fixed-point rounding offsets. */
    v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
    v8i16 zero = { 0 };
    v4i32 vec0_w, vec1_w, vec2_w, vec3_w;

    /* Load 4 rows (8 columns each) and transpose the two 4x4 tiles. */
    LD_SH4(input, pitch / 2, in0, in1, in2, in3);
    TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);

    /* First pass: butterflies scaled by 8; odd outputs from dot products
     * (coeff lanes 0-2) with rounding offsets, then >> 12.  vec0/vec2 start
     * as copies of the rounding seeds so both tile halves accumulate the
     * same offsets. */
    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
    SLLI_4V(temp0, temp1, in1, in3, 3);
    in0 = temp0 + temp1;
    in2 = temp0 - temp1;
    SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
    temp0 = __msa_splati_h(coeff, 3);     /* first-pass rounding offset */
    vec1_w = (v4i32)__msa_ilvev_h(zero, temp0);
    coeff = __msa_ilvl_h(zero, coeff);    /* zero-extend lanes 4-7 to words */
    vec3_w = __msa_splati_w((v4i32)coeff, 0);
    ILVRL_H2_SH(in3, in1, tmp1, tmp0);
    vec0_w = vec1_w;
    vec2_w = vec3_w;
    DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,
                 vec0_w, vec1_w, vec2_w, vec3_w);
    SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12);
    PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
    TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);

    /* Second pass: even outputs rounded with +7 then >> 4; odd outputs via
     * >> 16 dot products, plus +1 on lanes where in3 was nonzero
     * (RET_1_IF_NZERO_H) -- presumably the same nonzero-input correction as
     * the scalar fdct; verify against the C version. */
    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
    in0 = temp0 + temp1 + 7;
    in2 = temp0 - temp1 + 7;
    in0 >>= 4;
    in2 >>= 4;
    SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w);  /* second-pass rounding terms */
    vec3_w += vec1_w;
    vec1_w = __msa_splati_w((v4i32)coeff, 1);
    const0 = RET_1_IF_NZERO_H(in3);       /* 0/1 per lane: in3 != 0 */
    ILVRL_H2_SH(in3, in1, tmp1, tmp0);
    vec0_w = vec1_w;
    vec2_w = vec3_w;
    DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,
                 vec0_w, vec1_w, vec2_w, vec3_w);
    SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16);
    PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
    in1 += const0;                        /* apply the nonzero correction */

    /* De-interleave the two tiles: even doublewords form one 4x4 block,
     * odd doublewords the other. */
    PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1);
    ST_SH2(temp0, temp1, output, 8);

    PCKOD_D2_SH(in1, in0, in3, in2, in0, in2);
    ST_SH2(in0, in2, output + 16, 8);
}
169
/* 4x4 Walsh-Hadamard transform (MSA version), computed in 32-bit lanes to
 * avoid overflow after the << 2 scaling.
 *
 * input : 16-bit block; rows are pitch/2 int16_t elements apart (pitch
 *         presumably a byte stride -- confirm at call sites).
 * output: 16 coefficients, written as two contiguous 8-lane stores.
 * pitch : row stride of the input block.
 */
void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch)
{
    v8i16 in0_h, in1_h, in2_h, in3_h;
    v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3;

    /* Load and transpose so the first pass works on original columns. */
    LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h);
    TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h);

    /* Sign-extend the low 4 lanes of each row to 32-bit words. */
    UNPCK_R_SH_SW(in0_h, in0_w);
    UNPCK_R_SH_SW(in1_h, in1_w);
    UNPCK_R_SH_SW(in2_h, in2_w);
    UNPCK_R_SH_SW(in3_h, in3_w);
    /* First pass: two butterfly stages with a << 2 scale in between; the
     * first-stage sum (temp0, pre-overwrite) also feeds a +1 correction on
     * in0_w where it was nonzero -- presumably matching the scalar walsh's
     * (a1 != 0) adjustment; verify against the C version. */
    BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
    SLLI_4V(temp0, temp1, temp2, temp3, 2);
    BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
    temp0 = RET_1_IF_NZERO_W(temp0);      /* 0/1 per lane: first-stage sum != 0 */
    in0_w += temp0;
    TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w);

    /* Second pass: same butterfly structure, then a +1 bias on negative
     * lanes before the rounded (+3, >> 3) downscale. */
    BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
    BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
    in0_w += RET_1_IF_NEG_W(in0_w);
    in1_w += RET_1_IF_NEG_W(in1_w);
    in2_w += RET_1_IF_NEG_W(in2_w);
    in3_w += RET_1_IF_NEG_W(in3_w);
    ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w);
    SRA_4V(in0_w, in1_w, in2_w, in3_w, 3);
    /* Narrow back to 16 bits and store all 16 coefficients. */
    PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h);
    ST_SH2(in0_h, in1_h, output, 8);
}
200