1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <emmintrin.h>  // SSE2
12
13#include "./vpx_config.h"
14#include "./vpx_dsp_rtcd.h"
15#include "vpx_dsp/vpx_dsp_common.h"
16#include "vpx_dsp/x86/fwd_txfm_sse2.h"
17
18void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
19  __m128i in0, in1;
20  __m128i tmp;
21  const __m128i zero = _mm_setzero_si128();
22  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
23  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
24  in1 = _mm_unpacklo_epi64(
25      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
26  in0 = _mm_unpacklo_epi64(
27      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
28
29  tmp = _mm_add_epi16(in0, in1);
30  in0 = _mm_unpacklo_epi16(zero, tmp);
31  in1 = _mm_unpackhi_epi16(zero, tmp);
32  in0 = _mm_srai_epi32(in0, 16);
33  in1 = _mm_srai_epi32(in1, 16);
34
35  tmp = _mm_add_epi32(in0, in1);
36  in0 = _mm_unpacklo_epi32(tmp, zero);
37  in1 = _mm_unpackhi_epi32(tmp, zero);
38
39  tmp = _mm_add_epi32(in0, in1);
40  in0 = _mm_srli_si128(tmp, 8);
41
42  in1 = _mm_add_epi32(tmp, in0);
43  in0 = _mm_slli_epi32(in1, 1);
44  output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
45}
46
47void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
48  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
49  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
50  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
51  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
52  __m128i u0, u1, sum;
53
54  u0 = _mm_add_epi16(in0, in1);
55  u1 = _mm_add_epi16(in2, in3);
56
57  in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
58  in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
59  in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
60  in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
61
62  sum = _mm_add_epi16(u0, u1);
63
64  in0 = _mm_add_epi16(in0, in1);
65  in2 = _mm_add_epi16(in2, in3);
66  sum = _mm_add_epi16(sum, in0);
67
68  u0 = _mm_setzero_si128();
69  sum = _mm_add_epi16(sum, in2);
70
71  in0 = _mm_unpacklo_epi16(u0, sum);
72  in1 = _mm_unpackhi_epi16(u0, sum);
73  in0 = _mm_srai_epi32(in0, 16);
74  in1 = _mm_srai_epi32(in1, 16);
75
76  sum = _mm_add_epi32(in0, in1);
77  in0 = _mm_unpacklo_epi32(sum, u0);
78  in1 = _mm_unpackhi_epi32(sum, u0);
79
80  sum = _mm_add_epi32(in0, in1);
81  in0 = _mm_srli_si128(sum, 8);
82
83  in1 = _mm_add_epi32(sum, in0);
84  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
85}
86
87void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
88                          int stride) {
89  __m128i in0, in1, in2, in3;
90  __m128i u0, u1;
91  __m128i sum = _mm_setzero_si128();
92  int i;
93
94  for (i = 0; i < 2; ++i) {
95    in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
96    in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
97    in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
98    in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
99
100    u0 = _mm_add_epi16(in0, in1);
101    u1 = _mm_add_epi16(in2, in3);
102    sum = _mm_add_epi16(sum, u0);
103
104    in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
105    in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
106    in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
107    in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
108
109    sum = _mm_add_epi16(sum, u1);
110    u0 = _mm_add_epi16(in0, in1);
111    u1 = _mm_add_epi16(in2, in3);
112    sum = _mm_add_epi16(sum, u0);
113
114    in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
115    in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
116    in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
117    in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
118
119    sum = _mm_add_epi16(sum, u1);
120    u0 = _mm_add_epi16(in0, in1);
121    u1 = _mm_add_epi16(in2, in3);
122    sum = _mm_add_epi16(sum, u0);
123
124    in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
125    in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
126    in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
127    in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
128
129    sum = _mm_add_epi16(sum, u1);
130    u0 = _mm_add_epi16(in0, in1);
131    u1 = _mm_add_epi16(in2, in3);
132    sum = _mm_add_epi16(sum, u0);
133
134    sum = _mm_add_epi16(sum, u1);
135    input += 8 * stride;
136  }
137
138  u0 = _mm_setzero_si128();
139  in0 = _mm_unpacklo_epi16(u0, sum);
140  in1 = _mm_unpackhi_epi16(u0, sum);
141  in0 = _mm_srai_epi32(in0, 16);
142  in1 = _mm_srai_epi32(in1, 16);
143
144  sum = _mm_add_epi32(in0, in1);
145  in0 = _mm_unpacklo_epi32(sum, u0);
146  in1 = _mm_unpackhi_epi32(sum, u0);
147
148  sum = _mm_add_epi32(in0, in1);
149  in0 = _mm_srli_si128(sum, 8);
150
151  in1 = _mm_add_epi32(sum, in0);
152  in1 = _mm_srai_epi32(in1, 1);
153  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
154}
155
156void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
157                          int stride) {
158  __m128i in0, in1, in2, in3;
159  __m128i u0, u1;
160  __m128i sum = _mm_setzero_si128();
161  int i;
162
163  for (i = 0; i < 8; ++i) {
164    in0 = _mm_load_si128((const __m128i *)(input + 0));
165    in1 = _mm_load_si128((const __m128i *)(input + 8));
166    in2 = _mm_load_si128((const __m128i *)(input + 16));
167    in3 = _mm_load_si128((const __m128i *)(input + 24));
168
169    input += stride;
170    u0 = _mm_add_epi16(in0, in1);
171    u1 = _mm_add_epi16(in2, in3);
172    sum = _mm_add_epi16(sum, u0);
173
174    in0 = _mm_load_si128((const __m128i *)(input + 0));
175    in1 = _mm_load_si128((const __m128i *)(input + 8));
176    in2 = _mm_load_si128((const __m128i *)(input + 16));
177    in3 = _mm_load_si128((const __m128i *)(input + 24));
178
179    input += stride;
180    sum = _mm_add_epi16(sum, u1);
181    u0 = _mm_add_epi16(in0, in1);
182    u1 = _mm_add_epi16(in2, in3);
183    sum = _mm_add_epi16(sum, u0);
184
185    in0 = _mm_load_si128((const __m128i *)(input + 0));
186    in1 = _mm_load_si128((const __m128i *)(input + 8));
187    in2 = _mm_load_si128((const __m128i *)(input + 16));
188    in3 = _mm_load_si128((const __m128i *)(input + 24));
189
190    input += stride;
191    sum = _mm_add_epi16(sum, u1);
192    u0 = _mm_add_epi16(in0, in1);
193    u1 = _mm_add_epi16(in2, in3);
194    sum = _mm_add_epi16(sum, u0);
195
196    in0 = _mm_load_si128((const __m128i *)(input + 0));
197    in1 = _mm_load_si128((const __m128i *)(input + 8));
198    in2 = _mm_load_si128((const __m128i *)(input + 16));
199    in3 = _mm_load_si128((const __m128i *)(input + 24));
200
201    input += stride;
202    sum = _mm_add_epi16(sum, u1);
203    u0 = _mm_add_epi16(in0, in1);
204    u1 = _mm_add_epi16(in2, in3);
205    sum = _mm_add_epi16(sum, u0);
206
207    sum = _mm_add_epi16(sum, u1);
208  }
209
210  u0 = _mm_setzero_si128();
211  in0 = _mm_unpacklo_epi16(u0, sum);
212  in1 = _mm_unpackhi_epi16(u0, sum);
213  in0 = _mm_srai_epi32(in0, 16);
214  in1 = _mm_srai_epi32(in1, 16);
215
216  sum = _mm_add_epi32(in0, in1);
217  in0 = _mm_unpacklo_epi32(sum, u0);
218  in1 = _mm_unpackhi_epi32(sum, u0);
219
220  sum = _mm_add_epi32(in0, in1);
221  in0 = _mm_srli_si128(sum, 8);
222
223  in1 = _mm_add_epi32(sum, in0);
224  in1 = _mm_srai_epi32(in1, 3);
225  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
226}
227
/* Instantiate the full 2-D forward transforms from the shared template
 * headers.  Each #include below expands into a complete function whose
 * name is supplied by the FDCT*_2D macro defined just before it.
 * NOTE(review): DCT_HIGH_BIT_DEPTH presumably toggles overflow handling
 * inside the template headers — confirm in fwd_txfm_impl_sse2.h. */
#define DCT_HIGH_BIT_DEPTH 0
#define FDCT4x4_2D vpx_fdct4x4_sse2
#define FDCT8x8_2D vpx_fdct8x8_sse2
#define FDCT16x16_2D vpx_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

/* 32x32 "_rd" variant: instantiated with FDCT32x32_HIGH_PRECISION == 0. */
#define FDCT32x32_2D vpx_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

/* 32x32 variant with FDCT32x32_HIGH_PRECISION == 1. */
#define FDCT32x32_2D vpx_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH

/* High-bitdepth builds re-instantiate the same templates with
 * DCT_HIGH_BIT_DEPTH == 1 under vpx_highbd_* names. */
#if CONFIG_VP9_HIGHBITDEPTH
#define DCT_HIGH_BIT_DEPTH 1
#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH
#endif  // CONFIG_VP9_HIGHBITDEPTH
273