1/*
2 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <emmintrin.h>  // SSE2
13#include "./vpx_config.h"
14#include "vpx/vpx_integer.h"
15#include "vp9/common/vp9_common.h"
16#include "vp9/common/vp9_idct.h"
17
18#define RECON_AND_STORE4X4(dest, in_x) \
19{                                                     \
20  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
21  d0 = _mm_unpacklo_epi8(d0, zero); \
22  d0 = _mm_add_epi16(in_x, d0); \
23  d0 = _mm_packus_epi16(d0, d0); \
24  *(int *)dest = _mm_cvtsi128_si32(d0); \
25  dest += stride; \
26}
27
28void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
29  const __m128i zero = _mm_setzero_si128();
30  const __m128i eight = _mm_set1_epi16(8);
31  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
32                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
33                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
34                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
35  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
36  __m128i input0, input1, input2, input3;
37
38  // Rows
39  input0 = _mm_load_si128((const __m128i *)input);
40  input2 = _mm_load_si128((const __m128i *)(input + 8));
41
42  // Construct i3, i1, i3, i1, i2, i0, i2, i0
43  input0 = _mm_shufflelo_epi16(input0, 0xd8);
44  input0 = _mm_shufflehi_epi16(input0, 0xd8);
45  input2 = _mm_shufflelo_epi16(input2, 0xd8);
46  input2 = _mm_shufflehi_epi16(input2, 0xd8);
47
48  input1 = _mm_unpackhi_epi32(input0, input0);
49  input0 = _mm_unpacklo_epi32(input0, input0);
50  input3 = _mm_unpackhi_epi32(input2, input2);
51  input2 = _mm_unpacklo_epi32(input2, input2);
52
53  // Stage 1
54  input0 = _mm_madd_epi16(input0, cst);
55  input1 = _mm_madd_epi16(input1, cst);
56  input2 = _mm_madd_epi16(input2, cst);
57  input3 = _mm_madd_epi16(input3, cst);
58
59  input0 = _mm_add_epi32(input0, rounding);
60  input1 = _mm_add_epi32(input1, rounding);
61  input2 = _mm_add_epi32(input2, rounding);
62  input3 = _mm_add_epi32(input3, rounding);
63
64  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
65  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
66  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
67  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
68
69  // Stage 2
70  input0 = _mm_packs_epi32(input0, input1);
71  input1 = _mm_packs_epi32(input2, input3);
72
73  // Transpose
74  input2 = _mm_unpacklo_epi16(input0, input1);
75  input3 = _mm_unpackhi_epi16(input0, input1);
76  input0 = _mm_unpacklo_epi32(input2, input3);
77  input1 = _mm_unpackhi_epi32(input2, input3);
78
79  // Switch column2, column 3, and then, we got:
80  // input2: column1, column 0;  input3: column2, column 3.
81  input1 = _mm_shuffle_epi32(input1, 0x4e);
82  input2 = _mm_add_epi16(input0, input1);
83  input3 = _mm_sub_epi16(input0, input1);
84
85  // Columns
86  // Construct i3, i1, i3, i1, i2, i0, i2, i0
87  input0 = _mm_unpacklo_epi32(input2, input2);
88  input1 = _mm_unpackhi_epi32(input2, input2);
89  input2 = _mm_unpackhi_epi32(input3, input3);
90  input3 = _mm_unpacklo_epi32(input3, input3);
91
92  // Stage 1
93  input0 = _mm_madd_epi16(input0, cst);
94  input1 = _mm_madd_epi16(input1, cst);
95  input2 = _mm_madd_epi16(input2, cst);
96  input3 = _mm_madd_epi16(input3, cst);
97
98  input0 = _mm_add_epi32(input0, rounding);
99  input1 = _mm_add_epi32(input1, rounding);
100  input2 = _mm_add_epi32(input2, rounding);
101  input3 = _mm_add_epi32(input3, rounding);
102
103  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
104  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
105  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
106  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
107
108  // Stage 2
109  input0 = _mm_packs_epi32(input0, input2);
110  input1 = _mm_packs_epi32(input1, input3);
111
112  // Transpose
113  input2 = _mm_unpacklo_epi16(input0, input1);
114  input3 = _mm_unpackhi_epi16(input0, input1);
115  input0 = _mm_unpacklo_epi32(input2, input3);
116  input1 = _mm_unpackhi_epi32(input2, input3);
117
118  // Switch column2, column 3, and then, we got:
119  // input2: column1, column 0;  input3: column2, column 3.
120  input1 = _mm_shuffle_epi32(input1, 0x4e);
121  input2 = _mm_add_epi16(input0, input1);
122  input3 = _mm_sub_epi16(input0, input1);
123
124  // Final round and shift
125  input2 = _mm_add_epi16(input2, eight);
126  input3 = _mm_add_epi16(input3, eight);
127
128  input2 = _mm_srai_epi16(input2, 4);
129  input3 = _mm_srai_epi16(input3, 4);
130
131  // Reconstruction and Store
132  {
133     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
134     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
135     d0 = _mm_unpacklo_epi32(d0,
136          _mm_cvtsi32_si128(*(const int *) (dest + stride)));
137     d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
138                    *(const int *) (dest + stride * 3)), d2);
139     d0 = _mm_unpacklo_epi8(d0, zero);
140     d2 = _mm_unpacklo_epi8(d2, zero);
141     d0 = _mm_add_epi16(d0, input2);
142     d2 = _mm_add_epi16(d2, input3);
143     d0 = _mm_packus_epi16(d0, d2);
144     // store input0
145     *(int *)dest = _mm_cvtsi128_si32(d0);
146     // store input1
147     d0 = _mm_srli_si128(d0, 4);
148     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
149     // store input2
150     d0 = _mm_srli_si128(d0, 4);
151     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
152     // store input3
153     d0 = _mm_srli_si128(d0, 4);
154     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
155  }
156}
157
158void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
159  __m128i dc_value;
160  const __m128i zero = _mm_setzero_si128();
161  int a;
162
163  a = dct_const_round_shift(input[0] * cospi_16_64);
164  a = dct_const_round_shift(a * cospi_16_64);
165  a = ROUND_POWER_OF_TWO(a, 4);
166
167  dc_value = _mm_set1_epi16(a);
168
169  RECON_AND_STORE4X4(dest, dc_value);
170  RECON_AND_STORE4X4(dest, dc_value);
171  RECON_AND_STORE4X4(dest, dc_value);
172  RECON_AND_STORE4X4(dest, dc_value);
173}
174
175static INLINE void transpose_4x4(__m128i *res) {
176  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
177  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
178
179  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
180  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
181}
182
183static void idct4_sse2(__m128i *in) {
184  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
185  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
186  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
187  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
188  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
189  __m128i u[8], v[8];
190
191  transpose_4x4(in);
192  // stage 1
193  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
194  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
195  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
196  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
197  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
198  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
199
200  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
201  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
202  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
203  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
204
205  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
206  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
207  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
208  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
209
210  u[0] = _mm_packs_epi32(v[0], v[1]);
211  u[1] = _mm_packs_epi32(v[3], v[2]);
212
213  // stage 2
214  in[0] = _mm_add_epi16(u[0], u[1]);
215  in[1] = _mm_sub_epi16(u[0], u[1]);
216  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
217}
218
219static void iadst4_sse2(__m128i *in) {
220  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
221  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
222  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
223  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
224  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
225  const __m128i kZero = _mm_set1_epi16(0);
226  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
227  __m128i u[8], v[8], in7;
228
229  transpose_4x4(in);
230  in7 = _mm_srli_si128(in[1], 8);
231  in7 = _mm_add_epi16(in7, in[0]);
232  in7 = _mm_sub_epi16(in7, in[1]);
233
234  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
235  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
236  u[2] = _mm_unpacklo_epi16(in7, kZero);
237  u[3] = _mm_unpackhi_epi16(in[0], kZero);
238
239  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
240  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
241  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
242  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
243  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
244  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
245
246  u[0] = _mm_add_epi32(v[0], v[1]);
247  u[1] = _mm_add_epi32(v[3], v[4]);
248  u[2] = v[2];
249  u[3] = _mm_add_epi32(u[0], u[1]);
250  u[4] = _mm_slli_epi32(v[5], 2);
251  u[5] = _mm_add_epi32(u[3], v[5]);
252  u[6] = _mm_sub_epi32(u[5], u[4]);
253
254  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
255  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
256  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
257  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
258
259  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
260  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
261  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
262  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
263
264  in[0] = _mm_packs_epi32(u[0], u[1]);
265  in[1] = _mm_packs_epi32(u[2], u[3]);
266}
267
268void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
269                            int tx_type) {
270  __m128i in[2];
271  const __m128i zero = _mm_setzero_si128();
272  const __m128i eight = _mm_set1_epi16(8);
273
274  in[0]= _mm_loadu_si128((const __m128i *)(input));
275  in[1]= _mm_loadu_si128((const __m128i *)(input + 8));
276
277  switch (tx_type) {
278    case 0:  // DCT_DCT
279      idct4_sse2(in);
280      idct4_sse2(in);
281      break;
282    case 1:  // ADST_DCT
283      idct4_sse2(in);
284      iadst4_sse2(in);
285      break;
286    case 2:  // DCT_ADST
287      iadst4_sse2(in);
288      idct4_sse2(in);
289      break;
290    case 3:  // ADST_ADST
291      iadst4_sse2(in);
292      iadst4_sse2(in);
293      break;
294    default:
295      assert(0);
296      break;
297  }
298
299  // Final round and shift
300  in[0] = _mm_add_epi16(in[0], eight);
301  in[1] = _mm_add_epi16(in[1], eight);
302
303  in[0] = _mm_srai_epi16(in[0], 4);
304  in[1] = _mm_srai_epi16(in[1], 4);
305
306  // Reconstruction and Store
307  {
308     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
309     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
310     d0 = _mm_unpacklo_epi32(d0,
311          _mm_cvtsi32_si128(*(const int *) (dest + stride)));
312     d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(
313                    *(const int *) (dest + stride * 3)));
314     d0 = _mm_unpacklo_epi8(d0, zero);
315     d2 = _mm_unpacklo_epi8(d2, zero);
316     d0 = _mm_add_epi16(d0, in[0]);
317     d2 = _mm_add_epi16(d2, in[1]);
318     d0 = _mm_packus_epi16(d0, d2);
319     // store result[0]
320     *(int *)dest = _mm_cvtsi128_si32(d0);
321     // store result[1]
322     d0 = _mm_srli_si128(d0, 4);
323     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
324     // store result[2]
325     d0 = _mm_srli_si128(d0, 4);
326     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
327     // store result[3]
328     d0 = _mm_srli_si128(d0, 4);
329     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
330  }
331}
332
333#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
334                      out0, out1, out2, out3, out4, out5, out6, out7) \
335  {                                                     \
336    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
337    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
338    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
339    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
340    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
341    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
342    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
343    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
344                                                        \
345    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
346    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
347    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
348    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
349    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
350    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
351    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
352    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
353                                                            \
354    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
355    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
356    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
357    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
358    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
359    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
360    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
361    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
362  }
363
364#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
365                         out0, out1, out2, out3) \
366  {                                              \
367    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
368    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
369    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
370    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
371    \
372    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
373    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
374    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
375    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
376    \
377    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
378    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
379    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
380    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
381  }
382
383#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
384  {                                                     \
385    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
386    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
387                                                        \
388    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
389    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
390  }
391
392#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
393  {                                            \
394    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
395    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
396    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
397    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
398  }
399
400// Define Macro for multiplying elements by constants and adding them together.
401#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
402                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
403  {   \
404      tmp0 = _mm_madd_epi16(lo_0, cst0); \
405      tmp1 = _mm_madd_epi16(hi_0, cst0); \
406      tmp2 = _mm_madd_epi16(lo_0, cst1); \
407      tmp3 = _mm_madd_epi16(hi_0, cst1); \
408      tmp4 = _mm_madd_epi16(lo_1, cst2); \
409      tmp5 = _mm_madd_epi16(hi_1, cst2); \
410      tmp6 = _mm_madd_epi16(lo_1, cst3); \
411      tmp7 = _mm_madd_epi16(hi_1, cst3); \
412      \
413      tmp0 = _mm_add_epi32(tmp0, rounding); \
414      tmp1 = _mm_add_epi32(tmp1, rounding); \
415      tmp2 = _mm_add_epi32(tmp2, rounding); \
416      tmp3 = _mm_add_epi32(tmp3, rounding); \
417      tmp4 = _mm_add_epi32(tmp4, rounding); \
418      tmp5 = _mm_add_epi32(tmp5, rounding); \
419      tmp6 = _mm_add_epi32(tmp6, rounding); \
420      tmp7 = _mm_add_epi32(tmp7, rounding); \
421      \
422      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
423      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
424      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
425      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
426      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
427      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
428      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
429      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
430      \
431      res0 = _mm_packs_epi32(tmp0, tmp1); \
432      res1 = _mm_packs_epi32(tmp2, tmp3); \
433      res2 = _mm_packs_epi32(tmp4, tmp5); \
434      res3 = _mm_packs_epi32(tmp6, tmp7); \
435  }
436
437#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
438  {   \
439      tmp0 = _mm_madd_epi16(lo_0, cst0); \
440      tmp1 = _mm_madd_epi16(hi_0, cst0); \
441      tmp2 = _mm_madd_epi16(lo_0, cst1); \
442      tmp3 = _mm_madd_epi16(hi_0, cst1); \
443      \
444      tmp0 = _mm_add_epi32(tmp0, rounding); \
445      tmp1 = _mm_add_epi32(tmp1, rounding); \
446      tmp2 = _mm_add_epi32(tmp2, rounding); \
447      tmp3 = _mm_add_epi32(tmp3, rounding); \
448      \
449      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
450      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
451      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
452      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
453      \
454      res0 = _mm_packs_epi32(tmp0, tmp1); \
455      res1 = _mm_packs_epi32(tmp2, tmp3); \
456  }
457
458#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
459                 out0, out1, out2, out3, out4, out5, out6, out7)  \
460  { \
461  /* Stage1 */      \
462  { \
463    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
464    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
465    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
466    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
467    \
468    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
469                          stg1_1, stg1_2, stg1_3, stp1_4,      \
470                          stp1_7, stp1_5, stp1_6)              \
471  } \
472    \
473  /* Stage2 */ \
474  { \
475    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
476    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
477    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
478    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
479    \
480    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
481                           stg2_1, stg2_2, stg2_3, stp2_0,     \
482                           stp2_1, stp2_2, stp2_3)             \
483    \
484    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
485    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
486    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
487    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
488  } \
489    \
490  /* Stage3 */ \
491  { \
492    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
493    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
494    \
495    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
496    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
497    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
498    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
499    \
500    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
501    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
502    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
503    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
504    \
505    tmp0 = _mm_add_epi32(tmp0, rounding); \
506    tmp1 = _mm_add_epi32(tmp1, rounding); \
507    tmp2 = _mm_add_epi32(tmp2, rounding); \
508    tmp3 = _mm_add_epi32(tmp3, rounding); \
509    \
510    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
511    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
512    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
513    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
514    \
515    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
516    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
517  } \
518  \
519  /* Stage4  */ \
520  out0 = _mm_adds_epi16(stp1_0, stp2_7); \
521  out1 = _mm_adds_epi16(stp1_1, stp1_6); \
522  out2 = _mm_adds_epi16(stp1_2, stp1_5); \
523  out3 = _mm_adds_epi16(stp1_3, stp2_4); \
524  out4 = _mm_subs_epi16(stp1_3, stp2_4); \
525  out5 = _mm_subs_epi16(stp1_2, stp1_5); \
526  out6 = _mm_subs_epi16(stp1_1, stp1_6); \
527  out7 = _mm_subs_epi16(stp1_0, stp2_7); \
528  }
529
530#define RECON_AND_STORE(dest, in_x) \
531  {                                                     \
532     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
533      d0 = _mm_unpacklo_epi8(d0, zero); \
534      d0 = _mm_add_epi16(in_x, d0); \
535      d0 = _mm_packus_epi16(d0, d0); \
536      _mm_storel_epi64((__m128i *)(dest), d0); \
537      dest += stride; \
538  }
539
540void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
541  const __m128i zero = _mm_setzero_si128();
542  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
543  const __m128i final_rounding = _mm_set1_epi16(1<<4);
544  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
545  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
546  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
547  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
548  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
549  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
550  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
551  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
552
553  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
554  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
555  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
556  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
557  int i;
558
559  // Load input data.
560  in0 = _mm_load_si128((const __m128i *)input);
561  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
562  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
563  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
564  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
565  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
566  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
567  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
568
569  // 2-D
570  for (i = 0; i < 2; i++) {
571    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
572    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
573                  in0, in1, in2, in3, in4, in5, in6, in7);
574
575    // 4-stage 1D idct8x8
576    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
577             in0, in1, in2, in3, in4, in5, in6, in7);
578  }
579
580  // Final rounding and shift
581  in0 = _mm_adds_epi16(in0, final_rounding);
582  in1 = _mm_adds_epi16(in1, final_rounding);
583  in2 = _mm_adds_epi16(in2, final_rounding);
584  in3 = _mm_adds_epi16(in3, final_rounding);
585  in4 = _mm_adds_epi16(in4, final_rounding);
586  in5 = _mm_adds_epi16(in5, final_rounding);
587  in6 = _mm_adds_epi16(in6, final_rounding);
588  in7 = _mm_adds_epi16(in7, final_rounding);
589
590  in0 = _mm_srai_epi16(in0, 5);
591  in1 = _mm_srai_epi16(in1, 5);
592  in2 = _mm_srai_epi16(in2, 5);
593  in3 = _mm_srai_epi16(in3, 5);
594  in4 = _mm_srai_epi16(in4, 5);
595  in5 = _mm_srai_epi16(in5, 5);
596  in6 = _mm_srai_epi16(in6, 5);
597  in7 = _mm_srai_epi16(in7, 5);
598
599  RECON_AND_STORE(dest, in0);
600  RECON_AND_STORE(dest, in1);
601  RECON_AND_STORE(dest, in2);
602  RECON_AND_STORE(dest, in3);
603  RECON_AND_STORE(dest, in4);
604  RECON_AND_STORE(dest, in5);
605  RECON_AND_STORE(dest, in6);
606  RECON_AND_STORE(dest, in7);
607}
608
609void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
610  __m128i dc_value;
611  const __m128i zero = _mm_setzero_si128();
612  int a;
613
614  a = dct_const_round_shift(input[0] * cospi_16_64);
615  a = dct_const_round_shift(a * cospi_16_64);
616  a = ROUND_POWER_OF_TWO(a, 5);
617
618  dc_value = _mm_set1_epi16(a);
619
620  RECON_AND_STORE(dest, dc_value);
621  RECON_AND_STORE(dest, dc_value);
622  RECON_AND_STORE(dest, dc_value);
623  RECON_AND_STORE(dest, dc_value);
624  RECON_AND_STORE(dest, dc_value);
625  RECON_AND_STORE(dest, dc_value);
626  RECON_AND_STORE(dest, dc_value);
627  RECON_AND_STORE(dest, dc_value);
628}
629
630// perform 8x8 transpose
631static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
632  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
633  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
634  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
635  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
636  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
637  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
638  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
639  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
640
641  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
642  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
643  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
644  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
645  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
646  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
647  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
648  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
649
650  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
651  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
652  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
653  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
654  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
655  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
656  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
657  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
658}
659
660static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
661  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
662  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
663  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
664  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
665
666  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
667  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
668  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
669  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
670
671  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
672  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
673  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
674  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
675}
676
677static void idct8_sse2(__m128i *in) {
678  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
679  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
680  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
681  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
682  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
683  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
684  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
685  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
686  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
687
688  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
689  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
690  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
691  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
692
693  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
694  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
695                in0, in1, in2, in3, in4, in5, in6, in7);
696
697  // 4-stage 1D idct8x8
698  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
699           in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
700}
701
702static void iadst8_sse2(__m128i *in) {
703  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
704  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
705  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
706  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
707  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
708  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
709  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
710  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
711  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
712  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
713  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
714  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
715  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
716  const __m128i k__const_0 = _mm_set1_epi16(0);
717  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
718
719  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
720  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
721  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
722  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
723  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
724
725  // transpose
726  array_transpose_8x8(in, in);
727
728  // properly aligned for butterfly input
729  in0  = in[7];
730  in1  = in[0];
731  in2  = in[5];
732  in3  = in[2];
733  in4  = in[3];
734  in5  = in[4];
735  in6  = in[1];
736  in7  = in[6];
737
738  // column transformation
739  // stage 1
740  // interleave and multiply/add into 32-bit integer
741  s0 = _mm_unpacklo_epi16(in0, in1);
742  s1 = _mm_unpackhi_epi16(in0, in1);
743  s2 = _mm_unpacklo_epi16(in2, in3);
744  s3 = _mm_unpackhi_epi16(in2, in3);
745  s4 = _mm_unpacklo_epi16(in4, in5);
746  s5 = _mm_unpackhi_epi16(in4, in5);
747  s6 = _mm_unpacklo_epi16(in6, in7);
748  s7 = _mm_unpackhi_epi16(in6, in7);
749
750  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
751  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
752  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
753  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
754  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
755  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
756  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
757  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
758  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
759  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
760  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
761  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
762  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
763  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
764  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
765  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
766
767  // addition
768  w0 = _mm_add_epi32(u0, u8);
769  w1 = _mm_add_epi32(u1, u9);
770  w2 = _mm_add_epi32(u2, u10);
771  w3 = _mm_add_epi32(u3, u11);
772  w4 = _mm_add_epi32(u4, u12);
773  w5 = _mm_add_epi32(u5, u13);
774  w6 = _mm_add_epi32(u6, u14);
775  w7 = _mm_add_epi32(u7, u15);
776  w8 = _mm_sub_epi32(u0, u8);
777  w9 = _mm_sub_epi32(u1, u9);
778  w10 = _mm_sub_epi32(u2, u10);
779  w11 = _mm_sub_epi32(u3, u11);
780  w12 = _mm_sub_epi32(u4, u12);
781  w13 = _mm_sub_epi32(u5, u13);
782  w14 = _mm_sub_epi32(u6, u14);
783  w15 = _mm_sub_epi32(u7, u15);
784
785  // shift and rounding
786  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
787  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
788  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
789  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
790  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
791  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
792  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
793  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
794  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
795  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
796  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
797  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
798  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
799  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
800  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
801  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
802
803  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
804  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
805  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
806  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
807  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
808  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
809  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
810  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
811  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
812  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
813  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
814  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
815  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
816  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
817  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
818  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
819
820  // back to 16-bit and pack 8 integers into __m128i
821  in[0] = _mm_packs_epi32(u0, u1);
822  in[1] = _mm_packs_epi32(u2, u3);
823  in[2] = _mm_packs_epi32(u4, u5);
824  in[3] = _mm_packs_epi32(u6, u7);
825  in[4] = _mm_packs_epi32(u8, u9);
826  in[5] = _mm_packs_epi32(u10, u11);
827  in[6] = _mm_packs_epi32(u12, u13);
828  in[7] = _mm_packs_epi32(u14, u15);
829
830  // stage 2
831  s0 = _mm_add_epi16(in[0], in[2]);
832  s1 = _mm_add_epi16(in[1], in[3]);
833  s2 = _mm_sub_epi16(in[0], in[2]);
834  s3 = _mm_sub_epi16(in[1], in[3]);
835  u0 = _mm_unpacklo_epi16(in[4], in[5]);
836  u1 = _mm_unpackhi_epi16(in[4], in[5]);
837  u2 = _mm_unpacklo_epi16(in[6], in[7]);
838  u3 = _mm_unpackhi_epi16(in[6], in[7]);
839
840  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
841  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
842  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
843  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
844  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
845  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
846  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
847  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
848
849  w0 = _mm_add_epi32(v0, v4);
850  w1 = _mm_add_epi32(v1, v5);
851  w2 = _mm_add_epi32(v2, v6);
852  w3 = _mm_add_epi32(v3, v7);
853  w4 = _mm_sub_epi32(v0, v4);
854  w5 = _mm_sub_epi32(v1, v5);
855  w6 = _mm_sub_epi32(v2, v6);
856  w7 = _mm_sub_epi32(v3, v7);
857
858  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
859  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
860  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
861  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
862  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
863  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
864  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
865  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
866
867  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
868  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
869  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
870  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
871  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
872  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
873  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
874  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
875
876  // back to 16-bit intergers
877  s4 = _mm_packs_epi32(u0, u1);
878  s5 = _mm_packs_epi32(u2, u3);
879  s6 = _mm_packs_epi32(u4, u5);
880  s7 = _mm_packs_epi32(u6, u7);
881
882  // stage 3
883  u0 = _mm_unpacklo_epi16(s2, s3);
884  u1 = _mm_unpackhi_epi16(s2, s3);
885  u2 = _mm_unpacklo_epi16(s6, s7);
886  u3 = _mm_unpackhi_epi16(s6, s7);
887
888  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
889  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
890  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
891  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
892  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
893  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
894  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
895  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
896
897  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
898  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
899  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
900  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
901  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
902  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
903  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
904  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
905
906  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
907  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
908  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
909  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
910  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
911  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
912  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
913  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
914
915  s2 = _mm_packs_epi32(v0, v1);
916  s3 = _mm_packs_epi32(v2, v3);
917  s6 = _mm_packs_epi32(v4, v5);
918  s7 = _mm_packs_epi32(v6, v7);
919
920  in[0] = s0;
921  in[1] = _mm_sub_epi16(k__const_0, s4);
922  in[2] = s6;
923  in[3] = _mm_sub_epi16(k__const_0, s2);
924  in[4] = s3;
925  in[5] = _mm_sub_epi16(k__const_0, s7);
926  in[6] = s5;
927  in[7] = _mm_sub_epi16(k__const_0, s1);
928}
929
930
931void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
932                            int tx_type) {
933  __m128i in[8];
934  const __m128i zero = _mm_setzero_si128();
935  const __m128i final_rounding = _mm_set1_epi16(1<<4);
936
937  // load input data
938  in[0] = _mm_load_si128((const __m128i *)input);
939  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
940  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
941  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
942  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
943  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
944  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
945  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
946
947  switch (tx_type) {
948    case 0:  // DCT_DCT
949      idct8_sse2(in);
950      idct8_sse2(in);
951      break;
952    case 1:  // ADST_DCT
953      idct8_sse2(in);
954      iadst8_sse2(in);
955      break;
956    case 2:  // DCT_ADST
957      iadst8_sse2(in);
958      idct8_sse2(in);
959      break;
960    case 3:  // ADST_ADST
961      iadst8_sse2(in);
962      iadst8_sse2(in);
963      break;
964    default:
965      assert(0);
966      break;
967  }
968
969  // Final rounding and shift
970  in[0] = _mm_adds_epi16(in[0], final_rounding);
971  in[1] = _mm_adds_epi16(in[1], final_rounding);
972  in[2] = _mm_adds_epi16(in[2], final_rounding);
973  in[3] = _mm_adds_epi16(in[3], final_rounding);
974  in[4] = _mm_adds_epi16(in[4], final_rounding);
975  in[5] = _mm_adds_epi16(in[5], final_rounding);
976  in[6] = _mm_adds_epi16(in[6], final_rounding);
977  in[7] = _mm_adds_epi16(in[7], final_rounding);
978
979  in[0] = _mm_srai_epi16(in[0], 5);
980  in[1] = _mm_srai_epi16(in[1], 5);
981  in[2] = _mm_srai_epi16(in[2], 5);
982  in[3] = _mm_srai_epi16(in[3], 5);
983  in[4] = _mm_srai_epi16(in[4], 5);
984  in[5] = _mm_srai_epi16(in[5], 5);
985  in[6] = _mm_srai_epi16(in[6], 5);
986  in[7] = _mm_srai_epi16(in[7], 5);
987
988  RECON_AND_STORE(dest, in[0]);
989  RECON_AND_STORE(dest, in[1]);
990  RECON_AND_STORE(dest, in[2]);
991  RECON_AND_STORE(dest, in[3]);
992  RECON_AND_STORE(dest, in[4]);
993  RECON_AND_STORE(dest, in[5]);
994  RECON_AND_STORE(dest, in[6]);
995  RECON_AND_STORE(dest, in[7]);
996}
997
998void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
999  const __m128i zero = _mm_setzero_si128();
1000  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1001  const __m128i final_rounding = _mm_set1_epi16(1<<4);
1002  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1003  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1004  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
1005  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
1006  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1007  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1008  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1009  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1010  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1011
1012  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
1013  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
1014  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
1015  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1016
1017  // Rows. Load 4-row input data.
1018  in0 = _mm_load_si128((const __m128i *)input);
1019  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
1020  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
1021  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
1022
1023  // 8x4 Transpose
1024  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
1025  // Stage1
1026  { //NOLINT
1027    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
1028    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
1029
1030    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
1031    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
1032    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
1033    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
1034
1035    tmp0 = _mm_add_epi32(tmp0, rounding);
1036    tmp2 = _mm_add_epi32(tmp2, rounding);
1037    tmp4 = _mm_add_epi32(tmp4, rounding);
1038    tmp6 = _mm_add_epi32(tmp6, rounding);
1039    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1040    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1041    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
1042    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
1043
1044    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
1045    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
1046  }
1047
1048  // Stage2
1049  { //NOLINT
1050    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
1051    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
1052
1053    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
1054    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
1055    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
1056    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
1057
1058    tmp0 = _mm_add_epi32(tmp0, rounding);
1059    tmp2 = _mm_add_epi32(tmp2, rounding);
1060    tmp4 = _mm_add_epi32(tmp4, rounding);
1061    tmp6 = _mm_add_epi32(tmp6, rounding);
1062    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1063    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1064    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
1065    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
1066
1067    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
1068    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
1069
1070    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
1071    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
1072
1073    stp2_4 = tmp0;
1074    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
1075    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
1076  }
1077
1078  // Stage3
1079  { //NOLINT
1080    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
1081
1082    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
1083    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
1084
1085    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
1086    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
1087
1088    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
1089    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
1090
1091    tmp0 = _mm_add_epi32(tmp0, rounding);
1092    tmp2 = _mm_add_epi32(tmp2, rounding);
1093    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1094    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1095
1096    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
1097  }
1098
1099  // Stage4
1100  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
1101  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
1102  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
1103  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
1104
1105  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
1106
1107  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
1108           in0, in1, in2, in3, in4, in5, in6, in7);
1109  // Final rounding and shift
1110  in0 = _mm_adds_epi16(in0, final_rounding);
1111  in1 = _mm_adds_epi16(in1, final_rounding);
1112  in2 = _mm_adds_epi16(in2, final_rounding);
1113  in3 = _mm_adds_epi16(in3, final_rounding);
1114  in4 = _mm_adds_epi16(in4, final_rounding);
1115  in5 = _mm_adds_epi16(in5, final_rounding);
1116  in6 = _mm_adds_epi16(in6, final_rounding);
1117  in7 = _mm_adds_epi16(in7, final_rounding);
1118
1119  in0 = _mm_srai_epi16(in0, 5);
1120  in1 = _mm_srai_epi16(in1, 5);
1121  in2 = _mm_srai_epi16(in2, 5);
1122  in3 = _mm_srai_epi16(in3, 5);
1123  in4 = _mm_srai_epi16(in4, 5);
1124  in5 = _mm_srai_epi16(in5, 5);
1125  in6 = _mm_srai_epi16(in6, 5);
1126  in7 = _mm_srai_epi16(in7, 5);
1127
1128  RECON_AND_STORE(dest, in0);
1129  RECON_AND_STORE(dest, in1);
1130  RECON_AND_STORE(dest, in2);
1131  RECON_AND_STORE(dest, in3);
1132  RECON_AND_STORE(dest, in4);
1133  RECON_AND_STORE(dest, in5);
1134  RECON_AND_STORE(dest, in6);
1135  RECON_AND_STORE(dest, in7);
1136}
1137
1138#define IDCT16 \
1139  /* Stage2 */ \
1140  { \
1141    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
1142    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
1143    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
1144    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
1145    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
1146    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
1147    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
1148    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
1149    \
1150    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
1151                           stg2_0, stg2_1, stg2_2, stg2_3, \
1152                           stp2_8, stp2_15, stp2_9, stp2_14) \
1153    \
1154    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
1155                           stg2_4, stg2_5, stg2_6, stg2_7, \
1156                           stp2_10, stp2_13, stp2_11, stp2_12) \
1157  } \
1158    \
1159  /* Stage3 */ \
1160  { \
1161    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
1162    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
1163    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
1164    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
1165    \
1166    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
1167                           stg3_0, stg3_1, stg3_2, stg3_3, \
1168                           stp1_4, stp1_7, stp1_5, stp1_6) \
1169    \
1170    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
1171    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
1172    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
1173    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
1174    \
1175    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
1176    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
1177    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
1178    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
1179  } \
1180  \
1181  /* Stage4 */ \
1182  { \
1183    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
1184    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
1185    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
1186    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
1187    \
1188    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
1189    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
1190    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1191    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1192    \
1193    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
1194                           stg4_0, stg4_1, stg4_2, stg4_3, \
1195                           stp2_0, stp2_1, stp2_2, stp2_3) \
1196    \
1197    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
1198    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
1199    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
1200    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
1201    \
1202    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
1203                           stg4_4, stg4_5, stg4_6, stg4_7, \
1204                           stp2_9, stp2_14, stp2_10, stp2_13) \
1205  } \
1206    \
1207  /* Stage5 */ \
1208  { \
1209    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1210    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1211    \
1212    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
1213    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
1214    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
1215    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
1216    \
1217    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1218    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1219    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1220    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
1221    \
1222    tmp0 = _mm_add_epi32(tmp0, rounding); \
1223    tmp1 = _mm_add_epi32(tmp1, rounding); \
1224    tmp2 = _mm_add_epi32(tmp2, rounding); \
1225    tmp3 = _mm_add_epi32(tmp3, rounding); \
1226    \
1227    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1228    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1229    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1230    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1231    \
1232    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1233    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1234    \
1235    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
1236    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
1237    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
1238    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1239    \
1240    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1241    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
1242    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
1243    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
1244  } \
1245    \
1246  /* Stage6 */ \
1247  { \
1248    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1249    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1250    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1251    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1252    \
1253    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1254    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1255    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1256    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1257    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1258    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1259    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1260    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1261    \
1262    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1263                           stg6_0, stg4_0, stg6_0, stg4_0, \
1264                           stp2_10, stp2_13, stp2_11, stp2_12) \
1265  }
1266
1267#define IDCT16_10 \
1268    /* Stage2 */ \
1269    { \
1270      const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
1271      const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
1272      const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
1273      const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
1274      \
1275      MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
1276                             stg2_0, stg2_1, stg2_6, stg2_7, \
1277                             stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
1278    } \
1279      \
1280    /* Stage3 */ \
1281    { \
1282      const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
1283      const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
1284      \
1285      MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
1286                               stg3_0, stg3_1,  \
1287                               stp2_4, stp2_7) \
1288      \
1289      stp1_9  =  stp1_8_0; \
1290      stp1_10 =  stp1_11;  \
1291      \
1292      stp1_13 = stp1_12_0; \
1293      stp1_14 = stp1_15;   \
1294    } \
1295    \
1296    /* Stage4 */ \
1297    { \
1298      const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
1299      const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
1300      \
1301      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
1302      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
1303      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1304      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1305      \
1306      MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
1307                               stg4_0, stg4_1, \
1308                               stp1_0, stp1_1) \
1309      stp2_5 = stp2_4; \
1310      stp2_6 = stp2_7; \
1311      \
1312      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
1313                             stg4_4, stg4_5, stg4_6, stg4_7, \
1314                             stp2_9, stp2_14, stp2_10, stp2_13) \
1315    } \
1316      \
1317    /* Stage5 */ \
1318    { \
1319      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1320      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1321      \
1322      stp1_2 = stp1_1; \
1323      stp1_3 = stp1_0; \
1324      \
1325      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1326      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1327      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1328      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
1329      \
1330      tmp0 = _mm_add_epi32(tmp0, rounding); \
1331      tmp1 = _mm_add_epi32(tmp1, rounding); \
1332      tmp2 = _mm_add_epi32(tmp2, rounding); \
1333      tmp3 = _mm_add_epi32(tmp3, rounding); \
1334      \
1335      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1336      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1337      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1338      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1339      \
1340      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1341      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1342      \
1343      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
1344      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
1345      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
1346      stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1347      \
1348      stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1349      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
1350      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
1351      stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
1352    } \
1353      \
1354    /* Stage6 */ \
1355    { \
1356      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1357      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1358      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1359      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1360      \
1361      stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1362      stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1363      stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1364      stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1365      stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1366      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1367      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1368      stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1369      \
1370      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1371                             stg6_0, stg4_0, stg6_0, stg4_0, \
1372                             stp2_10, stp2_13, stp2_11, stp2_12) \
1373    }
1374
1375void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
1376                                int stride) {
1377  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1378  const __m128i final_rounding = _mm_set1_epi16(1<<5);
1379  const __m128i zero = _mm_setzero_si128();
1380
1381  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1382  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1383  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1384  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1385  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1386  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1387  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1388  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1389
1390  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1391  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1392  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1393  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
1394
1395  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1396  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1397  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1398  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1399  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1400  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1401  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1402  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1403
1404  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1405
1406  __m128i in[16], l[16], r[16], *curr1;
1407  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1408          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1409          stp1_8_0, stp1_12_0;
1410  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1411          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1412  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1413  int i;
1414
1415  curr1 = l;
1416  for (i = 0; i < 2; i++) {
1417      // 1-D idct
1418
1419      // Load input data.
1420      in[0] = _mm_load_si128((const __m128i *)input);
1421      in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
1422      in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
1423      in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
1424      in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
1425      in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
1426      in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
1427      in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
1428      in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
1429      in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
1430      in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
1431      in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
1432      in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
1433      in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
1434      in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
1435      in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
1436
1437      array_transpose_8x8(in, in);
1438      array_transpose_8x8(in+8, in+8);
1439
1440      IDCT16
1441
1442      // Stage7
1443      curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
1444      curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
1445      curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
1446      curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
1447      curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
1448      curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
1449      curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
1450      curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
1451      curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
1452      curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
1453      curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
1454      curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
1455      curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
1456      curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
1457      curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
1458      curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
1459
1460      curr1 = r;
1461      input += 128;
1462  }
1463  for (i = 0; i < 2; i++) {
1464      // 1-D idct
1465      array_transpose_8x8(l+i*8, in);
1466      array_transpose_8x8(r+i*8, in+8);
1467
1468      IDCT16
1469
1470      // 2-D
1471      in[0] = _mm_add_epi16(stp2_0, stp1_15);
1472      in[1] = _mm_add_epi16(stp2_1, stp1_14);
1473      in[2] = _mm_add_epi16(stp2_2, stp2_13);
1474      in[3] = _mm_add_epi16(stp2_3, stp2_12);
1475      in[4] = _mm_add_epi16(stp2_4, stp2_11);
1476      in[5] = _mm_add_epi16(stp2_5, stp2_10);
1477      in[6] = _mm_add_epi16(stp2_6, stp1_9);
1478      in[7] = _mm_add_epi16(stp2_7, stp1_8);
1479      in[8] = _mm_sub_epi16(stp2_7, stp1_8);
1480      in[9] = _mm_sub_epi16(stp2_6, stp1_9);
1481      in[10] = _mm_sub_epi16(stp2_5, stp2_10);
1482      in[11] = _mm_sub_epi16(stp2_4, stp2_11);
1483      in[12] = _mm_sub_epi16(stp2_3, stp2_12);
1484      in[13] = _mm_sub_epi16(stp2_2, stp2_13);
1485      in[14] = _mm_sub_epi16(stp2_1, stp1_14);
1486      in[15] = _mm_sub_epi16(stp2_0, stp1_15);
1487
1488      // Final rounding and shift
1489      in[0] = _mm_adds_epi16(in[0], final_rounding);
1490      in[1] = _mm_adds_epi16(in[1], final_rounding);
1491      in[2] = _mm_adds_epi16(in[2], final_rounding);
1492      in[3] = _mm_adds_epi16(in[3], final_rounding);
1493      in[4] = _mm_adds_epi16(in[4], final_rounding);
1494      in[5] = _mm_adds_epi16(in[5], final_rounding);
1495      in[6] = _mm_adds_epi16(in[6], final_rounding);
1496      in[7] = _mm_adds_epi16(in[7], final_rounding);
1497      in[8] = _mm_adds_epi16(in[8], final_rounding);
1498      in[9] = _mm_adds_epi16(in[9], final_rounding);
1499      in[10] = _mm_adds_epi16(in[10], final_rounding);
1500      in[11] = _mm_adds_epi16(in[11], final_rounding);
1501      in[12] = _mm_adds_epi16(in[12], final_rounding);
1502      in[13] = _mm_adds_epi16(in[13], final_rounding);
1503      in[14] = _mm_adds_epi16(in[14], final_rounding);
1504      in[15] = _mm_adds_epi16(in[15], final_rounding);
1505
1506      in[0] = _mm_srai_epi16(in[0], 6);
1507      in[1] = _mm_srai_epi16(in[1], 6);
1508      in[2] = _mm_srai_epi16(in[2], 6);
1509      in[3] = _mm_srai_epi16(in[3], 6);
1510      in[4] = _mm_srai_epi16(in[4], 6);
1511      in[5] = _mm_srai_epi16(in[5], 6);
1512      in[6] = _mm_srai_epi16(in[6], 6);
1513      in[7] = _mm_srai_epi16(in[7], 6);
1514      in[8] = _mm_srai_epi16(in[8], 6);
1515      in[9] = _mm_srai_epi16(in[9], 6);
1516      in[10] = _mm_srai_epi16(in[10], 6);
1517      in[11] = _mm_srai_epi16(in[11], 6);
1518      in[12] = _mm_srai_epi16(in[12], 6);
1519      in[13] = _mm_srai_epi16(in[13], 6);
1520      in[14] = _mm_srai_epi16(in[14], 6);
1521      in[15] = _mm_srai_epi16(in[15], 6);
1522
1523      RECON_AND_STORE(dest, in[0]);
1524      RECON_AND_STORE(dest, in[1]);
1525      RECON_AND_STORE(dest, in[2]);
1526      RECON_AND_STORE(dest, in[3]);
1527      RECON_AND_STORE(dest, in[4]);
1528      RECON_AND_STORE(dest, in[5]);
1529      RECON_AND_STORE(dest, in[6]);
1530      RECON_AND_STORE(dest, in[7]);
1531      RECON_AND_STORE(dest, in[8]);
1532      RECON_AND_STORE(dest, in[9]);
1533      RECON_AND_STORE(dest, in[10]);
1534      RECON_AND_STORE(dest, in[11]);
1535      RECON_AND_STORE(dest, in[12]);
1536      RECON_AND_STORE(dest, in[13]);
1537      RECON_AND_STORE(dest, in[14]);
1538      RECON_AND_STORE(dest, in[15]);
1539
1540      dest += 8 - (stride * 16);
1541  }
1542}
1543
1544void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
1545  __m128i dc_value;
1546  const __m128i zero = _mm_setzero_si128();
1547  int a, i;
1548
1549  a = dct_const_round_shift(input[0] * cospi_16_64);
1550  a = dct_const_round_shift(a * cospi_16_64);
1551  a = ROUND_POWER_OF_TWO(a, 6);
1552
1553  dc_value = _mm_set1_epi16(a);
1554
1555  for (i = 0; i < 2; ++i) {
1556    RECON_AND_STORE(dest, dc_value);
1557    RECON_AND_STORE(dest, dc_value);
1558    RECON_AND_STORE(dest, dc_value);
1559    RECON_AND_STORE(dest, dc_value);
1560    RECON_AND_STORE(dest, dc_value);
1561    RECON_AND_STORE(dest, dc_value);
1562    RECON_AND_STORE(dest, dc_value);
1563    RECON_AND_STORE(dest, dc_value);
1564    RECON_AND_STORE(dest, dc_value);
1565    RECON_AND_STORE(dest, dc_value);
1566    RECON_AND_STORE(dest, dc_value);
1567    RECON_AND_STORE(dest, dc_value);
1568    RECON_AND_STORE(dest, dc_value);
1569    RECON_AND_STORE(dest, dc_value);
1570    RECON_AND_STORE(dest, dc_value);
1571    RECON_AND_STORE(dest, dc_value);
1572    dest += 8 - (stride * 16);
1573  }
1574}
1575
1576static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
1577  __m128i tbuf[8];
1578  array_transpose_8x8(res0, res0);
1579  array_transpose_8x8(res1, tbuf);
1580  array_transpose_8x8(res0 + 8, res1);
1581  array_transpose_8x8(res1 + 8, res1 + 8);
1582
1583  res0[8] = tbuf[0];
1584  res0[9] = tbuf[1];
1585  res0[10] = tbuf[2];
1586  res0[11] = tbuf[3];
1587  res0[12] = tbuf[4];
1588  res0[13] = tbuf[5];
1589  res0[14] = tbuf[6];
1590  res0[15] = tbuf[7];
1591}
1592
1593static void iadst16_8col(__m128i *in) {
1594  // perform 16x16 1-D ADST for 8 columns
1595  __m128i s[16], x[16], u[32], v[32];
1596  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1597  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1598  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1599  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1600  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1601  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1602  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1603  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1604  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1605  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1606  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1607  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1608  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1609  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1610  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1611  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1612  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1613  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1614  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1615  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1616  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1617  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1618  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1619  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1620  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1621  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
1622  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
1623  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1624  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1625  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1626  const __m128i kZero = _mm_set1_epi16(0);
1627
1628  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1629  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1630  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1631  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1632  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1633  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1634  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1635  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1636  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1637  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1638  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1639  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1640  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1641  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1642  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1643  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1644
1645  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1646  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1647  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1648  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1649  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1650  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1651  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1652  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1653  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1654  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1655  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1656  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1657  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1658  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1659  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1660  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1661  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1662  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1663  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1664  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1665  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1666  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1667  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1668  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1669  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1670  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1671  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1672  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1673  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1674  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1675  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1676  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1677
1678  u[0] = _mm_add_epi32(v[0], v[16]);
1679  u[1] = _mm_add_epi32(v[1], v[17]);
1680  u[2] = _mm_add_epi32(v[2], v[18]);
1681  u[3] = _mm_add_epi32(v[3], v[19]);
1682  u[4] = _mm_add_epi32(v[4], v[20]);
1683  u[5] = _mm_add_epi32(v[5], v[21]);
1684  u[6] = _mm_add_epi32(v[6], v[22]);
1685  u[7] = _mm_add_epi32(v[7], v[23]);
1686  u[8] = _mm_add_epi32(v[8], v[24]);
1687  u[9] = _mm_add_epi32(v[9], v[25]);
1688  u[10] = _mm_add_epi32(v[10], v[26]);
1689  u[11] = _mm_add_epi32(v[11], v[27]);
1690  u[12] = _mm_add_epi32(v[12], v[28]);
1691  u[13] = _mm_add_epi32(v[13], v[29]);
1692  u[14] = _mm_add_epi32(v[14], v[30]);
1693  u[15] = _mm_add_epi32(v[15], v[31]);
1694  u[16] = _mm_sub_epi32(v[0], v[16]);
1695  u[17] = _mm_sub_epi32(v[1], v[17]);
1696  u[18] = _mm_sub_epi32(v[2], v[18]);
1697  u[19] = _mm_sub_epi32(v[3], v[19]);
1698  u[20] = _mm_sub_epi32(v[4], v[20]);
1699  u[21] = _mm_sub_epi32(v[5], v[21]);
1700  u[22] = _mm_sub_epi32(v[6], v[22]);
1701  u[23] = _mm_sub_epi32(v[7], v[23]);
1702  u[24] = _mm_sub_epi32(v[8], v[24]);
1703  u[25] = _mm_sub_epi32(v[9], v[25]);
1704  u[26] = _mm_sub_epi32(v[10], v[26]);
1705  u[27] = _mm_sub_epi32(v[11], v[27]);
1706  u[28] = _mm_sub_epi32(v[12], v[28]);
1707  u[29] = _mm_sub_epi32(v[13], v[29]);
1708  u[30] = _mm_sub_epi32(v[14], v[30]);
1709  u[31] = _mm_sub_epi32(v[15], v[31]);
1710
1711  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1712  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1713  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1714  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1715  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1716  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1717  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1718  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1719  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1720  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1721  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1722  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1723  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1724  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1725  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1726  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1727  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1728  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1729  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1730  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1731  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1732  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1733  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1734  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1735  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1736  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1737  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1738  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1739  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1740  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1741  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1742  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1743
1744  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1745  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1746  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1747  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1748  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1749  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1750  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1751  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1752  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1753  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1754  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1755  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1756  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1757  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1758  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1759  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1760  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1761  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1762  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1763  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1764  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1765  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1766  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1767  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1768  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1769  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1770  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1771  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1772  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1773  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1774  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1775  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1776
1777  s[0] = _mm_packs_epi32(u[0], u[1]);
1778  s[1] = _mm_packs_epi32(u[2], u[3]);
1779  s[2] = _mm_packs_epi32(u[4], u[5]);
1780  s[3] = _mm_packs_epi32(u[6], u[7]);
1781  s[4] = _mm_packs_epi32(u[8], u[9]);
1782  s[5] = _mm_packs_epi32(u[10], u[11]);
1783  s[6] = _mm_packs_epi32(u[12], u[13]);
1784  s[7] = _mm_packs_epi32(u[14], u[15]);
1785  s[8] = _mm_packs_epi32(u[16], u[17]);
1786  s[9] = _mm_packs_epi32(u[18], u[19]);
1787  s[10] = _mm_packs_epi32(u[20], u[21]);
1788  s[11] = _mm_packs_epi32(u[22], u[23]);
1789  s[12] = _mm_packs_epi32(u[24], u[25]);
1790  s[13] = _mm_packs_epi32(u[26], u[27]);
1791  s[14] = _mm_packs_epi32(u[28], u[29]);
1792  s[15] = _mm_packs_epi32(u[30], u[31]);
1793
1794  // stage 2
1795  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1796  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1797  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1798  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1799  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1800  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1801  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1802  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1803
1804  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1805  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1806  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1807  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1808  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1809  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1810  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1811  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1812  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1813  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1814  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1815  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1816  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1817  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1818  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1819  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1820
1821  u[0] = _mm_add_epi32(v[0], v[8]);
1822  u[1] = _mm_add_epi32(v[1], v[9]);
1823  u[2] = _mm_add_epi32(v[2], v[10]);
1824  u[3] = _mm_add_epi32(v[3], v[11]);
1825  u[4] = _mm_add_epi32(v[4], v[12]);
1826  u[5] = _mm_add_epi32(v[5], v[13]);
1827  u[6] = _mm_add_epi32(v[6], v[14]);
1828  u[7] = _mm_add_epi32(v[7], v[15]);
1829  u[8] = _mm_sub_epi32(v[0], v[8]);
1830  u[9] = _mm_sub_epi32(v[1], v[9]);
1831  u[10] = _mm_sub_epi32(v[2], v[10]);
1832  u[11] = _mm_sub_epi32(v[3], v[11]);
1833  u[12] = _mm_sub_epi32(v[4], v[12]);
1834  u[13] = _mm_sub_epi32(v[5], v[13]);
1835  u[14] = _mm_sub_epi32(v[6], v[14]);
1836  u[15] = _mm_sub_epi32(v[7], v[15]);
1837
1838  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1839  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1840  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1841  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1842  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1843  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1844  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1845  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1846  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1847  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1848  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1849  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1850  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1851  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1852  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1853  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1854
1855  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1856  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1857  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1858  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1859  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1860  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1861  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1862  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1863  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1864  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1865  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1866  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1867  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1868  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1869  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1870  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1871
1872  x[0] = _mm_add_epi16(s[0], s[4]);
1873  x[1] = _mm_add_epi16(s[1], s[5]);
1874  x[2] = _mm_add_epi16(s[2], s[6]);
1875  x[3] = _mm_add_epi16(s[3], s[7]);
1876  x[4] = _mm_sub_epi16(s[0], s[4]);
1877  x[5] = _mm_sub_epi16(s[1], s[5]);
1878  x[6] = _mm_sub_epi16(s[2], s[6]);
1879  x[7] = _mm_sub_epi16(s[3], s[7]);
1880  x[8] = _mm_packs_epi32(u[0], u[1]);
1881  x[9] = _mm_packs_epi32(u[2], u[3]);
1882  x[10] = _mm_packs_epi32(u[4], u[5]);
1883  x[11] = _mm_packs_epi32(u[6], u[7]);
1884  x[12] = _mm_packs_epi32(u[8], u[9]);
1885  x[13] = _mm_packs_epi32(u[10], u[11]);
1886  x[14] = _mm_packs_epi32(u[12], u[13]);
1887  x[15] = _mm_packs_epi32(u[14], u[15]);
1888
1889  // stage 3
1890  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1891  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1892  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1893  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1894  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1895  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1896  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1897  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1898
1899  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1900  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1901  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1902  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1903  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1904  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1905  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1906  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1907  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1908  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1909  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1910  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1911  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1912  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1913  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1914  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1915
1916  u[0] = _mm_add_epi32(v[0], v[4]);
1917  u[1] = _mm_add_epi32(v[1], v[5]);
1918  u[2] = _mm_add_epi32(v[2], v[6]);
1919  u[3] = _mm_add_epi32(v[3], v[7]);
1920  u[4] = _mm_sub_epi32(v[0], v[4]);
1921  u[5] = _mm_sub_epi32(v[1], v[5]);
1922  u[6] = _mm_sub_epi32(v[2], v[6]);
1923  u[7] = _mm_sub_epi32(v[3], v[7]);
1924  u[8] = _mm_add_epi32(v[8], v[12]);
1925  u[9] = _mm_add_epi32(v[9], v[13]);
1926  u[10] = _mm_add_epi32(v[10], v[14]);
1927  u[11] = _mm_add_epi32(v[11], v[15]);
1928  u[12] = _mm_sub_epi32(v[8], v[12]);
1929  u[13] = _mm_sub_epi32(v[9], v[13]);
1930  u[14] = _mm_sub_epi32(v[10], v[14]);
1931  u[15] = _mm_sub_epi32(v[11], v[15]);
1932
1933  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1934  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1935  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1936  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1937  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1938  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1939  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1940  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1941  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1942  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1943  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1944  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1945  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1946  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1947  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1948  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1949
1950  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1951  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1952  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1953  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1954  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1955  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1956  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1957  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1958  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1959  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1960  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1961  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1962  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1963  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1964  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1965  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1966
1967  s[0] = _mm_add_epi16(x[0], x[2]);
1968  s[1] = _mm_add_epi16(x[1], x[3]);
1969  s[2] = _mm_sub_epi16(x[0], x[2]);
1970  s[3] = _mm_sub_epi16(x[1], x[3]);
1971  s[4] = _mm_packs_epi32(v[0], v[1]);
1972  s[5] = _mm_packs_epi32(v[2], v[3]);
1973  s[6] = _mm_packs_epi32(v[4], v[5]);
1974  s[7] = _mm_packs_epi32(v[6], v[7]);
1975  s[8] = _mm_add_epi16(x[8], x[10]);
1976  s[9] = _mm_add_epi16(x[9], x[11]);
1977  s[10] = _mm_sub_epi16(x[8], x[10]);
1978  s[11] = _mm_sub_epi16(x[9], x[11]);
1979  s[12] = _mm_packs_epi32(v[8], v[9]);
1980  s[13] = _mm_packs_epi32(v[10], v[11]);
1981  s[14] = _mm_packs_epi32(v[12], v[13]);
1982  s[15] = _mm_packs_epi32(v[14], v[15]);
1983
1984  // stage 4
1985  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1986  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1987  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1988  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1989  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1990  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1991  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1992  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1993
1994  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
1995  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
1996  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1997  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1998  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1999  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
2000  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
2001  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
2002  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
2003  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
2004  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
2005  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
2006  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
2007  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
2008  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
2009  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
2010
2011  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2012  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2013  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2014  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2015  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2016  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2017  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2018  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2019  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2020  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2021  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2022  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2023  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2024  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2025  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2026  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2027
2028  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2029  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2030  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2031  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2032  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2033  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2034  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2035  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2036  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2037  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2038  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2039  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2040  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2041  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2042  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2043  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2044
2045  in[0] = s[0];
2046  in[1] = _mm_sub_epi16(kZero, s[8]);
2047  in[2] = s[12];
2048  in[3] = _mm_sub_epi16(kZero, s[4]);
2049  in[4] = _mm_packs_epi32(v[4], v[5]);
2050  in[5] = _mm_packs_epi32(v[12], v[13]);
2051  in[6] = _mm_packs_epi32(v[8], v[9]);
2052  in[7] = _mm_packs_epi32(v[0], v[1]);
2053  in[8] = _mm_packs_epi32(v[2], v[3]);
2054  in[9] = _mm_packs_epi32(v[10], v[11]);
2055  in[10] = _mm_packs_epi32(v[14], v[15]);
2056  in[11] = _mm_packs_epi32(v[6], v[7]);
2057  in[12] = s[5];
2058  in[13] = _mm_sub_epi16(kZero, s[13]);
2059  in[14] = s[9];
2060  in[15] = _mm_sub_epi16(kZero, s[1]);
2061}
2062
2063static void idct16_8col(__m128i *in) {
2064  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2065  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
2066  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
2067  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
2068  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
2069  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
2070  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2071  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
2072  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2073  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
2074  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
2075  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
2076  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
2077  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2078  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
2079  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
2080  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2081  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
2082  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2083  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2084  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
2085  __m128i v[16], u[16], s[16], t[16];
2086
2087  // stage 1
2088  s[0] = in[0];
2089  s[1] = in[8];
2090  s[2] = in[4];
2091  s[3] = in[12];
2092  s[4] = in[2];
2093  s[5] = in[10];
2094  s[6] = in[6];
2095  s[7] = in[14];
2096  s[8] = in[1];
2097  s[9] = in[9];
2098  s[10] = in[5];
2099  s[11] = in[13];
2100  s[12] = in[3];
2101  s[13] = in[11];
2102  s[14] = in[7];
2103  s[15] = in[15];
2104
2105  // stage 2
2106  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
2107  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
2108  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
2109  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
2110  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
2111  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
2112  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
2113  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
2114
2115  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
2116  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
2117  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
2118  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
2119  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
2120  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
2121  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
2122  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
2123  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
2124  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
2125  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
2126  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
2127  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
2128  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
2129  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
2130  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
2131
2132  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2133  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2134  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2135  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2136  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2137  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2138  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2139  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2140  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2141  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2142  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2143  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2144  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2145  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2146  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2147  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2148
2149  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2150  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2151  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2152  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2153  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2154  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2155  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2156  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2157  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2158  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2159  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2160  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2161  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2162  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2163  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2164  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2165
2166  s[8]  = _mm_packs_epi32(u[0], u[1]);
2167  s[15] = _mm_packs_epi32(u[2], u[3]);
2168  s[9]  = _mm_packs_epi32(u[4], u[5]);
2169  s[14] = _mm_packs_epi32(u[6], u[7]);
2170  s[10] = _mm_packs_epi32(u[8], u[9]);
2171  s[13] = _mm_packs_epi32(u[10], u[11]);
2172  s[11] = _mm_packs_epi32(u[12], u[13]);
2173  s[12] = _mm_packs_epi32(u[14], u[15]);
2174
2175  // stage 3
2176  t[0] = s[0];
2177  t[1] = s[1];
2178  t[2] = s[2];
2179  t[3] = s[3];
2180  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
2181  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
2182  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
2183  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
2184
2185  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
2186  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
2187  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
2188  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
2189  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
2190  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
2191  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
2192  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
2193
2194  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2195  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2196  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2197  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2198  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2199  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2200  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2201  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2202
2203  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2204  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2205  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2206  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2207  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2208  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2209  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2210  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2211
2212  t[4] = _mm_packs_epi32(u[0], u[1]);
2213  t[7] = _mm_packs_epi32(u[2], u[3]);
2214  t[5] = _mm_packs_epi32(u[4], u[5]);
2215  t[6] = _mm_packs_epi32(u[6], u[7]);
2216  t[8] = _mm_add_epi16(s[8], s[9]);
2217  t[9] = _mm_sub_epi16(s[8], s[9]);
2218  t[10] = _mm_sub_epi16(s[11], s[10]);
2219  t[11] = _mm_add_epi16(s[10], s[11]);
2220  t[12] = _mm_add_epi16(s[12], s[13]);
2221  t[13] = _mm_sub_epi16(s[12], s[13]);
2222  t[14] = _mm_sub_epi16(s[15], s[14]);
2223  t[15] = _mm_add_epi16(s[14], s[15]);
2224
2225  // stage 4
2226  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
2227  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
2228  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
2229  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
2230  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
2231  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
2232  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
2233  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
2234
2235  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2236  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2237  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
2238  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
2239  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
2240  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
2241  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
2242  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
2243  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
2244  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
2245  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
2246  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
2247  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
2248  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
2249  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
2250  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
2251
2252  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2253  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2254  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2255  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2256  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2257  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2258  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2259  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2260  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2261  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2262  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2263  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2264  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2265  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2266  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2267  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2268
2269  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2270  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2271  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2272  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2273  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2274  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2275  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2276  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2277  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2278  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2279  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2280  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2281  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2282  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2283  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2284  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2285
2286  s[0] = _mm_packs_epi32(u[0], u[1]);
2287  s[1] = _mm_packs_epi32(u[2], u[3]);
2288  s[2] = _mm_packs_epi32(u[4], u[5]);
2289  s[3] = _mm_packs_epi32(u[6], u[7]);
2290  s[4] = _mm_add_epi16(t[4], t[5]);
2291  s[5] = _mm_sub_epi16(t[4], t[5]);
2292  s[6] = _mm_sub_epi16(t[7], t[6]);
2293  s[7] = _mm_add_epi16(t[6], t[7]);
2294  s[8] = t[8];
2295  s[15] = t[15];
2296  s[9]  = _mm_packs_epi32(u[8], u[9]);
2297  s[14] = _mm_packs_epi32(u[10], u[11]);
2298  s[10] = _mm_packs_epi32(u[12], u[13]);
2299  s[13] = _mm_packs_epi32(u[14], u[15]);
2300  s[11] = t[11];
2301  s[12] = t[12];
2302
2303  // stage 5
2304  t[0] = _mm_add_epi16(s[0], s[3]);
2305  t[1] = _mm_add_epi16(s[1], s[2]);
2306  t[2] = _mm_sub_epi16(s[1], s[2]);
2307  t[3] = _mm_sub_epi16(s[0], s[3]);
2308  t[4] = s[4];
2309  t[7] = s[7];
2310
2311  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
2312  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
2313  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2314  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2315  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2316  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2317  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2318  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2319  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2320  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2321  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2322  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2323  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2324  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2325  t[5] = _mm_packs_epi32(u[0], u[1]);
2326  t[6] = _mm_packs_epi32(u[2], u[3]);
2327
2328  t[8] = _mm_add_epi16(s[8], s[11]);
2329  t[9] = _mm_add_epi16(s[9], s[10]);
2330  t[10] = _mm_sub_epi16(s[9], s[10]);
2331  t[11] = _mm_sub_epi16(s[8], s[11]);
2332  t[12] = _mm_sub_epi16(s[15], s[12]);
2333  t[13] = _mm_sub_epi16(s[14], s[13]);
2334  t[14] = _mm_add_epi16(s[13], s[14]);
2335  t[15] = _mm_add_epi16(s[12], s[15]);
2336
2337  // stage 6
2338  s[0] = _mm_add_epi16(t[0], t[7]);
2339  s[1] = _mm_add_epi16(t[1], t[6]);
2340  s[2] = _mm_add_epi16(t[2], t[5]);
2341  s[3] = _mm_add_epi16(t[3], t[4]);
2342  s[4] = _mm_sub_epi16(t[3], t[4]);
2343  s[5] = _mm_sub_epi16(t[2], t[5]);
2344  s[6] = _mm_sub_epi16(t[1], t[6]);
2345  s[7] = _mm_sub_epi16(t[0], t[7]);
2346  s[8] = t[8];
2347  s[9] = t[9];
2348
2349  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
2350  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
2351  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
2352  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
2353
2354  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2355  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2356  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2357  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2358  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
2359  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
2360  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
2361  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
2362
2363  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2364  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2365  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2366  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2367  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2368  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2369  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2370  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2371
2372  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2373  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2374  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2375  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2376  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2377  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2378  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2379  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2380
2381  s[10] = _mm_packs_epi32(u[0], u[1]);
2382  s[13] = _mm_packs_epi32(u[2], u[3]);
2383  s[11] = _mm_packs_epi32(u[4], u[5]);
2384  s[12] = _mm_packs_epi32(u[6], u[7]);
2385  s[14] = t[14];
2386  s[15] = t[15];
2387
2388  // stage 7
2389  in[0] = _mm_add_epi16(s[0], s[15]);
2390  in[1] = _mm_add_epi16(s[1], s[14]);
2391  in[2] = _mm_add_epi16(s[2], s[13]);
2392  in[3] = _mm_add_epi16(s[3], s[12]);
2393  in[4] = _mm_add_epi16(s[4], s[11]);
2394  in[5] = _mm_add_epi16(s[5], s[10]);
2395  in[6] = _mm_add_epi16(s[6], s[9]);
2396  in[7] = _mm_add_epi16(s[7], s[8]);
2397  in[8] = _mm_sub_epi16(s[7], s[8]);
2398  in[9] = _mm_sub_epi16(s[6], s[9]);
2399  in[10] = _mm_sub_epi16(s[5], s[10]);
2400  in[11] = _mm_sub_epi16(s[4], s[11]);
2401  in[12] = _mm_sub_epi16(s[3], s[12]);
2402  in[13] = _mm_sub_epi16(s[2], s[13]);
2403  in[14] = _mm_sub_epi16(s[1], s[14]);
2404  in[15] = _mm_sub_epi16(s[0], s[15]);
2405}
2406
2407static void idct16_sse2(__m128i *in0, __m128i *in1) {
2408  array_transpose_16x16(in0, in1);
2409  idct16_8col(in0);
2410  idct16_8col(in1);
2411}
2412
2413static void iadst16_sse2(__m128i *in0, __m128i *in1) {
2414  array_transpose_16x16(in0, in1);
2415  iadst16_8col(in0);
2416  iadst16_8col(in1);
2417}
2418
2419static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
2420  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * 16));
2421  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * 16));
2422  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * 16));
2423  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * 16));
2424  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * 16));
2425  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * 16));
2426  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * 16));
2427  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * 16));
2428
2429  in[8]  = _mm_load_si128((const __m128i *)(input + 8 * 16));
2430  in[9]  = _mm_load_si128((const __m128i *)(input + 9 * 16));
2431  in[10]  = _mm_load_si128((const __m128i *)(input + 10 * 16));
2432  in[11]  = _mm_load_si128((const __m128i *)(input + 11 * 16));
2433  in[12]  = _mm_load_si128((const __m128i *)(input + 12 * 16));
2434  in[13]  = _mm_load_si128((const __m128i *)(input + 13 * 16));
2435  in[14]  = _mm_load_si128((const __m128i *)(input + 14 * 16));
2436  in[15]  = _mm_load_si128((const __m128i *)(input + 15 * 16));
2437}
2438
2439static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
2440  const __m128i final_rounding = _mm_set1_epi16(1<<5);
2441  const __m128i zero = _mm_setzero_si128();
2442  // Final rounding and shift
2443  in[0] = _mm_adds_epi16(in[0], final_rounding);
2444  in[1] = _mm_adds_epi16(in[1], final_rounding);
2445  in[2] = _mm_adds_epi16(in[2], final_rounding);
2446  in[3] = _mm_adds_epi16(in[3], final_rounding);
2447  in[4] = _mm_adds_epi16(in[4], final_rounding);
2448  in[5] = _mm_adds_epi16(in[5], final_rounding);
2449  in[6] = _mm_adds_epi16(in[6], final_rounding);
2450  in[7] = _mm_adds_epi16(in[7], final_rounding);
2451  in[8] = _mm_adds_epi16(in[8], final_rounding);
2452  in[9] = _mm_adds_epi16(in[9], final_rounding);
2453  in[10] = _mm_adds_epi16(in[10], final_rounding);
2454  in[11] = _mm_adds_epi16(in[11], final_rounding);
2455  in[12] = _mm_adds_epi16(in[12], final_rounding);
2456  in[13] = _mm_adds_epi16(in[13], final_rounding);
2457  in[14] = _mm_adds_epi16(in[14], final_rounding);
2458  in[15] = _mm_adds_epi16(in[15], final_rounding);
2459
2460  in[0] = _mm_srai_epi16(in[0], 6);
2461  in[1] = _mm_srai_epi16(in[1], 6);
2462  in[2] = _mm_srai_epi16(in[2], 6);
2463  in[3] = _mm_srai_epi16(in[3], 6);
2464  in[4] = _mm_srai_epi16(in[4], 6);
2465  in[5] = _mm_srai_epi16(in[5], 6);
2466  in[6] = _mm_srai_epi16(in[6], 6);
2467  in[7] = _mm_srai_epi16(in[7], 6);
2468  in[8] = _mm_srai_epi16(in[8], 6);
2469  in[9] = _mm_srai_epi16(in[9], 6);
2470  in[10] = _mm_srai_epi16(in[10], 6);
2471  in[11] = _mm_srai_epi16(in[11], 6);
2472  in[12] = _mm_srai_epi16(in[12], 6);
2473  in[13] = _mm_srai_epi16(in[13], 6);
2474  in[14] = _mm_srai_epi16(in[14], 6);
2475  in[15] = _mm_srai_epi16(in[15], 6);
2476
2477  RECON_AND_STORE(dest, in[0]);
2478  RECON_AND_STORE(dest, in[1]);
2479  RECON_AND_STORE(dest, in[2]);
2480  RECON_AND_STORE(dest, in[3]);
2481  RECON_AND_STORE(dest, in[4]);
2482  RECON_AND_STORE(dest, in[5]);
2483  RECON_AND_STORE(dest, in[6]);
2484  RECON_AND_STORE(dest, in[7]);
2485  RECON_AND_STORE(dest, in[8]);
2486  RECON_AND_STORE(dest, in[9]);
2487  RECON_AND_STORE(dest, in[10]);
2488  RECON_AND_STORE(dest, in[11]);
2489  RECON_AND_STORE(dest, in[12]);
2490  RECON_AND_STORE(dest, in[13]);
2491  RECON_AND_STORE(dest, in[14]);
2492  RECON_AND_STORE(dest, in[15]);
2493}
2494
2495void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
2496                               int tx_type) {
2497  __m128i in0[16], in1[16];
2498
2499  load_buffer_8x16(input, in0);
2500  input += 8;
2501  load_buffer_8x16(input, in1);
2502
2503  switch (tx_type) {
2504    case 0:  // DCT_DCT
2505      idct16_sse2(in0, in1);
2506      idct16_sse2(in0, in1);
2507      break;
2508    case 1:  // ADST_DCT
2509      idct16_sse2(in0, in1);
2510      iadst16_sse2(in0, in1);
2511      break;
2512    case 2:  // DCT_ADST
2513      iadst16_sse2(in0, in1);
2514      idct16_sse2(in0, in1);
2515      break;
2516    case 3:  // ADST_ADST
2517      iadst16_sse2(in0, in1);
2518      iadst16_sse2(in0, in1);
2519      break;
2520    default:
2521      assert(0);
2522      break;
2523  }
2524
2525  write_buffer_8x16(dest, in0, stride);
2526  dest += 8;
2527  write_buffer_8x16(dest, in1, stride);
2528}
2529
2530void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
2531                               int stride) {
2532  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2533  const __m128i final_rounding = _mm_set1_epi16(1<<5);
2534  const __m128i zero = _mm_setzero_si128();
2535
2536  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2537  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2538  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2539  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2540
2541  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2542  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2543
2544  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2545  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2546  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2547  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2548  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2549  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2550
2551  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2552  __m128i in[16], l[16];
2553  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
2554          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2555          stp1_8_0, stp1_12_0;
2556  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2557          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
2558  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2559  int i;
2560  // First 1-D inverse DCT
2561  // Load input data.
2562  in[0] = _mm_load_si128((const __m128i *)input);
2563  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
2564  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
2565  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
2566
2567  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
2568
2569  // Stage2
2570  {
2571    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
2572    const __m128i lo_13_3 =  _mm_unpackhi_epi16(zero, in[1]);
2573
2574    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2575    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2576    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2577    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2578
2579    tmp0 = _mm_add_epi32(tmp0, rounding);
2580    tmp2 = _mm_add_epi32(tmp2, rounding);
2581    tmp5 = _mm_add_epi32(tmp5, rounding);
2582    tmp7 = _mm_add_epi32(tmp7, rounding);
2583
2584    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2585    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2586    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2587    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2588
2589    stp2_8  = _mm_packs_epi32(tmp0, tmp2);
2590    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
2591  }
2592
2593  // Stage3
2594  {
2595    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
2596
2597    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2598    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2599
2600    tmp0 = _mm_add_epi32(tmp0, rounding);
2601    tmp2 = _mm_add_epi32(tmp2, rounding);
2602    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2603    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2604
2605    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
2606    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
2607
2608    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
2609  }
2610
2611  // Stage4
2612  {
2613    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
2614    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
2615    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
2616
2617    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2618    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2619    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2620    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2621    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2622    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2623
2624    tmp0 = _mm_add_epi32(tmp0, rounding);
2625    tmp2 = _mm_add_epi32(tmp2, rounding);
2626    tmp1 = _mm_add_epi32(tmp1, rounding);
2627    tmp3 = _mm_add_epi32(tmp3, rounding);
2628    tmp5 = _mm_add_epi32(tmp5, rounding);
2629    tmp7 = _mm_add_epi32(tmp7, rounding);
2630
2631    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2632    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2633    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2634    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2635    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2636    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2637
2638    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
2639    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
2640    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
2641    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
2642
2643    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
2644  }
2645
2646  // Stage5 and Stage6
2647  {
2648    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
2649    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
2650    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
2651    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
2652
2653    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
2654    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
2655    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
2656    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
2657
2658    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
2659    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
2660    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
2661    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
2662  }
2663
2664  // Stage6
2665  {
2666    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
2667    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2668    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2669
2670    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2671    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2672    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2673    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2674    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2675    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2676
2677    tmp1 = _mm_add_epi32(tmp1, rounding);
2678    tmp3 = _mm_add_epi32(tmp3, rounding);
2679    tmp0 = _mm_add_epi32(tmp0, rounding);
2680    tmp2 = _mm_add_epi32(tmp2, rounding);
2681    tmp4 = _mm_add_epi32(tmp4, rounding);
2682    tmp6 = _mm_add_epi32(tmp6, rounding);
2683
2684    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2685    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2686    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2687    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2688    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2689    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2690
2691    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
2692
2693    stp2_10 = _mm_packs_epi32(tmp0, zero);
2694    stp2_13 = _mm_packs_epi32(tmp2, zero);
2695    stp2_11 = _mm_packs_epi32(tmp4, zero);
2696    stp2_12 = _mm_packs_epi32(tmp6, zero);
2697
2698    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
2699    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
2700    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
2701    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
2702
2703    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
2704    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
2705    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
2706    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
2707    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
2708    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
2709    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
2710    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
2711  }
2712
2713  // Stage7. Left 8x16 only.
2714  l[0] = _mm_add_epi16(stp2_0, stp1_15);
2715  l[1] = _mm_add_epi16(stp2_1, stp1_14);
2716  l[2] = _mm_add_epi16(stp2_2, stp2_13);
2717  l[3] = _mm_add_epi16(stp2_3, stp2_12);
2718  l[4] = _mm_add_epi16(stp2_4, stp2_11);
2719  l[5] = _mm_add_epi16(stp2_5, stp2_10);
2720  l[6] = _mm_add_epi16(stp2_6, stp1_9);
2721  l[7] = _mm_add_epi16(stp2_7, stp1_8);
2722  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
2723  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
2724  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
2725  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
2726  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
2727  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
2728  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
2729  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
2730
2731  // Second 1-D inverse transform, performed per 8x16 block
2732  for (i = 0; i < 2; i++) {
2733    array_transpose_4X8(l + 8*i, in);
2734
2735    IDCT16_10
2736
2737    // Stage7
2738    in[0] = _mm_add_epi16(stp2_0, stp1_15);
2739    in[1] = _mm_add_epi16(stp2_1, stp1_14);
2740    in[2] = _mm_add_epi16(stp2_2, stp2_13);
2741    in[3] = _mm_add_epi16(stp2_3, stp2_12);
2742    in[4] = _mm_add_epi16(stp2_4, stp2_11);
2743    in[5] = _mm_add_epi16(stp2_5, stp2_10);
2744    in[6] = _mm_add_epi16(stp2_6, stp1_9);
2745    in[7] = _mm_add_epi16(stp2_7, stp1_8);
2746    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
2747    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
2748    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
2749    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
2750    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
2751    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
2752    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
2753    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
2754
2755    // Final rounding and shift
2756    in[0] = _mm_adds_epi16(in[0], final_rounding);
2757    in[1] = _mm_adds_epi16(in[1], final_rounding);
2758    in[2] = _mm_adds_epi16(in[2], final_rounding);
2759    in[3] = _mm_adds_epi16(in[3], final_rounding);
2760    in[4] = _mm_adds_epi16(in[4], final_rounding);
2761    in[5] = _mm_adds_epi16(in[5], final_rounding);
2762    in[6] = _mm_adds_epi16(in[6], final_rounding);
2763    in[7] = _mm_adds_epi16(in[7], final_rounding);
2764    in[8] = _mm_adds_epi16(in[8], final_rounding);
2765    in[9] = _mm_adds_epi16(in[9], final_rounding);
2766    in[10] = _mm_adds_epi16(in[10], final_rounding);
2767    in[11] = _mm_adds_epi16(in[11], final_rounding);
2768    in[12] = _mm_adds_epi16(in[12], final_rounding);
2769    in[13] = _mm_adds_epi16(in[13], final_rounding);
2770    in[14] = _mm_adds_epi16(in[14], final_rounding);
2771    in[15] = _mm_adds_epi16(in[15], final_rounding);
2772
2773    in[0] = _mm_srai_epi16(in[0], 6);
2774    in[1] = _mm_srai_epi16(in[1], 6);
2775    in[2] = _mm_srai_epi16(in[2], 6);
2776    in[3] = _mm_srai_epi16(in[3], 6);
2777    in[4] = _mm_srai_epi16(in[4], 6);
2778    in[5] = _mm_srai_epi16(in[5], 6);
2779    in[6] = _mm_srai_epi16(in[6], 6);
2780    in[7] = _mm_srai_epi16(in[7], 6);
2781    in[8] = _mm_srai_epi16(in[8], 6);
2782    in[9] = _mm_srai_epi16(in[9], 6);
2783    in[10] = _mm_srai_epi16(in[10], 6);
2784    in[11] = _mm_srai_epi16(in[11], 6);
2785    in[12] = _mm_srai_epi16(in[12], 6);
2786    in[13] = _mm_srai_epi16(in[13], 6);
2787    in[14] = _mm_srai_epi16(in[14], 6);
2788    in[15] = _mm_srai_epi16(in[15], 6);
2789
2790    RECON_AND_STORE(dest, in[0]);
2791    RECON_AND_STORE(dest, in[1]);
2792    RECON_AND_STORE(dest, in[2]);
2793    RECON_AND_STORE(dest, in[3]);
2794    RECON_AND_STORE(dest, in[4]);
2795    RECON_AND_STORE(dest, in[5]);
2796    RECON_AND_STORE(dest, in[6]);
2797    RECON_AND_STORE(dest, in[7]);
2798    RECON_AND_STORE(dest, in[8]);
2799    RECON_AND_STORE(dest, in[9]);
2800    RECON_AND_STORE(dest, in[10]);
2801    RECON_AND_STORE(dest, in[11]);
2802    RECON_AND_STORE(dest, in[12]);
2803    RECON_AND_STORE(dest, in[13]);
2804    RECON_AND_STORE(dest, in[14]);
2805    RECON_AND_STORE(dest, in[15]);
2806
2807    dest += 8 - (stride * 16);
2808  }
2809}
2810
2811#define LOAD_DQCOEFF(reg, input) \
2812  {  \
2813    reg = _mm_load_si128((const __m128i *) input); \
2814    input += 8; \
2815  }  \
2816
2817#define IDCT32_34 \
2818/* Stage1 */ \
2819{ \
2820  const __m128i zero = _mm_setzero_si128();\
2821  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
2822  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
2823  \
2824  const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \
2825  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
2826  \
2827  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
2828  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
2829  \
2830  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
2831  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
2832  \
2833  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
2834                         stg1_1, stp1_16, stp1_31); \
2835  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
2836                         stg1_7, stp1_19, stp1_28); \
2837  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
2838                         stg1_9, stp1_20, stp1_27); \
2839  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
2840                         stg1_15, stp1_23, stp1_24); \
2841} \
2842\
2843/* Stage2 */ \
2844{ \
2845  const __m128i zero = _mm_setzero_si128();\
2846  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
2847  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
2848  \
2849  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
2850  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
2851  \
2852  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
2853                         stg2_1, stp2_8, stp2_15); \
2854  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
2855                         stg2_7, stp2_11, stp2_12); \
2856  \
2857  stp2_16 = stp1_16; \
2858  stp2_19 = stp1_19; \
2859  \
2860  stp2_20 = stp1_20; \
2861  stp2_23 = stp1_23; \
2862  \
2863  stp2_24 = stp1_24; \
2864  stp2_27 = stp1_27; \
2865  \
2866  stp2_28 = stp1_28; \
2867  stp2_31 = stp1_31; \
2868} \
2869\
2870/* Stage3 */ \
2871{ \
2872  const __m128i zero = _mm_setzero_si128();\
2873  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
2874  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
2875  \
2876  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
2877  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
2878  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
2879  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
2880  \
2881  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
2882  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
2883  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2884  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
2885  \
2886  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
2887                         stg3_1, stp1_4, stp1_7); \
2888  \
2889  stp1_8 = stp2_8; \
2890  stp1_11 = stp2_11; \
2891  stp1_12 = stp2_12; \
2892  stp1_15 = stp2_15; \
2893  \
2894  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2895                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2896                         stp1_18, stp1_29) \
2897  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2898                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2899                         stp1_22, stp1_25) \
2900  \
2901  stp1_16 = stp2_16; \
2902  stp1_31 = stp2_31; \
2903  stp1_19 = stp2_19; \
2904  stp1_20 = stp2_20; \
2905  stp1_23 = stp2_23; \
2906  stp1_24 = stp2_24; \
2907  stp1_27 = stp2_27; \
2908  stp1_28 = stp2_28; \
2909} \
2910\
2911/* Stage4 */ \
2912{ \
2913  const __m128i zero = _mm_setzero_si128();\
2914  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
2915  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
2916  \
2917  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
2918  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
2919  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
2920  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
2921  \
2922  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
2923                         stg4_1, stp2_0, stp2_1); \
2924  \
2925  stp2_4 = stp1_4; \
2926  stp2_5 = stp1_4; \
2927  stp2_6 = stp1_7; \
2928  stp2_7 = stp1_7; \
2929  \
2930  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2931                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2932                         stp2_10, stp2_13) \
2933  \
2934  stp2_8 = stp1_8; \
2935  stp2_15 = stp1_15; \
2936  stp2_11 = stp1_11; \
2937  stp2_12 = stp1_12; \
2938  \
2939  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2940  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2941  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2942  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2943  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2944  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2945  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2946  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2947  \
2948  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2949  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2950  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2951  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2952  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2953  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2954  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2955  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2956} \
2957\
2958/* Stage5 */ \
2959{ \
2960  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2961  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2962  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2963  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2964  \
2965  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2966  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2967  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2968  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2969  \
2970  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2971  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2972  \
2973  stp1_0 = stp2_0; \
2974  stp1_1 = stp2_1; \
2975  stp1_2 = stp2_1; \
2976  stp1_3 = stp2_0; \
2977  \
2978  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2979  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2980  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2981  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2982  \
2983  tmp0 = _mm_add_epi32(tmp0, rounding); \
2984  tmp1 = _mm_add_epi32(tmp1, rounding); \
2985  tmp2 = _mm_add_epi32(tmp2, rounding); \
2986  tmp3 = _mm_add_epi32(tmp3, rounding); \
2987  \
2988  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2989  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2990  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2991  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2992  \
2993  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2994  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2995  \
2996  stp1_4 = stp2_4; \
2997  stp1_7 = stp2_7; \
2998  \
2999  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
3000  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
3001  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
3002  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
3003  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
3004  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
3005  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
3006  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
3007  \
3008  stp1_16 = stp2_16; \
3009  stp1_17 = stp2_17; \
3010  \
3011  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
3012                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
3013                         stp1_19, stp1_28) \
3014  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
3015                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
3016                         stp1_21, stp1_26) \
3017  \
3018  stp1_22 = stp2_22; \
3019  stp1_23 = stp2_23; \
3020  stp1_24 = stp2_24; \
3021  stp1_25 = stp2_25; \
3022  stp1_30 = stp2_30; \
3023  stp1_31 = stp2_31; \
3024} \
3025\
3026/* Stage6 */ \
3027{ \
3028  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
3029  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
3030  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
3031  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
3032  \
3033  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
3034  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
3035  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
3036  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
3037  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
3038  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
3039  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
3040  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
3041  \
3042  stp2_8 = stp1_8; \
3043  stp2_9 = stp1_9; \
3044  stp2_14 = stp1_14; \
3045  stp2_15 = stp1_15; \
3046  \
3047  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
3048                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
3049                         stp2_13, stp2_11, stp2_12) \
3050  \
3051  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
3052  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
3053  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
3054  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
3055  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
3056  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
3057  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
3058  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
3059  \
3060  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
3061  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
3062  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
3063  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
3064  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
3065  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
3066  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
3067  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
3068} \
3069\
3070/* Stage7 */ \
3071{ \
3072  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
3073  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
3074  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3075  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3076  \
3077  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
3078  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
3079  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
3080  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
3081  \
3082  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
3083  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
3084  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
3085  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
3086  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
3087  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
3088  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
3089  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
3090  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
3091  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
3092  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
3093  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3094  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3095  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3096  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3097  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3098  \
3099  stp1_16 = stp2_16; \
3100  stp1_17 = stp2_17; \
3101  stp1_18 = stp2_18; \
3102  stp1_19 = stp2_19; \
3103  \
3104  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3105                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3106                         stp1_21, stp1_26) \
3107  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3108                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3109                         stp1_23, stp1_24) \
3110  \
3111  stp1_28 = stp2_28; \
3112  stp1_29 = stp2_29; \
3113  stp1_30 = stp2_30; \
3114  stp1_31 = stp2_31; \
3115}
3116
3117
3118#define IDCT32 \
3119/* Stage1 */ \
3120{ \
3121  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
3122  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
3123  const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
3124  const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
3125  \
3126  const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
3127  const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
3128  const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
3129  const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
3130  \
3131  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
3132  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
3133  const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
3134  const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
3135  \
3136  const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
3137  const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
3138  const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
3139  const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
3140  \
3141  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
3142                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
3143                         stp1_17, stp1_30) \
3144  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
3145                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
3146                         stp1_19, stp1_28) \
3147  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
3148                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
3149                         stp1_21, stp1_26) \
3150  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
3151                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
3152                         stp1_23, stp1_24) \
3153} \
3154\
3155/* Stage2 */ \
3156{ \
3157  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
3158  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
3159  const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
3160  const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
3161  \
3162  const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
3163  const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
3164  const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
3165  const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
3166  \
3167  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
3168                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
3169                         stp2_14) \
3170  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
3171                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
3172                         stp2_11, stp2_12) \
3173  \
3174  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
3175  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
3176  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
3177  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
3178  \
3179  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
3180  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
3181  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
3182  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
3183  \
3184  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
3185  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
3186  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
3187  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
3188  \
3189  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
3190  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
3191  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
3192  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
3193} \
3194\
3195/* Stage3 */ \
3196{ \
3197  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
3198  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
3199  const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
3200  const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
3201  \
3202  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
3203  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
3204  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
3205  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
3206  \
3207  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3208  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3209  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
3210  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
3211  \
3212  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
3213                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
3214                         stp1_6) \
3215  \
3216  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
3217  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
3218  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
3219  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
3220  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
3221  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
3222  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
3223  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
3224  \
3225  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
3226                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
3227                         stp1_18, stp1_29) \
3228  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
3229                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
3230                         stp1_22, stp1_25) \
3231  \
3232  stp1_16 = stp2_16; \
3233  stp1_31 = stp2_31; \
3234  stp1_19 = stp2_19; \
3235  stp1_20 = stp2_20; \
3236  stp1_23 = stp2_23; \
3237  stp1_24 = stp2_24; \
3238  stp1_27 = stp2_27; \
3239  stp1_28 = stp2_28; \
3240} \
3241\
3242/* Stage4 */ \
3243{ \
3244  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
3245  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
3246  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
3247  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
3248  \
3249  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
3250  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
3251  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
3252  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
3253  \
3254  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
3255                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
3256                         stp2_2, stp2_3) \
3257  \
3258  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
3259  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
3260  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
3261  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
3262  \
3263  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
3264                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
3265                         stp2_10, stp2_13) \
3266  \
3267  stp2_8 = stp1_8; \
3268  stp2_15 = stp1_15; \
3269  stp2_11 = stp1_11; \
3270  stp2_12 = stp1_12; \
3271  \
3272  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
3273  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
3274  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
3275  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
3276  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
3277  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
3278  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
3279  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
3280  \
3281  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
3282  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
3283  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
3284  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
3285  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
3286  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
3287  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
3288  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
3289} \
3290\
3291/* Stage5 */ \
3292{ \
3293  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
3294  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
3295  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
3296  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
3297  \
3298  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
3299  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
3300  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
3301  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
3302  \
3303  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3304  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3305  \
3306  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
3307  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
3308  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
3309  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
3310  \
3311  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
3312  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
3313  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
3314  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
3315  \
3316  tmp0 = _mm_add_epi32(tmp0, rounding); \
3317  tmp1 = _mm_add_epi32(tmp1, rounding); \
3318  tmp2 = _mm_add_epi32(tmp2, rounding); \
3319  tmp3 = _mm_add_epi32(tmp3, rounding); \
3320  \
3321  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
3322  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
3323  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
3324  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
3325  \
3326  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
3327  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
3328  \
3329  stp1_4 = stp2_4; \
3330  stp1_7 = stp2_7; \
3331  \
3332  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
3333  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
3334  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
3335  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
3336  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
3337  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
3338  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
3339  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
3340  \
3341  stp1_16 = stp2_16; \
3342  stp1_17 = stp2_17; \
3343  \
3344  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
3345                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
3346                         stp1_19, stp1_28) \
3347  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
3348                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
3349                         stp1_21, stp1_26) \
3350  \
3351  stp1_22 = stp2_22; \
3352  stp1_23 = stp2_23; \
3353  stp1_24 = stp2_24; \
3354  stp1_25 = stp2_25; \
3355  stp1_30 = stp2_30; \
3356  stp1_31 = stp2_31; \
3357} \
3358\
3359/* Stage6 */ \
3360{ \
3361  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
3362  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
3363  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
3364  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
3365  \
3366  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
3367  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
3368  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
3369  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
3370  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
3371  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
3372  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
3373  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
3374  \
3375  stp2_8 = stp1_8; \
3376  stp2_9 = stp1_9; \
3377  stp2_14 = stp1_14; \
3378  stp2_15 = stp1_15; \
3379  \
3380  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
3381                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
3382                         stp2_13, stp2_11, stp2_12) \
3383  \
3384  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
3385  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
3386  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
3387  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
3388  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
3389  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
3390  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
3391  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
3392  \
3393  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
3394  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
3395  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
3396  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
3397  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
3398  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
3399  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
3400  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
3401} \
3402\
3403/* Stage7 */ \
3404{ \
3405  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
3406  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
3407  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3408  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3409  \
3410  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
3411  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
3412  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
3413  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
3414  \
3415  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
3416  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
3417  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
3418  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
3419  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
3420  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
3421  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
3422  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
3423  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
3424  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
3425  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
3426  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3427  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3428  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3429  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3430  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3431  \
3432  stp1_16 = stp2_16; \
3433  stp1_17 = stp2_17; \
3434  stp1_18 = stp2_18; \
3435  stp1_19 = stp2_19; \
3436  \
3437  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3438                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3439                         stp1_21, stp1_26) \
3440  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3441                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3442                         stp1_23, stp1_24) \
3443  \
3444  stp1_28 = stp2_28; \
3445  stp1_29 = stp2_29; \
3446  stp1_30 = stp2_30; \
3447  stp1_31 = stp2_31; \
3448}
3449
3450// Only upper-left 8x8 has non-zero coeff
3451void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
3452                                 int stride) {
3453  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3454  const __m128i final_rounding = _mm_set1_epi16(1<<5);
3455
3456  // idct constants for each stage
3457  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3458  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3459  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3460  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3461  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3462  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3463  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3464  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3465  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3466  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3467  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3468  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3469  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3470  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3471  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3472  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3473
3474  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3475  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3476  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3477  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3478  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3479  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3480  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3481  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3482
3483  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3484  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3485  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3486  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3487  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3488  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3489  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3490  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3491  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3492  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3493
3494  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3495  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3496  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3497  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3498  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3499  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3500  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3501
3502  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3503
3504  __m128i in[32], col[32];
3505  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3506          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3507          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3508          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3509          stp1_30, stp1_31;
3510  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3511          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3512          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3513          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3514          stp2_30, stp2_31;
3515  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3516  int i;
3517  // Load input data.
3518  LOAD_DQCOEFF(in[0], input);
3519  LOAD_DQCOEFF(in[8], input);
3520  LOAD_DQCOEFF(in[16], input);
3521  LOAD_DQCOEFF(in[24], input);
3522  LOAD_DQCOEFF(in[1], input);
3523  LOAD_DQCOEFF(in[9], input);
3524  LOAD_DQCOEFF(in[17], input);
3525  LOAD_DQCOEFF(in[25], input);
3526  LOAD_DQCOEFF(in[2], input);
3527  LOAD_DQCOEFF(in[10], input);
3528  LOAD_DQCOEFF(in[18], input);
3529  LOAD_DQCOEFF(in[26], input);
3530  LOAD_DQCOEFF(in[3], input);
3531  LOAD_DQCOEFF(in[11], input);
3532  LOAD_DQCOEFF(in[19], input);
3533  LOAD_DQCOEFF(in[27], input);
3534
3535  LOAD_DQCOEFF(in[4], input);
3536  LOAD_DQCOEFF(in[12], input);
3537  LOAD_DQCOEFF(in[20], input);
3538  LOAD_DQCOEFF(in[28], input);
3539  LOAD_DQCOEFF(in[5], input);
3540  LOAD_DQCOEFF(in[13], input);
3541  LOAD_DQCOEFF(in[21], input);
3542  LOAD_DQCOEFF(in[29], input);
3543  LOAD_DQCOEFF(in[6], input);
3544  LOAD_DQCOEFF(in[14], input);
3545  LOAD_DQCOEFF(in[22], input);
3546  LOAD_DQCOEFF(in[30], input);
3547  LOAD_DQCOEFF(in[7], input);
3548  LOAD_DQCOEFF(in[15], input);
3549  LOAD_DQCOEFF(in[23], input);
3550  LOAD_DQCOEFF(in[31], input);
3551
3552  array_transpose_8x8(in, in);
3553  array_transpose_8x8(in+8, in+8);
3554  array_transpose_8x8(in+16, in+16);
3555  array_transpose_8x8(in+24, in+24);
3556
3557  IDCT32
3558
3559  // 1_D: Store 32 intermediate results for each 8x32 block.
3560  col[0] = _mm_add_epi16(stp1_0, stp1_31);
3561  col[1] = _mm_add_epi16(stp1_1, stp1_30);
3562  col[2] = _mm_add_epi16(stp1_2, stp1_29);
3563  col[3] = _mm_add_epi16(stp1_3, stp1_28);
3564  col[4] = _mm_add_epi16(stp1_4, stp1_27);
3565  col[5] = _mm_add_epi16(stp1_5, stp1_26);
3566  col[6] = _mm_add_epi16(stp1_6, stp1_25);
3567  col[7] = _mm_add_epi16(stp1_7, stp1_24);
3568  col[8] = _mm_add_epi16(stp1_8, stp1_23);
3569  col[9] = _mm_add_epi16(stp1_9, stp1_22);
3570  col[10] = _mm_add_epi16(stp1_10, stp1_21);
3571  col[11] = _mm_add_epi16(stp1_11, stp1_20);
3572  col[12] = _mm_add_epi16(stp1_12, stp1_19);
3573  col[13] = _mm_add_epi16(stp1_13, stp1_18);
3574  col[14] = _mm_add_epi16(stp1_14, stp1_17);
3575  col[15] = _mm_add_epi16(stp1_15, stp1_16);
3576  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
3577  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
3578  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
3579  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
3580  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
3581  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
3582  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
3583  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
3584  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
3585  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
3586  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
3587  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
3588  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
3589  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
3590  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
3591  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
3592  for (i = 0; i < 4; i++) {
3593      const __m128i zero = _mm_setzero_si128();
3594      // Transpose 32x8 block to 8x32 block
3595      array_transpose_8x8(col+i*8, in);
3596      IDCT32_34
3597
3598      // 2_D: Calculate the results and store them to destination.
3599      in[0] = _mm_add_epi16(stp1_0, stp1_31);
3600      in[1] = _mm_add_epi16(stp1_1, stp1_30);
3601      in[2] = _mm_add_epi16(stp1_2, stp1_29);
3602      in[3] = _mm_add_epi16(stp1_3, stp1_28);
3603      in[4] = _mm_add_epi16(stp1_4, stp1_27);
3604      in[5] = _mm_add_epi16(stp1_5, stp1_26);
3605      in[6] = _mm_add_epi16(stp1_6, stp1_25);
3606      in[7] = _mm_add_epi16(stp1_7, stp1_24);
3607      in[8] = _mm_add_epi16(stp1_8, stp1_23);
3608      in[9] = _mm_add_epi16(stp1_9, stp1_22);
3609      in[10] = _mm_add_epi16(stp1_10, stp1_21);
3610      in[11] = _mm_add_epi16(stp1_11, stp1_20);
3611      in[12] = _mm_add_epi16(stp1_12, stp1_19);
3612      in[13] = _mm_add_epi16(stp1_13, stp1_18);
3613      in[14] = _mm_add_epi16(stp1_14, stp1_17);
3614      in[15] = _mm_add_epi16(stp1_15, stp1_16);
3615      in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3616      in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3617      in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3618      in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3619      in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3620      in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3621      in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3622      in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3623      in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3624      in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3625      in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3626      in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3627      in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3628      in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3629      in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3630      in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3631
3632      // Final rounding and shift
3633      in[0] = _mm_adds_epi16(in[0], final_rounding);
3634      in[1] = _mm_adds_epi16(in[1], final_rounding);
3635      in[2] = _mm_adds_epi16(in[2], final_rounding);
3636      in[3] = _mm_adds_epi16(in[3], final_rounding);
3637      in[4] = _mm_adds_epi16(in[4], final_rounding);
3638      in[5] = _mm_adds_epi16(in[5], final_rounding);
3639      in[6] = _mm_adds_epi16(in[6], final_rounding);
3640      in[7] = _mm_adds_epi16(in[7], final_rounding);
3641      in[8] = _mm_adds_epi16(in[8], final_rounding);
3642      in[9] = _mm_adds_epi16(in[9], final_rounding);
3643      in[10] = _mm_adds_epi16(in[10], final_rounding);
3644      in[11] = _mm_adds_epi16(in[11], final_rounding);
3645      in[12] = _mm_adds_epi16(in[12], final_rounding);
3646      in[13] = _mm_adds_epi16(in[13], final_rounding);
3647      in[14] = _mm_adds_epi16(in[14], final_rounding);
3648      in[15] = _mm_adds_epi16(in[15], final_rounding);
3649      in[16] = _mm_adds_epi16(in[16], final_rounding);
3650      in[17] = _mm_adds_epi16(in[17], final_rounding);
3651      in[18] = _mm_adds_epi16(in[18], final_rounding);
3652      in[19] = _mm_adds_epi16(in[19], final_rounding);
3653      in[20] = _mm_adds_epi16(in[20], final_rounding);
3654      in[21] = _mm_adds_epi16(in[21], final_rounding);
3655      in[22] = _mm_adds_epi16(in[22], final_rounding);
3656      in[23] = _mm_adds_epi16(in[23], final_rounding);
3657      in[24] = _mm_adds_epi16(in[24], final_rounding);
3658      in[25] = _mm_adds_epi16(in[25], final_rounding);
3659      in[26] = _mm_adds_epi16(in[26], final_rounding);
3660      in[27] = _mm_adds_epi16(in[27], final_rounding);
3661      in[28] = _mm_adds_epi16(in[28], final_rounding);
3662      in[29] = _mm_adds_epi16(in[29], final_rounding);
3663      in[30] = _mm_adds_epi16(in[30], final_rounding);
3664      in[31] = _mm_adds_epi16(in[31], final_rounding);
3665
3666      in[0] = _mm_srai_epi16(in[0], 6);
3667      in[1] = _mm_srai_epi16(in[1], 6);
3668      in[2] = _mm_srai_epi16(in[2], 6);
3669      in[3] = _mm_srai_epi16(in[3], 6);
3670      in[4] = _mm_srai_epi16(in[4], 6);
3671      in[5] = _mm_srai_epi16(in[5], 6);
3672      in[6] = _mm_srai_epi16(in[6], 6);
3673      in[7] = _mm_srai_epi16(in[7], 6);
3674      in[8] = _mm_srai_epi16(in[8], 6);
3675      in[9] = _mm_srai_epi16(in[9], 6);
3676      in[10] = _mm_srai_epi16(in[10], 6);
3677      in[11] = _mm_srai_epi16(in[11], 6);
3678      in[12] = _mm_srai_epi16(in[12], 6);
3679      in[13] = _mm_srai_epi16(in[13], 6);
3680      in[14] = _mm_srai_epi16(in[14], 6);
3681      in[15] = _mm_srai_epi16(in[15], 6);
3682      in[16] = _mm_srai_epi16(in[16], 6);
3683      in[17] = _mm_srai_epi16(in[17], 6);
3684      in[18] = _mm_srai_epi16(in[18], 6);
3685      in[19] = _mm_srai_epi16(in[19], 6);
3686      in[20] = _mm_srai_epi16(in[20], 6);
3687      in[21] = _mm_srai_epi16(in[21], 6);
3688      in[22] = _mm_srai_epi16(in[22], 6);
3689      in[23] = _mm_srai_epi16(in[23], 6);
3690      in[24] = _mm_srai_epi16(in[24], 6);
3691      in[25] = _mm_srai_epi16(in[25], 6);
3692      in[26] = _mm_srai_epi16(in[26], 6);
3693      in[27] = _mm_srai_epi16(in[27], 6);
3694      in[28] = _mm_srai_epi16(in[28], 6);
3695      in[29] = _mm_srai_epi16(in[29], 6);
3696      in[30] = _mm_srai_epi16(in[30], 6);
3697      in[31] = _mm_srai_epi16(in[31], 6);
3698
3699      RECON_AND_STORE(dest, in[0]);
3700      RECON_AND_STORE(dest, in[1]);
3701      RECON_AND_STORE(dest, in[2]);
3702      RECON_AND_STORE(dest, in[3]);
3703      RECON_AND_STORE(dest, in[4]);
3704      RECON_AND_STORE(dest, in[5]);
3705      RECON_AND_STORE(dest, in[6]);
3706      RECON_AND_STORE(dest, in[7]);
3707      RECON_AND_STORE(dest, in[8]);
3708      RECON_AND_STORE(dest, in[9]);
3709      RECON_AND_STORE(dest, in[10]);
3710      RECON_AND_STORE(dest, in[11]);
3711      RECON_AND_STORE(dest, in[12]);
3712      RECON_AND_STORE(dest, in[13]);
3713      RECON_AND_STORE(dest, in[14]);
3714      RECON_AND_STORE(dest, in[15]);
3715      RECON_AND_STORE(dest, in[16]);
3716      RECON_AND_STORE(dest, in[17]);
3717      RECON_AND_STORE(dest, in[18]);
3718      RECON_AND_STORE(dest, in[19]);
3719      RECON_AND_STORE(dest, in[20]);
3720      RECON_AND_STORE(dest, in[21]);
3721      RECON_AND_STORE(dest, in[22]);
3722      RECON_AND_STORE(dest, in[23]);
3723      RECON_AND_STORE(dest, in[24]);
3724      RECON_AND_STORE(dest, in[25]);
3725      RECON_AND_STORE(dest, in[26]);
3726      RECON_AND_STORE(dest, in[27]);
3727      RECON_AND_STORE(dest, in[28]);
3728      RECON_AND_STORE(dest, in[29]);
3729      RECON_AND_STORE(dest, in[30]);
3730      RECON_AND_STORE(dest, in[31]);
3731
3732      dest += 8 - (stride * 32);
3733    }
3734  }
3735
3736void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
3737                                 int stride) {
3738  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3739  const __m128i final_rounding = _mm_set1_epi16(1<<5);
3740
3741  // idct constants for each stage
3742  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3743  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3744  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3745  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3746  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3747  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3748  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3749  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3750  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3751  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3752  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3753  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3754  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3755  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3756  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3757  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3758
3759  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3760  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3761  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3762  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3763  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3764  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3765  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3766  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3767
3768  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3769  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3770  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3771  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3772  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3773  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3774  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3775  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3776  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3777  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3778
3779  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3780  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3781  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3782  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3783  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3784  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3785  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3786
3787  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3788
3789  __m128i in[32], col[128], zero_idx[16];
3790  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3791          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3792          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3793          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3794          stp1_30, stp1_31;
3795  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3796          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3797          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21,