/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vp9/common/x86/vp9_idct_intrin_sse2.h"

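// RECON_AND_STORE4X4 adds one row of 16-bit idct results (in_x) to four
// destination pixels: load 4 bytes, widen to 16 bits, add, saturate back to
// 8 bits with _mm_packus_epi16, store 4 bytes, then advance dest by stride.
// It expects `zero` and `stride` to be in scope at the call site.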
#define RECON_AND_STORE4X4(dest, in_x) \
{                                                     \
  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
  d0 = _mm_unpacklo_epi8(d0, zero); \
  d0 = _mm_add_epi16(in_x, d0); \
  d0 = _mm_packus_epi16(d0, d0); \
  *(int *)dest = _mm_cvtsi128_si32(d0); \
  dest += stride; \
}

void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = _mm_load_si128((const __m128i *)input);
  input2 = _mm_load_si128((const __m128i *)(input + 8));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0. The 0xd8 shuffle control
  // reorders each 4-lane half from (0, 1, 2, 3) to (0, 2, 1, 3).
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Swap the two 64-bit halves of input1 (columns 2 and 3), so that the
  // add/sub below leave input2 = (column 1, column 0) and
  // input3 = (column 2, column 3).
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Swap the two 64-bit halves of input1 (columns 2 and 3), so that the
  // add/sub below leave input2 = (column 1, column 0) and
  // input3 = (column 2, column 3).
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final rounding and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
         _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
                   *(const int *)(dest + stride * 3)), d2);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, input2);
    d2 = _mm_add_epi16(d2, input3);
    d0 = _mm_packus_epi16(d0, d2);
    // store row 0
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store row 1
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store row 3 (d2 was packed with rows 3 and 2 swapped)
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    // store row 2
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  }
}

void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

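  // DC-only block: the 2-D transform of a constant reduces to scaling
  // input[0] by cospi_16_64 twice (once per dimension, each with the usual
  // 14-bit rounding) and then applying the final >> 4 with rounding.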
  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
}

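// transpose_4x4: res[0] holds rows 0-1 and res[1] rows 2-3 of a 4x4 block;
// two rounds of 16-bit interleaving leave res[0] with columns 0-1 and
// res[1] with columns 2-3.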
static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
}

static void idct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[3], v[2]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[1]);
  in[1] = _mm_sub_epi16(u[0], u[1]);
  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}

static void iadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_4x4(in);
  in7 = _mm_srli_si128(in[1], 8);
  in7 = _mm_add_epi16(in7, in[0]);
  in7 = _mm_sub_epi16(in7, in[1]);

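  // The low half of in7 now holds s7 = x0 - x2 + x3 for each column; its
  // high half is unused (u[2] below interleaves the low half with zeros).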
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpackhi_epi16(in[0], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
}

void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[2];
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);

  in[0] = _mm_loadu_si128((const __m128i *)(input));
  in[1] = _mm_loadu_si128((const __m128i *)(input + 8));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_sse2(in);
      idct4_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_sse2(in);
      iadst4_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_sse2(in);
      idct4_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_sse2(in);
      iadst4_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
         _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(
                   *(const int *)(dest + stride * 3)));
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, in[0]);
    d2 = _mm_add_epi16(d2, in[1]);
    d0 = _mm_packus_epi16(d0, d2);
    // store result[0]
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store result[1]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store result[2]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
    // store result[3]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
  }
}

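// TRANSPOSE_8X8: standard three-stage 8x8 transpose of 16-bit elements,
// interleaving 16-bit, then 32-bit, then 64-bit units.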
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
                                                        \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
                                                            \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }

#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
                         out0, out1, out2, out3) \
  {                                              \
    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
  }

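// TRANSPOSE_8X8_10: transpose of the top-left 4x4 corner of four rows;
// out0 receives columns 0-1 and out1 columns 2-3, two columns per register.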
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
  {                                            \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
  }

// Multiply interleaved 16-bit pairs by the packed constants, round, shift
// by DCT_CONST_BITS, and pack the 32-bit results back to 16 bits:
// res = pack((x madd cst + DCT_CONST_ROUNDING) >> DCT_CONST_BITS).
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  {   \
      tmp0 = _mm_madd_epi16(lo_0, cst0); \
      tmp1 = _mm_madd_epi16(hi_0, cst0); \
      tmp2 = _mm_madd_epi16(lo_0, cst1); \
      tmp3 = _mm_madd_epi16(hi_0, cst1); \
      tmp4 = _mm_madd_epi16(lo_1, cst2); \
      tmp5 = _mm_madd_epi16(hi_1, cst2); \
      tmp6 = _mm_madd_epi16(lo_1, cst3); \
      tmp7 = _mm_madd_epi16(hi_1, cst3); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      tmp4 = _mm_add_epi32(tmp4, rounding); \
      tmp5 = _mm_add_epi32(tmp5, rounding); \
      tmp6 = _mm_add_epi32(tmp6, rounding); \
      tmp7 = _mm_add_epi32(tmp7, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
      \
      res0 = _mm_packs_epi32(tmp0, tmp1); \
      res1 = _mm_packs_epi32(tmp2, tmp3); \
      res2 = _mm_packs_epi32(tmp4, tmp5); \
      res3 = _mm_packs_epi32(tmp6, tmp7); \
  }

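// Two-output variant of MULTIPLICATION_AND_ADD for a single lo/hi pair.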
#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  {   \
      tmp0 = _mm_madd_epi16(lo_0, cst0); \
      tmp1 = _mm_madd_epi16(hi_0, cst0); \
      tmp2 = _mm_madd_epi16(lo_0, cst1); \
      tmp3 = _mm_madd_epi16(hi_0, cst1); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      \
      res0 = _mm_packs_epi32(tmp0, tmp1); \
      res1 = _mm_packs_epi32(tmp2, tmp3); \
  }

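// IDCT8: the four-stage 8-point inverse DCT. Stages 1-3 mix rounded
// multiply-add butterflies with saturating add/sub pairs; stage 4 combines
// the even and odd halves into the eight outputs. Expects the stg* constants
// and the stp*/tmp*/rounding locals to be in scope.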
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3, out4, out5, out6, out7)  \
  { \
  /* Stage1 */      \
  { \
    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                          stg1_1, stg1_2, stg1_3, stp1_4,      \
                          stp1_7, stp1_5, stp1_6)              \
  } \
    \
  /* Stage2 */ \
  { \
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_0,     \
                           stp2_1, stp2_2, stp2_3)             \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  } \
    \
  /* Stage3 */ \
  { \
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage4  */ \
  out0 = _mm_adds_epi16(stp1_0, stp2_7); \
  out1 = _mm_adds_epi16(stp1_1, stp1_6); \
  out2 = _mm_adds_epi16(stp1_2, stp1_5); \
  out3 = _mm_adds_epi16(stp1_3, stp2_4); \
  out4 = _mm_subs_epi16(stp1_3, stp2_4); \
  out5 = _mm_subs_epi16(stp1_2, stp1_5); \
  out6 = _mm_subs_epi16(stp1_1, stp1_6); \
  out7 = _mm_subs_epi16(stp1_0, stp2_7); \
  }

void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
                  in0, in1, in2, in3, in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
             in0, in1, in2, in3, in4, in5, in6, in7);
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
}

static void idct8_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                in0, in1, in2, in3, in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
           in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
}

static void iadst8_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0  = in[7];
  in1  = in[0];
  in2  = in[5];
  in3  = in[2];
  in4  = in[3];
  in5  = in[4];
  in6  = in[1];
  in7  = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}

void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);

  // load input data
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct8_sse2(in);
      idct8_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct8_sse2(in);
      iadst8_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst8_sse2(in);
      idct8_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst8_sse2(in);
      iadst8_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);
  in[4] = _mm_srai_epi16(in[4], 5);
  in[5] = _mm_srai_epi16(in[5], 5);
  in[6] = _mm_srai_epi16(in[6], 5);
  in[7] = _mm_srai_epi16(in[7], 5);

  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
}

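// Only the low-frequency coefficients are assumed nonzero here (the first 12
// in scan order, all within the top-left 4x4 corner), so only four input
// rows are loaded and the row pass operates on a 4x4 set.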
void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));

  // 8x4 Transpose
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
  // Stage1
  {  // NOLINT
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
  }

  // Stage2
  {  // NOLINT
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
    stp2_2 = _mm_packs_epi32(tmp6, tmp4);

    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);

    stp2_4 = tmp0;
    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage3
  {  // NOLINT
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);

    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);

  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
           in0, in1, in2, in3, in4, in5, in6, in7);
  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

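// IDCT16: stages 2-6 of the 16-point inverse DCT, operating on in[0..15] and
// leaving the stage-6 results in the stp1_*/stp2_* locals; the callers apply
// the final stage-7 butterflies. Expects the stg* constants and the
// tmp*/rounding locals to be in scope.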
#define IDCT16 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
    \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
    \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
    \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

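// IDCT16_10: same stage structure as IDCT16, specialized for inputs where
// only in[0..3] are nonzero; terms that would multiply zero rows are
// simplified or dropped.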
1196#define IDCT16_10 \
1197    /* Stage2 */ \
1198    { \
1199      const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
1200      const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
1201      const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
1202      const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
1203      \
1204      MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
1205                             stg2_0, stg2_1, stg2_6, stg2_7, \
1206                             stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
1207    } \
1208      \
1209    /* Stage3 */ \
1210    { \
1211      const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
1212      const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
1213      \
1214      MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
1215                               stg3_0, stg3_1,  \
1216                               stp2_4, stp2_7) \
1217      \
1218      stp1_9  =  stp1_8_0; \
1219      stp1_10 =  stp1_11;  \
1220      \
1221      stp1_13 = stp1_12_0; \
1222      stp1_14 = stp1_15;   \
1223    } \
1224    \
1225    /* Stage4 */ \
1226    { \
1227      const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
1228      const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
1229      \
1230      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
1231      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
1232      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1233      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1234      \
1235      MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
1236                               stg4_0, stg4_1, \
1237                               stp1_0, stp1_1) \
1238      stp2_5 = stp2_4; \
1239      stp2_6 = stp2_7; \
1240      \
1241      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
1242                             stg4_4, stg4_5, stg4_6, stg4_7, \
1243                             stp2_9, stp2_14, stp2_10, stp2_13) \
1244    } \
1245      \
1246    /* Stage5 */ \
1247    { \
1248      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1249      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1250      \
1251      stp1_2 = stp1_1; \
1252      stp1_3 = stp1_0; \
1253      \
1254      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1255      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1256      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1257      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
1258      \
1259      tmp0 = _mm_add_epi32(tmp0, rounding); \
1260      tmp1 = _mm_add_epi32(tmp1, rounding); \
1261      tmp2 = _mm_add_epi32(tmp2, rounding); \
1262      tmp3 = _mm_add_epi32(tmp3, rounding); \
1263      \
1264      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1265      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1266      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1267      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1268      \
1269      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1270      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1271      \
1272      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
1273      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
1274      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
1275      stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1276      \
1277      stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1278      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
1279      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
1280      stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
1281    } \
1282      \
1283    /* Stage6 */ \
1284    { \
1285      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1286      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1287      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1288      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1289      \
1290      stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1291      stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1292      stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1293      stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1294      stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1295      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1296      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1297      stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1298      \
1299      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1300                             stg6_0, stg4_0, stg6_0, stg4_0, \
1301                             stp2_10, stp2_13, stp2_11, stp2_12) \
1302    }
1303
1304void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
1305                                int stride) {
1306  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1307  const __m128i final_rounding = _mm_set1_epi16(1<<5);
1308  const __m128i zero = _mm_setzero_si128();
1309
1310  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1311  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1312  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1313  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1314  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1315  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1316  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1317  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1318
1319  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1320  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1321  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1322  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
1323
1324  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1325  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1326  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1327  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1328  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1329  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1330  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1331  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1332
1333  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1334
1335  __m128i in[16], l[16], r[16], *curr1;
1336  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1337          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1338          stp1_8_0, stp1_12_0;
1339  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1340          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1341  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1342  int i;
1343
1344  curr1 = l;
1345  for (i = 0; i < 2; i++) {
1346      // 1-D idct
1347
1348      // Load input data.
1349      in[0] = _mm_load_si128((const __m128i *)input);
1350      in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
1351      in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
1352      in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
1353      in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
1354      in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
1355      in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
1356      in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
1357      in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
1358      in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
1359      in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
1360      in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
1361      in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
1362      in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
1363      in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
1364      in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
1365
1366      array_transpose_8x8(in, in);
1367      array_transpose_8x8(in+8, in+8);
1368
1369      IDCT16
1370
1371      // Stage7
    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);

    curr1 = r;
    input += 128;
  }
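
  // Second pass: transform the columns of the intermediate results and
  // reconstruct an 8-pixel-wide strip of dest per iteration.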
  for (i = 0; i < 2; i++) {
    // 1-D idct
    array_transpose_8x8(l + i * 8, in);
    array_transpose_8x8(r + i * 8, in + 8);

    IDCT16

    // Stage7
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    // Final rounding and shift: ROUND_POWER_OF_TWO(x, 6), i.e. a
    // saturating add of 2^5 followed by an arithmetic shift right by 6.
    in[0] = _mm_adds_epi16(in[0], final_rounding);
    in[1] = _mm_adds_epi16(in[1], final_rounding);
    in[2] = _mm_adds_epi16(in[2], final_rounding);
    in[3] = _mm_adds_epi16(in[3], final_rounding);
    in[4] = _mm_adds_epi16(in[4], final_rounding);
    in[5] = _mm_adds_epi16(in[5], final_rounding);
    in[6] = _mm_adds_epi16(in[6], final_rounding);
    in[7] = _mm_adds_epi16(in[7], final_rounding);
    in[8] = _mm_adds_epi16(in[8], final_rounding);
    in[9] = _mm_adds_epi16(in[9], final_rounding);
    in[10] = _mm_adds_epi16(in[10], final_rounding);
    in[11] = _mm_adds_epi16(in[11], final_rounding);
    in[12] = _mm_adds_epi16(in[12], final_rounding);
    in[13] = _mm_adds_epi16(in[13], final_rounding);
    in[14] = _mm_adds_epi16(in[14], final_rounding);
    in[15] = _mm_adds_epi16(in[15], final_rounding);

    in[0] = _mm_srai_epi16(in[0], 6);
    in[1] = _mm_srai_epi16(in[1], 6);
    in[2] = _mm_srai_epi16(in[2], 6);
    in[3] = _mm_srai_epi16(in[3], 6);
    in[4] = _mm_srai_epi16(in[4], 6);
    in[5] = _mm_srai_epi16(in[5], 6);
    in[6] = _mm_srai_epi16(in[6], 6);
    in[7] = _mm_srai_epi16(in[7], 6);
    in[8] = _mm_srai_epi16(in[8], 6);
    in[9] = _mm_srai_epi16(in[9], 6);
    in[10] = _mm_srai_epi16(in[10], 6);
    in[11] = _mm_srai_epi16(in[11], 6);
    in[12] = _mm_srai_epi16(in[12], 6);
    in[13] = _mm_srai_epi16(in[13], 6);
    in[14] = _mm_srai_epi16(in[14], 6);
    in[15] = _mm_srai_epi16(in[15], 6);

    // RECON_AND_STORE adds each 8-wide residual row to the predictor at
    // dest, saturates to 8 bits, stores, and advances dest by stride.
    RECON_AND_STORE(dest, in[0]);
    RECON_AND_STORE(dest, in[1]);
    RECON_AND_STORE(dest, in[2]);
    RECON_AND_STORE(dest, in[3]);
    RECON_AND_STORE(dest, in[4]);
    RECON_AND_STORE(dest, in[5]);
    RECON_AND_STORE(dest, in[6]);
    RECON_AND_STORE(dest, in[7]);
    RECON_AND_STORE(dest, in[8]);
    RECON_AND_STORE(dest, in[9]);
    RECON_AND_STORE(dest, in[10]);
    RECON_AND_STORE(dest, in[11]);
    RECON_AND_STORE(dest, in[12]);
    RECON_AND_STORE(dest, in[13]);
    RECON_AND_STORE(dest, in[14]);
    RECON_AND_STORE(dest, in[15]);

    // Step from the bottom of this 8-column strip to the top of the next.
    dest += 8 - (stride * 16);
  }
}

void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest,
                              int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, i;

  // Only the DC coefficient contributes: apply the 1-D DC gain
  // (cospi_16_64) once per dimension, round, and add the resulting
  // constant to every pixel of the 16x16 block.
  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  // Reconstruct the left and then the right 8x16 half of the block.
  for (i = 0; i < 2; ++i) {
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    dest += 8 - (stride * 16);
  }
}

static void iadst16_8col(__m128i *in) {
  // perform 16x16 1-D ADST for 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

  // stage 1: interleave the input pairs (15,0), (13,2), ..., (1,14) for
  // the rotations below.
  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);
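  // s[0..15] now holds the stage 1 results, repacked to 16 bits.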

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);
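  // x[0..7] are the stage 2 butterfly sums and differences; x[8..15] are
  // the repacked stage 2 rotation results.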

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  // Final output permutation, with sign flips on outputs 1, 3, 13 and 15.
  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

static void idct16_8col(__m128i *in) {
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i v[16], u[16], s[16], t[16];

  // stage 1: reorder the inputs into butterfly order
  s[0] = in[0];
  s[1] = in[8];
  s[2] = in[4];
  s[3] = in[12];
  s[4] = in[2];
  s[5] = in[10];
  s[6] = in[6];
  s[7] = in[14];
  s[8] = in[1];
  s[9] = in[9];
  s[10] = in[5];
  s[11] = in[13];
  s[12] = in[3];
  s[13] = in[11];
  s[14] = in[7];
  s[15] = in[15];
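  // s[0..7] takes the even-indexed inputs and s[8..15] the odd-indexed
  // ones.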

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
  u[7] = _mm_unpackhi_epi16(s[11], s[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[8]  = _mm_packs_epi32(u[0], u[1]);
  s[15] = _mm_packs_epi32(u[2], u[3]);
  s[9]  = _mm_packs_epi32(u[4], u[5]);
  s[14] = _mm_packs_epi32(u[6], u[7]);
  s[10] = _mm_packs_epi32(u[8], u[9]);
  s[13] = _mm_packs_epi32(u[10], u[11]);
  s[11] = _mm_packs_epi32(u[12], u[13]);
  s[12] = _mm_packs_epi32(u[14], u[15]);
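  // s[8..15] now holds the stage 2 rotation results.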

  // stage 3
  t[0] = s[0];
  t[1] = s[1];
  t[2] = s[2];
  t[3] = s[3];
  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  u[3] = _mm_unpackhi_epi16(s[5], s[6]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[4] = _mm_packs_epi32(u[0], u[1]);
  t[7] = _mm_packs_epi32(u[2], u[3]);
  t[5] = _mm_packs_epi32(u[4], u[5]);
  t[6] = _mm_packs_epi32(u[6], u[7]);
  t[8] = _mm_add_epi16(s[8], s[9]);
  t[9] = _mm_sub_epi16(s[8], s[9]);
  t[10] = _mm_sub_epi16(s[11], s[10]);
  t[11] = _mm_add_epi16(s[10], s[11]);
  t[12] = _mm_add_epi16(s[12], s[13]);
  t[13] = _mm_sub_epi16(s[12], s[13]);
  t[14] = _mm_sub_epi16(s[15], s[14]);
  t[15] = _mm_add_epi16(s[14], s[15]);
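  // t[8..15] are stage 3 sums and differences of adjacent odd-half pairs.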

  // stage 4
  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  u[7] = _mm_unpackhi_epi16(t[10], t[13]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_add_epi16(t[4], t[5]);
  s[5] = _mm_sub_epi16(t[4], t[5]);
  s[6] = _mm_sub_epi16(t[7], t[6]);
  s[7] = _mm_add_epi16(t[6], t[7]);
  s[8] = t[8];
  s[15] = t[15];
  s[9]  = _mm_packs_epi32(u[8], u[9]);
  s[14] = _mm_packs_epi32(u[10], u[11]);
  s[10] = _mm_packs_epi32(u[12], u[13]);
  s[13] = _mm_packs_epi32(u[14], u[15]);
  s[11] = t[11];
  s[12] = t[12];

  // stage 5
  t[0] = _mm_add_epi16(s[0], s[3]);
  t[1] = _mm_add_epi16(s[1], s[2]);
  t[2] = _mm_sub_epi16(s[1], s[2]);
  t[3] = _mm_sub_epi16(s[0], s[3]);
  t[4] = s[4];
  t[7] = s[7];

  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  t[5] = _mm_packs_epi32(u[0], u[1]);
  t[6] = _mm_packs_epi32(u[2], u[3]);

  t[8] = _mm_add_epi16(s[8], s[11]);
  t[9] = _mm_add_epi16(s[9], s[10]);
  t[10] = _mm_sub_epi16(s[9], s[10]);
  t[11] = _mm_sub_epi16(s[8], s[11]);
  t[12] = _mm_sub_epi16(s[15], s[12]);
  t[13] = _mm_sub_epi16(s[14], s[13]);
  t[14] = _mm_add_epi16(s[13], s[14]);
  t[15] = _mm_add_epi16(s[12], s[15]);

  // stage 6
  s[0] = _mm_add_epi16(t[0], t[7]);
  s[1] = _mm_add_epi16(t[1], t[6]);
  s[2] = _mm_add_epi16(t[2], t[5]);
  s[3] = _mm_add_epi16(t[3], t[4]);
  s[4] = _mm_sub_epi16(t[3], t[4]);
  s[5] = _mm_sub_epi16(t[2], t[5]);
  s[6] = _mm_sub_epi16(t[1], t[6]);
  s[7] = _mm_sub_epi16(t[0], t[7]);
  s[8] = t[8];
  s[9] = t[9];

  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
  u[3] = _mm_unpackhi_epi16(t[11], t[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  s[10] = _mm_packs_epi32(u[0], u[1]);
  s[13] = _mm_packs_epi32(u[2], u[3]);
  s[11] = _mm_packs_epi32(u[4], u[5]);
  s[12] = _mm_packs_epi32(u[6], u[7]);
  s[14] = t[14];
  s[15] = t[15];

  // stage 7
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
}

// Each wrapper transposes the 16x16 block (held as two 8x16 halves) and
// applies its 1-D transform to each half; a pair of such calls produces
// the full 2-D inverse transform.
static void idct16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idct16_8col(in0);
  idct16_8col(in1);
}

static void iadst16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  iadst16_8col(in0);
  iadst16_8col(in1);
}

void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                               int tx_type) {
  __m128i in0[16], in1[16];

  load_buffer_8x16(input, in0);
  input += 8;
  load_buffer_8x16(input, in1);

  // Two 1-D passes give the 2-D inverse transform; each call transposes
  // first, so the second call works along the orthogonal dimension.
  switch (tx_type) {
    case 0:  // DCT_DCT
      idct16_sse2(in0, in1);
      idct16_sse2(in0, in1);
      break;
    case 1:  // ADST_DCT
      idct16_sse2(in0, in1);
      iadst16_sse2(in0, in1);
      break;
    case 2:  // DCT_ADST
      iadst16_sse2(in0, in1);
      idct16_sse2(in0, in1);
      break;
    case 3:  // ADST_ADST
      iadst16_sse2(in0, in1);
      iadst16_sse2(in0, in1);
      break;
    default:
      assert(0);
      break;
  }

  write_buffer_8x16(dest, in0, stride);
  dest += 8;
  write_buffer_8x16(dest, in1, stride);
}

void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  __m128i in[16], l[16];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;
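
  // At most the first 10 coefficients in scan order, all within the
  // top-left 4x4 of the block, are assumed nonzero in this path, so only
  // four partial rows are loaded for the first pass.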
  // First 1-D inverse DCT
  // Load input data.
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));

  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
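  // in[0] and in[1] now hold the transposed coefficients used by the
  // stages below.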

  // Stage2
  {
    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);

    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp2_8  = _mm_packs_epi32(tmp0, tmp2);
    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
  }

  // Stage3
  {
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);

    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  {
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);

    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
    stp2_10 = _mm_packs_epi32(tmp5, tmp7);

    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
  }

  // Stage5 and Stage6
  {
    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);

    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);

    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
  }

  // Stage6
  {
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);

    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);

    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_6 = _mm_packs_epi32(tmp3, tmp1);

    stp2_10 = _mm_packs_epi32(tmp0, zero);
    stp2_13 = _mm_packs_epi32(tmp2, zero);
    stp2_11 = _mm_packs_epi32(tmp4, zero);
    stp2_12 = _mm_packs_epi32(tmp6, zero);

    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);

    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage7. Left 8x16 only.
  l[0] = _mm_add_epi16(stp2_0, stp1_15);
  l[1] = _mm_add_epi16(stp2_1, stp1_14);
  l[2] = _mm_add_epi16(stp2_2, stp2_13);
  l[3] = _mm_add_epi16(stp2_3, stp2_12);
  l[4] = _mm_add_epi16(stp2_4, stp2_11);
  l[5] = _mm_add_epi16(stp2_5, stp2_10);
  l[6] = _mm_add_epi16(stp2_6, stp1_9);
  l[7] = _mm_add_epi16(stp2_7, stp1_8);
  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
  l[15] = _mm_sub_epi16(stp2_0, stp1_15);

  // Second 1-D inverse transform, performed per 8x16 block
  for (i = 0; i < 2; i++) {
    array_transpose_4X8(l + 8 * i, in);

    IDCT16_10

    // Stage7
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    // Final rounding and shift
    in[0] = _mm_adds_epi16(in[0], final_rounding);
    in[1] = _mm_adds_epi16(in[1], final_rounding);
    in[2] = _mm_adds_epi16(in[2], final_rounding);
    in[3] = _mm_adds_epi16(in[3], final_rounding);
    in[4] = _mm_adds_epi16(in[4], final_rounding);
    in[5] = _mm_adds_epi16(in[5], final_rounding);
    in[6] = _mm_adds_epi16(in[6], final_rounding);
    in[7] = _mm_adds_epi16(in[7], final_rounding);
    in[8] = _mm_adds_epi16(in[8], final_rounding);
    in[9] = _mm_adds_epi16(in[9], final_rounding);
    in[10] = _mm_adds_epi16(in[10], final_rounding);
    in[11] = _mm_adds_epi16(in[11], final_rounding);
    in[12] = _mm_adds_epi16(in[12], final_rounding);
    in[13] = _mm_adds_epi16(in[13], final_rounding);
    in[14] = _mm_adds_epi16(in[14], final_rounding);
    in[15] = _mm_adds_epi16(in[15], final_rounding);

    in[0] = _mm_srai_epi16(in[0], 6);
    in[1] = _mm_srai_epi16(in[1], 6);
    in[2] = _mm_srai_epi16(in[2], 6);
    in[3] = _mm_srai_epi16(in[3], 6);
    in[4] = _mm_srai_epi16(in[4], 6);
    in[5] = _mm_srai_epi16(in[5], 6);
    in[6] = _mm_srai_epi16(in[6], 6);
    in[7] = _mm_srai_epi16(in[7], 6);
    in[8] = _mm_srai_epi16(in[8], 6);
    in[9] = _mm_srai_epi16(in[9], 6);
    in[10] = _mm_srai_epi16(in[10], 6);
    in[11] = _mm_srai_epi16(in[11], 6);
    in[12] = _mm_srai_epi16(in[12], 6);
    in[13] = _mm_srai_epi16(in[13], 6);
    in[14] = _mm_srai_epi16(in[14], 6);
    in[15] = _mm_srai_epi16(in[15], 6);

    RECON_AND_STORE(dest, in[0]);
    RECON_AND_STORE(dest, in[1]);
    RECON_AND_STORE(dest, in[2]);
    RECON_AND_STORE(dest, in[3]);
    RECON_AND_STORE(dest, in[4]);
    RECON_AND_STORE(dest, in[5]);
    RECON_AND_STORE(dest, in[6]);
    RECON_AND_STORE(dest, in[7]);
    RECON_AND_STORE(dest, in[8]);
    RECON_AND_STORE(dest, in[9]);
    RECON_AND_STORE(dest, in[10]);
    RECON_AND_STORE(dest, in[11]);
    RECON_AND_STORE(dest, in[12]);
    RECON_AND_STORE(dest, in[13]);
    RECON_AND_STORE(dest, in[14]);
    RECON_AND_STORE(dest, in[15]);

    dest += 8 - (stride * 16);
  }
}

// Load eight dequantized coefficients into reg and advance the input
// pointer; consecutive invocations stream through a block.
#define LOAD_DQCOEFF(reg, input) \
  { \
    reg = _mm_load_si128((const __m128i *)input); \
    input += 8; \
  }

// IDCT32_34: 1-D 32-point IDCT for the reduced-coefficient case; only
// in[0..7] are referenced, the remaining inputs being treated as zero.
#define IDCT32_34 \
/* Stage1 */ \
{ \
  const __m128i zero = _mm_setzero_si128(); \
  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
  \
  const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
  \
  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
  \
  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
  \
  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
                         stg1_1, stp1_16, stp1_31); \
  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
                         stg1_7, stp1_19, stp1_28); \
  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
                         stg1_9, stp1_20, stp1_27); \
  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
                         stg1_15, stp1_23, stp1_24); \
} \
2678\
2679/* Stage2 */ \
2680{ \
2681  const __m128i zero = _mm_setzero_si128();\
2682  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
2683  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
2684  \
2685  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
2686  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
2687  \
2688  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
2689                         stg2_1, stp2_8, stp2_15); \
2690  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
2691                         stg2_7, stp2_11, stp2_12); \
2692  \
2693  stp2_16 = stp1_16; \
2694  stp2_19 = stp1_19; \
2695  \
2696  stp2_20 = stp1_20; \
2697  stp2_23 = stp1_23; \
2698  \
2699  stp2_24 = stp1_24; \
2700  stp2_27 = stp1_27; \
2701  \
2702  stp2_28 = stp1_28; \
2703  stp2_31 = stp1_31; \
2704} \
2705\
2706/* Stage3 */ \
2707{ \
2708  const __m128i zero = _mm_setzero_si128();\
2709  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
2710  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
2711  \
2712  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
2713  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
2714  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
2715  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
2716  \
2717  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
2718  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
2719  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2720  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
2721  \
2722  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
2723                         stg3_1, stp1_4, stp1_7); \
2724  \
2725  stp1_8 = stp2_8; \
2726  stp1_11 = stp2_11; \
2727  stp1_12 = stp2_12; \
2728  stp1_15 = stp2_15; \
2729  \
2730  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2731                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2732                         stp1_18, stp1_29) \
2733  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2734                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2735                         stp1_22, stp1_25) \
2736  \
2737  stp1_16 = stp2_16; \
2738  stp1_31 = stp2_31; \
2739  stp1_19 = stp2_19; \
2740  stp1_20 = stp2_20; \
2741  stp1_23 = stp2_23; \
2742  stp1_24 = stp2_24; \
2743  stp1_27 = stp2_27; \
2744  stp1_28 = stp2_28; \
2745} \
2746\
2747/* Stage4 */ \
2748{ \
2749  const __m128i zero = _mm_setzero_si128();\
2750  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
2751  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
2752  \
2753  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
2754  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
2755  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
2756  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
2757  \
2758  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
2759                         stg4_1, stp2_0, stp2_1); \
2760  \
2761  stp2_4 = stp1_4; \
2762  stp2_5 = stp1_4; \
2763  stp2_6 = stp1_7; \
2764  stp2_7 = stp1_7; \
2765  \
2766  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2767                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2768                         stp2_10, stp2_13) \
2769  \
2770  stp2_8 = stp1_8; \
2771  stp2_15 = stp1_15; \
2772  stp2_11 = stp1_11; \
2773  stp2_12 = stp1_12; \
2774  \
2775  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2776  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2777  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2778  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2779  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2780  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2781  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2782  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2783  \
2784  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2785  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2786  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2787  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2788  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2789  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2790  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2791  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2792} \
2793\
2794/* Stage5 */ \
2795{ \
2796  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2797  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2798  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2799  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2800  \
2801  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2802  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2803  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2804  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2805  \
2806  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2807  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2808  \
2809  stp1_0 = stp2_0; \
2810  stp1_1 = stp2_1; \
2811  stp1_2 = stp2_1; \
2812  stp1_3 = stp2_0; \
2813  \
2814  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2815  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2816  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2817  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2818  \
2819  tmp0 = _mm_add_epi32(tmp0, rounding); \
2820  tmp1 = _mm_add_epi32(tmp1, rounding); \
2821  tmp2 = _mm_add_epi32(tmp2, rounding); \
2822  tmp3 = _mm_add_epi32(tmp3, rounding); \
2823  \
2824  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2825  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2826  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2827  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2828  \
2829  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2830  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2831  \
2832  stp1_4 = stp2_4; \
2833  stp1_7 = stp2_7; \
2834  \
2835  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2836  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2837  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2838  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2839  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2840  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2841  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2842  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2843  \
2844  stp1_16 = stp2_16; \
2845  stp1_17 = stp2_17; \
2846  \
2847  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2848                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2849                         stp1_19, stp1_28) \
2850  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2851                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2852                         stp1_21, stp1_26) \
2853  \
2854  stp1_22 = stp2_22; \
2855  stp1_23 = stp2_23; \
2856  stp1_24 = stp2_24; \
2857  stp1_25 = stp2_25; \
2858  stp1_30 = stp2_30; \
2859  stp1_31 = stp2_31; \
2860} \
2861\
2862/* Stage6 */ \
2863{ \
2864  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2865  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2866  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2867  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2868  \
2869  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2870  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2871  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2872  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2873  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2874  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2875  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2876  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2877  \
2878  stp2_8 = stp1_8; \
2879  stp2_9 = stp1_9; \
2880  stp2_14 = stp1_14; \
2881  stp2_15 = stp1_15; \
2882  \
2883  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
2884                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
2885                         stp2_13, stp2_11, stp2_12) \
2886  \
2887  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2888  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2889  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2890  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2891  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2892  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2893  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2894  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2895  \
2896  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2897  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2898  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2899  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2900  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2901  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2902  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2903  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2904} \
2905\
2906/* Stage7 */ \
2907{ \
2908  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2909  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2910  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2911  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2912  \
2913  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2914  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2915  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
2916  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
2917  \
2918  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
2919  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
2920  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
2921  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
2922  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
2923  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
2924  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
2925  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
2926  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
2927  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
2928  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
2929  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
2930  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
2931  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
2932  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
2933  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
2934  \
2935  stp1_16 = stp2_16; \
2936  stp1_17 = stp2_17; \
2937  stp1_18 = stp2_18; \
2938  stp1_19 = stp2_19; \
2939  \
2940  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
2941                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
2942                         stp1_21, stp1_26) \
2943  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
2944                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
2945                         stp1_23, stp1_24) \
2946  \
2947  stp1_28 = stp2_28; \
2948  stp1_29 = stp2_29; \
2949  stp1_30 = stp2_30; \
2950  stp1_31 = stp2_31; \
2951}
2952
2953
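// Full 1-D 32-point IDCT: seven butterfly stages alternating between the
// stp1_* and stp2_* register sets, consuming all 32 input rows.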
#define IDCT32 \
/* Stage1 */ \
{ \
  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
  const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
  const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
  \
  const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
  const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
  const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
  const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
  \
  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
  const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
  const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
  \
  const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
  const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
  const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
  const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
  \
  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
                         stp1_17, stp1_30) \
  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
                         stp1_19, stp1_28) \
  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
                         stp1_23, stp1_24) \
} \
\
/* Stage2 */ \
{ \
  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
  const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
  const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
  \
  const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
  const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
  const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
  const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
  \
  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
                         stp2_14) \
  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
                         stp2_11, stp2_12) \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
  \
  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
  \
  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
  \
  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
} \
\
/* Stage3 */ \
{ \
  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
  const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
  const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
  \
  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
  \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
  \
  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
                         stp1_6) \
  \
  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  \
  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
                         stp1_18, stp1_29) \
  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
                         stp1_22, stp1_25) \
  \
  stp1_16 = stp2_16; \
  stp1_31 = stp2_31; \
  stp1_19 = stp2_19; \
  stp1_20 = stp2_20; \
  stp1_23 = stp2_23; \
  stp1_24 = stp2_24; \
  stp1_27 = stp2_27; \
  stp1_28 = stp2_28; \
} \
\
/* Stage4 */ \
{ \
  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
  \
  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  \
  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
                         stp2_2, stp2_3) \
  \
  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
  \
  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
                         stp2_10, stp2_13) \
  \
  stp2_8 = stp1_8; \
  stp2_15 = stp1_15; \
  stp2_11 = stp1_11; \
  stp2_12 = stp1_12; \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
  \
  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
} \
\
/* Stage5 */ \
{ \
  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
  \
  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  \
  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
  \
  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
  \
  tmp0 = _mm_add_epi32(tmp0, rounding); \
  tmp1 = _mm_add_epi32(tmp1, rounding); \
  tmp2 = _mm_add_epi32(tmp2, rounding); \
  tmp3 = _mm_add_epi32(tmp3, rounding); \
  \
  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
  \
  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  \
  stp1_4 = stp2_4; \
  stp1_7 = stp2_7; \
  \
  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
  \
  stp1_16 = stp2_16; \
  stp1_17 = stp2_17; \
  \
  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
                         stp1_19, stp1_28) \
  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  \
  stp1_22 = stp2_22; \
  stp1_23 = stp2_23; \
  stp1_24 = stp2_24; \
  stp1_25 = stp2_25; \
  stp1_30 = stp2_30; \
  stp1_31 = stp2_31; \
} \
\
/* Stage6 */ \
{ \
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
  \
  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
  \
  stp2_8 = stp1_8; \
  stp2_9 = stp1_9; \
  stp2_14 = stp1_14; \
  stp2_15 = stp1_15; \
  \
  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
                         stp2_13, stp2_11, stp2_12) \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
  \
  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
} \
\
/* Stage7 */ \
{ \
  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  \
  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
  \
  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
  \
  stp1_16 = stp2_16; \
  stp1_17 = stp2_17; \
  stp1_18 = stp2_18; \
  stp1_19 = stp2_19; \
  \
  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
                         stp1_23, stp1_24) \
  \
  stp1_28 = stp2_28; \
  stp1_29 = stp2_29; \
  stp1_30 = stp2_30; \
  stp1_31 = stp2_31; \
}

// Only the upper-left 8x8 block has non-zero coefficients.
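// One full IDCT32 pass covers the eight coefficient rows that carry them;
// the reduced IDCT32_34 then transforms each of the four 8x32 blocks.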
void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[32], col[32];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
          stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
          stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;
  // Load input data.
  LOAD_DQCOEFF(in[0], input);
  LOAD_DQCOEFF(in[8], input);
  LOAD_DQCOEFF(in[16], input);
  LOAD_DQCOEFF(in[24], input);
  LOAD_DQCOEFF(in[1], input);
  LOAD_DQCOEFF(in[9], input);
  LOAD_DQCOEFF(in[17], input);
  LOAD_DQCOEFF(in[25], input);
  LOAD_DQCOEFF(in[2], input);
  LOAD_DQCOEFF(in[10], input);
  LOAD_DQCOEFF(in[18], input);
  LOAD_DQCOEFF(in[26], input);
  LOAD_DQCOEFF(in[3], input);
  LOAD_DQCOEFF(in[11], input);
  LOAD_DQCOEFF(in[19], input);
  LOAD_DQCOEFF(in[27], input);

  LOAD_DQCOEFF(in[4], input);
  LOAD_DQCOEFF(in[12], input);
  LOAD_DQCOEFF(in[20], input);
  LOAD_DQCOEFF(in[28], input);
  LOAD_DQCOEFF(in[5], input);
  LOAD_DQCOEFF(in[13], input);
  LOAD_DQCOEFF(in[21], input);
  LOAD_DQCOEFF(in[29], input);
  LOAD_DQCOEFF(in[6], input);
  LOAD_DQCOEFF(in[14], input);
  LOAD_DQCOEFF(in[22], input);
  LOAD_DQCOEFF(in[30], input);
  LOAD_DQCOEFF(in[7], input);
  LOAD_DQCOEFF(in[15], input);
  LOAD_DQCOEFF(in[23], input);
  LOAD_DQCOEFF(in[31], input);

  array_transpose_8x8(in, in);
  array_transpose_8x8(in+8, in+8);
  array_transpose_8x8(in+16, in+16);
  array_transpose_8x8(in+24, in+24);

  IDCT32

  // 1-D: Store 32 intermediate results for each 8x32 block.
  col[0] = _mm_add_epi16(stp1_0, stp1_31);
  col[1] = _mm_add_epi16(stp1_1, stp1_30);
  col[2] = _mm_add_epi16(stp1_2, stp1_29);
  col[3] = _mm_add_epi16(stp1_3, stp1_28);
  col[4] = _mm_add_epi16(stp1_4, stp1_27);
  col[5] = _mm_add_epi16(stp1_5, stp1_26);
  col[6] = _mm_add_epi16(stp1_6, stp1_25);
  col[7] = _mm_add_epi16(stp1_7, stp1_24);
  col[8] = _mm_add_epi16(stp1_8, stp1_23);
  col[9] = _mm_add_epi16(stp1_9, stp1_22);
  col[10] = _mm_add_epi16(stp1_10, stp1_21);
  col[11] = _mm_add_epi16(stp1_11, stp1_20);
  col[12] = _mm_add_epi16(stp1_12, stp1_19);
  col[13] = _mm_add_epi16(stp1_13, stp1_18);
  col[14] = _mm_add_epi16(stp1_14, stp1_17);
  col[15] = _mm_add_epi16(stp1_15, stp1_16);
  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
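  // Second pass: run the reduced column transform on each 8x32 block and
  // reconstruct the corresponding eight destination columns.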
  for (i = 0; i < 4; i++) {
      const __m128i zero = _mm_setzero_si128();
      // Transpose 32x8 block to 8x32 block
      array_transpose_8x8(col+i*8, in);
      IDCT32_34

      // 2-D: Calculate the results and store them to the destination.
      in[0] = _mm_add_epi16(stp1_0, stp1_31);
      in[1] = _mm_add_epi16(stp1_1, stp1_30);
      in[2] = _mm_add_epi16(stp1_2, stp1_29);
      in[3] = _mm_add_epi16(stp1_3, stp1_28);
      in[4] = _mm_add_epi16(stp1_4, stp1_27);
      in[5] = _mm_add_epi16(stp1_5, stp1_26);
      in[6] = _mm_add_epi16(stp1_6, stp1_25);
      in[7] = _mm_add_epi16(stp1_7, stp1_24);
      in[8] = _mm_add_epi16(stp1_8, stp1_23);
      in[9] = _mm_add_epi16(stp1_9, stp1_22);
      in[10] = _mm_add_epi16(stp1_10, stp1_21);
      in[11] = _mm_add_epi16(stp1_11, stp1_20);
      in[12] = _mm_add_epi16(stp1_12, stp1_19);
      in[13] = _mm_add_epi16(stp1_13, stp1_18);
      in[14] = _mm_add_epi16(stp1_14, stp1_17);
      in[15] = _mm_add_epi16(stp1_15, stp1_16);
      in[16] = _mm_sub_epi16(stp1_15, stp1_16);
      in[17] = _mm_sub_epi16(stp1_14, stp1_17);
      in[18] = _mm_sub_epi16(stp1_13, stp1_18);
      in[19] = _mm_sub_epi16(stp1_12, stp1_19);
      in[20] = _mm_sub_epi16(stp1_11, stp1_20);
      in[21] = _mm_sub_epi16(stp1_10, stp1_21);
      in[22] = _mm_sub_epi16(stp1_9, stp1_22);
      in[23] = _mm_sub_epi16(stp1_8, stp1_23);
      in[24] = _mm_sub_epi16(stp1_7, stp1_24);
      in[25] = _mm_sub_epi16(stp1_6, stp1_25);
      in[26] = _mm_sub_epi16(stp1_5, stp1_26);
      in[27] = _mm_sub_epi16(stp1_4, stp1_27);
      in[28] = _mm_sub_epi16(stp1_3, stp1_28);
      in[29] = _mm_sub_epi16(stp1_2, stp1_29);
      in[30] = _mm_sub_epi16(stp1_1, stp1_30);
      in[31] = _mm_sub_epi16(stp1_0, stp1_31);

      // Final rounding and shift
      in[0] = _mm_adds_epi16(in[0], final_rounding);
      in[1] = _mm_adds_epi16(in[1], final_rounding);
      in[2] = _mm_adds_epi16(in[2], final_rounding);
      in[3] = _mm_adds_epi16(in[3], final_rounding);
      in[4] = _mm_adds_epi16(in[4], final_rounding);
      in[5] = _mm_adds_epi16(in[5], final_rounding);
      in[6] = _mm_adds_epi16(in[6], final_rounding);
      in[7] = _mm_adds_epi16(in[7], final_rounding);
      in[8] = _mm_adds_epi16(in[8], final_rounding);
      in[9] = _mm_adds_epi16(in[9], final_rounding);
      in[10] = _mm_adds_epi16(in[10], final_rounding);
      in[11] = _mm_adds_epi16(in[11], final_rounding);
      in[12] = _mm_adds_epi16(in[12], final_rounding);
      in[13] = _mm_adds_epi16(in[13], final_rounding);
      in[14] = _mm_adds_epi16(in[14], final_rounding);
      in[15] = _mm_adds_epi16(in[15], final_rounding);
      in[16] = _mm_adds_epi16(in[16], final_rounding);
      in[17] = _mm_adds_epi16(in[17], final_rounding);
      in[18] = _mm_adds_epi16(in[18], final_rounding);
      in[19] = _mm_adds_epi16(in[19], final_rounding);
      in[20] = _mm_adds_epi16(in[20], final_rounding);
      in[21] = _mm_adds_epi16(in[21], final_rounding);
      in[22] = _mm_adds_epi16(in[22], final_rounding);
      in[23] = _mm_adds_epi16(in[23], final_rounding);
      in[24] = _mm_adds_epi16(in[24], final_rounding);
      in[25] = _mm_adds_epi16(in[25], final_rounding);
      in[26] = _mm_adds_epi16(in[26], final_rounding);
      in[27] = _mm_adds_epi16(in[27], final_rounding);
      in[28] = _mm_adds_epi16(in[28], final_rounding);
      in[29] = _mm_adds_epi16(in[29], final_rounding);
      in[30] = _mm_adds_epi16(in[30], final_rounding);
      in[31] = _mm_adds_epi16(in[31], final_rounding);

      in[0] = _mm_srai_epi16(in[0], 6);
      in[1] = _mm_srai_epi16(in[1], 6);
      in[2] = _mm_srai_epi16(in[2], 6);
      in[3] = _mm_srai_epi16(in[3], 6);
      in[4] = _mm_srai_epi16(in[4], 6);
      in[5] = _mm_srai_epi16(in[5], 6);
      in[6] = _mm_srai_epi16(in[6], 6);
      in[7] = _mm_srai_epi16(in[7], 6);
      in[8] = _mm_srai_epi16(in[8], 6);
      in[9] = _mm_srai_epi16(in[9], 6);
      in[10] = _mm_srai_epi16(in[10], 6);
      in[11] = _mm_srai_epi16(in[11], 6);
      in[12] = _mm_srai_epi16(in[12], 6);
      in[13] = _mm_srai_epi16(in[13], 6);
      in[14] = _mm_srai_epi16(in[14], 6);
      in[15] = _mm_srai_epi16(in[15], 6);
      in[16] = _mm_srai_epi16(in[16], 6);
      in[17] = _mm_srai_epi16(in[17], 6);
      in[18] = _mm_srai_epi16(in[18], 6);
      in[19] = _mm_srai_epi16(in[19], 6);
      in[20] = _mm_srai_epi16(in[20], 6);
      in[21] = _mm_srai_epi16(in[21], 6);
      in[22] = _mm_srai_epi16(in[22], 6);
      in[23] = _mm_srai_epi16(in[23], 6);
      in[24] = _mm_srai_epi16(in[24], 6);
      in[25] = _mm_srai_epi16(in[25], 6);
      in[26] = _mm_srai_epi16(in[26], 6);
      in[27] = _mm_srai_epi16(in[27], 6);
      in[28] = _mm_srai_epi16(in[28], 6);
      in[29] = _mm_srai_epi16(in[29], 6);
      in[30] = _mm_srai_epi16(in[30], 6);
      in[31] = _mm_srai_epi16(in[31], 6);

      RECON_AND_STORE(dest, in[0]);
      RECON_AND_STORE(dest, in[1]);
      RECON_AND_STORE(dest, in[2]);
      RECON_AND_STORE(dest, in[3]);
      RECON_AND_STORE(dest, in[4]);
      RECON_AND_STORE(dest, in[5]);
      RECON_AND_STORE(dest, in[6]);
      RECON_AND_STORE(dest, in[7]);
      RECON_AND_STORE(dest, in[8]);
      RECON_AND_STORE(dest, in[9]);
      RECON_AND_STORE(dest, in[10]);
      RECON_AND_STORE(dest, in[11]);
      RECON_AND_STORE(dest, in[12]);
      RECON_AND_STORE(dest, in[13]);
      RECON_AND_STORE(dest, in[14]);
      RECON_AND_STORE(dest, in[15]);
      RECON_AND_STORE(dest, in[16]);
      RECON_AND_STORE(dest, in[17]);
      RECON_AND_STORE(dest, in[18]);
      RECON_AND_STORE(dest, in[19]);
      RECON_AND_STORE(dest, in[20]);
      RECON_AND_STORE(dest, in[21]);
      RECON_AND_STORE(dest, in[22]);
      RECON_AND_STORE(dest, in[23]);
      RECON_AND_STORE(dest, in[24]);
      RECON_AND_STORE(dest, in[25]);
      RECON_AND_STORE(dest, in[26]);
      RECON_AND_STORE(dest, in[27]);
      RECON_AND_STORE(dest, in[28]);
      RECON_AND_STORE(dest, in[29]);
      RECON_AND_STORE(dest, in[30]);
      RECON_AND_STORE(dest, in[31]);

      dest += 8 - (stride * 32);
  }
}

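// Full 32x32 inverse transform: all 1024 coefficients may be non-zero. The
// rows are processed in four groups of eight; a group whose coefficients are
// all zero is skipped.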
void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
                                 int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[32], col[128], zero_idx[16];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
          stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
          stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i, j, i32;

  for (i = 0; i < 4; i++) {
    i32 = (i << 5);
      // First 1-D idct
      // Load input data.
      LOAD_DQCOEFF(in[0], input);
      LOAD_DQCOEFF(in[8], input);
      LOAD_DQCOEFF(in[16], input);
      LOAD_DQCOEFF(in[24], input);
      LOAD_DQCOEFF(in[1], input);
      LOAD_DQCOEFF(in[9], input);
      LOAD_DQCOEFF(in[17], input);
      LOAD_DQCOEFF(in[25], input);
      LOAD_DQCOEFF(in[2], input);
      LOAD_DQCOEFF(in[10], input);
      LOAD_DQCOEFF(in[18], input);
      LOAD_DQCOEFF(in[26], input);
      LOAD_DQCOEFF(in[3], input);
      LOAD_DQCOEFF(in[11], input);
      LOAD_DQCOEFF(in[19], input);
      LOAD_DQCOEFF(in[27], input);

      LOAD_DQCOEFF(in[4], input);
      LOAD_DQCOEFF(in[12], input);
      LOAD_DQCOEFF(in[20], input);
      LOAD_DQCOEFF(in[28], input);
      LOAD_DQCOEFF(in[5], input);
      LOAD_DQCOEFF(in[13], input);
      LOAD_DQCOEFF(in[21], input);
      LOAD_DQCOEFF(in[29], input);
      LOAD_DQCOEFF(in[6], input);
      LOAD_DQCOEFF(in[14], input);
      LOAD_DQCOEFF(in[22], input);
      LOAD_DQCOEFF(in[30], input);
      LOAD_DQCOEFF(in[7], input);
      LOAD_DQCOEFF(in[15], input);
      LOAD_DQCOEFF(in[23], input);
      LOAD_DQCOEFF(in[31], input);

      // Check whether all coefficients in this group of eight rows are zero.
      zero_idx[0] = _mm_or_si128(in[0], in[1]);
      zero_idx[1] = _mm_or_si128(in[2], in[3]);
      zero_idx[2] = _mm_or_si128(in[4], in[5]);
      zero_idx[3] = _mm_or_si128(in[6], in[7]);
      zero_idx[4] = _mm_or_si128(in[8], in[9]);
      zero_idx[5] = _mm_or_si128(in[10], in[11]);
      zero_idx[6] = _mm_or_si128(in[12], in[13]);
      zero_idx[7] = _mm_or_si128(in[14], in[15]);
      zero_idx[8] = _mm_or_si128(in[16], in[17]);
      zero_idx[9] = _mm_or_si128(in[18], in[19]);
      zero_idx[10] = _mm_or_si128(in[20], in[21]);
      zero_idx[11] = _mm_or_si128(in[22], in[23]);
      zero_idx[12] = _mm_or_si128(in[24], in[25]);
      zero_idx[13] = _mm_or_si128(in[26], in[27]);
      zero_idx[14] = _mm_or_si128(in[28], in[29]);
      zero_idx[15] = _mm_or_si128(in[30], in[31]);

      zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
      zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
      zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
      zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
      zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
      zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
      zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
      zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);

      zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
      zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
      zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
      zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
      zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
      zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
      zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
      if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
        col[i32 + 0] = _mm_setzero_si128();
        col[i32 + 1] = _mm_setzero_si128();
        col[i32 + 2] = _mm_setzero_si128();
        col[i32 + 3] = _mm_setzero_si128();
        col[i32 + 4] = _mm_setzero_si128();
        col[i32 + 5] = _mm_setzero_si128();
        col[i32 + 6] = _mm_setzero_si128();
        col[i32 + 7] = _mm_setzero_si128();
        col[i32 + 8] = _mm_setzero_si128();
        col[i32 + 9] = _mm_setzero_si128();
        col[i32 + 10] = _mm_setzero_si128();
        col[i32 + 11] = _mm_setzero_si128();
        col[i32 + 12] = _mm_setzero_si128();
        col[i32 + 13] = _mm_setzero_si128();
        col[i32 + 14] = _mm_setzero_si128();
        col[i32 + 15] = _mm_setzero_si128();
        col[i32 + 16] = _mm_setzero_si128();
        col[i32 + 17] = _mm_setzero_si128();
        col[i32 + 18] = _mm_setzero_si128();
        col[i32 + 19] = _mm_setzero_si128();
        col[i32 + 20] = _mm_setzero_si128();
        col[i32 + 21] = _mm_setzero_si128();
        col[i32 + 22] = _mm_setzero_si128();
        col[i32 + 23] = _mm_setzero_si128();
        col[i32 + 24] = _mm_setzero_si128();
        col[i32 + 25] = _mm_setzero_si128();
        col[i32 + 26] = _mm_setzero_si128();
        col[i32 + 27] = _mm_setzero_si128();
        col[i32 + 28] = _mm_setzero_si128();
        col[i32 + 29] = _mm_setzero_si128();
        col[i32 + 30] = _mm_setzero_si128();
        col[i32 + 31] = _mm_setzero_si128();
        continue;
      }

      // Transpose 32x8 block to 8x32 block
      array_transpose_8x8(in, in);
      array_transpose_8x8(in+8, in+8);
      array_transpose_8x8(in+16, in+16);
      array_transpose_8x8(in+24, in+24);

      IDCT32

      // 1-D: Store 32 intermediate results for each 8x32 block.
      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
  }
  for (i = 0; i < 4; i++) {
      // Second 1-D idct
      j = i <<