/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"

#define RECON_AND_STORE4X4(dest, in_x) \
  { \
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
    d0 = _mm_unpacklo_epi8(d0, zero); \
    d0 = _mm_add_epi16(in_x, d0); \
    d0 = _mm_packus_epi16(d0, d0); \
    *(int *)dest = _mm_cvtsi128_si32(d0); \
    dest += stride; \
  }

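// Full 4x4 inverse DCT plus reconstruction. Each row of the 4x4 block
// occupies four 16-bit lanes, so the whole block fits in two __m128i
// registers. The 1-D transform is computed with _mm_madd_epi16 on
// interleaved coefficient pairs: each 32-bit result lane is
//   even_lane * c0 + odd_lane * c1,
// i.e. one butterfly output before the DCT_CONST_BITS rounding shift.
// The same pass runs twice (rows, then columns), followed by the final
// (x + 8) >> 4 rounding and the add-to-prediction store.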
void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = _mm_load_si128((const __m128i *)input);
  input2 = _mm_load_si128((const __m128i *)(input + 8));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch column 2 and column 3; after that we have:
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch column 2 and column 3; after that we have:
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(
        d0, _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, input2);
    d2 = _mm_add_epi16(d2, input3);
    d0 = _mm_packus_epi16(d0, d2);
    // Store row 0.
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // Store row 1.
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // Store row 3 (d2 was packed with rows 3 and 2 swapped).
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    // Store row 2.
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  }
}

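// DC-only path: with a single nonzero (DC) coefficient, both 1-D passes
// reduce to scaling by cospi_16_64, so the whole transform collapses to one
// constant that is broadcast and added to all 16 prediction pixels.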
void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
}

static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}

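// One 1-D 4-point IDCT pass over four rows held in the low 64 bits of
// in[0..3]. The input is transposed first, so calling this twice performs
// the full 2-D transform (rows, then columns).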
static void idct4_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[2]);
  u[1] = _mm_packs_epi32(v[1], v[3]);
  u[2] = _mm_unpackhi_epi64(u[0], u[0]);
  u[3] = _mm_unpackhi_epi64(u[1], u[1]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[3]);
  in[1] = _mm_add_epi16(u[1], u[2]);
  in[2] = _mm_sub_epi16(u[1], u[2]);
  in[3] = _mm_sub_epi16(u[0], u[3]);
}

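// One 1-D 4-point ADST (sine transform) pass, with the same calling
// convention as idct4_1d_sse2: transpose, then butterflies built from the
// sinpi_*_9 constants. in7 holds the in[0] + in[3] - in[2] combination that
// feeds output row 2.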
static void iadst4_1d_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_4x4(in);
  in7 = _mm_add_epi16(in[0], in[3]);
  in7 = _mm_sub_epi16(in7, in[2]);

  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[1], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  in[2] = _mm_unpackhi_epi64(in[0], in[0]);
  in[3] = _mm_unpackhi_epi64(in[1], in[1]);
}

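// 4x4 hybrid inverse transform. tx_type selects DCT or ADST independently
// for each dimension; since every 1-D helper transposes its input, the two
// calls cover rows and then columns. The epilogue is the same
// (x + 8) >> 4 rounding and reconstruction as the pure-DCT path.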
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[4];
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);

  in[0] = _mm_loadl_epi64((const __m128i *)input);
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);
  in[2] = _mm_add_epi16(in[2], eight);
  in[3] = _mm_add_epi16(in[3], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);
  in[2] = _mm_srai_epi16(in[2], 4);
  in[3] = _mm_srai_epi16(in[3], 4);

  RECON_AND_STORE4X4(dest, in[0]);
  RECON_AND_STORE4X4(dest, in[1]);
  RECON_AND_STORE4X4(dest, in[2]);
  RECON_AND_STORE4X4(dest, in[3]);
}

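// In-register transposes built from unpack instructions: interleaving
// 16-bit, then 32-bit, then 64-bit elements of register pairs turns an 8x8
// matrix of int16 into its transpose in three stages. TRANSPOSE_4X8 expects
// a `zero` register in scope and zeroes the unused outputs; TRANSPOSE_8X4
// writes its results back into the inN registers (its outN parameters are
// unused).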
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
                                                        \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
                                                            \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }

#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
                                                        \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
                                                            \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = out5 = out6 = out7 = zero; \
  }

#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                     \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
                                                        \
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
    in2 = _mm_unpacklo_epi32(tr0_2, tr0_3);  /* i5 i4 */  \
    in3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */  \
  }

// Multiply pairs of 16-bit elements by interleaved cosine constants with
// _mm_madd_epi16, add DCT_CONST_ROUNDING, shift right by DCT_CONST_BITS,
// and pack the 32-bit results back to 16 bits: four butterfly outputs per
// invocation. tmp0..tmp7 and rounding must exist in the enclosing scope.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  {   \
      tmp0 = _mm_madd_epi16(lo_0, cst0); \
      tmp1 = _mm_madd_epi16(hi_0, cst0); \
      tmp2 = _mm_madd_epi16(lo_0, cst1); \
      tmp3 = _mm_madd_epi16(hi_0, cst1); \
      tmp4 = _mm_madd_epi16(lo_1, cst2); \
      tmp5 = _mm_madd_epi16(hi_1, cst2); \
      tmp6 = _mm_madd_epi16(lo_1, cst3); \
      tmp7 = _mm_madd_epi16(hi_1, cst3); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      tmp4 = _mm_add_epi32(tmp4, rounding); \
      tmp5 = _mm_add_epi32(tmp5, rounding); \
      tmp6 = _mm_add_epi32(tmp6, rounding); \
      tmp7 = _mm_add_epi32(tmp7, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
      \
      res0 = _mm_packs_epi32(tmp0, tmp1); \
      res1 = _mm_packs_epi32(tmp2, tmp3); \
      res2 = _mm_packs_epi32(tmp4, tmp5); \
      res3 = _mm_packs_epi32(tmp6, tmp7); \
  }

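// One 1-D 8-point IDCT over in0..in7 (a full 8x8 tile), written as the
// standard 4-stage butterfly network: stages 1-2 build the odd and even
// halves, stage 3 recombines them, and stage 4 forms the final sums and
// differences. It expects the stg*, stp*, tmp*, and rounding names from
// the enclosing scope. Saturating adds (_mm_adds_epi16) guard the 16-bit
// lanes against overflow.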
#define IDCT8_1D  \
  /* Stage1 */      \
  { \
    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                          stg1_1, stg1_2, stg1_3, stp1_4,      \
                          stp1_7, stp1_5, stp1_6)              \
  } \
    \
  /* Stage2 */ \
  { \
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_0,     \
                           stp2_1, stp2_2, stp2_3)             \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  } \
    \
  /* Stage3 */ \
  { \
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage4  */ \
  in0 = _mm_adds_epi16(stp1_0, stp2_7); \
  in1 = _mm_adds_epi16(stp1_1, stp1_6); \
  in2 = _mm_adds_epi16(stp1_2, stp1_5); \
  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
  in4 = _mm_subs_epi16(stp1_3, stp2_4); \
  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

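// Reconstruct one 8-pixel row: load 8 prediction bytes, widen to 16 bits,
// add the residual, saturate back to bytes, store, and advance dest by one
// stride.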
#define RECON_AND_STORE(dest, in_x) \
  { \
    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
    d0 = _mm_unpacklo_epi8(d0, zero); \
    d0 = _mm_add_epi16(in_x, d0); \
    d0 = _mm_packus_epi16(d0, d0); \
    _mm_storel_epi64((__m128i *)(dest), d0); \
    dest += stride; \
  }

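// Full 8x8 inverse DCT ("64" = all 64 coefficients may be nonzero). The
// 2-D transform is two transpose + IDCT8_1D passes over the same
// registers, followed by (x + 16) >> 5 rounding and reconstruction.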
void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                  in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8_1D
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
}

// Perform an 8x8 in-register transpose.
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);

  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}

static void idct8_1d_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  in0 = in[0];
  in1 = in[1];
  in2 = in[2];
  in3 = in[3];
  in4 = in[4];
  in5 = in[5];
  in6 = in[6];
  in7 = in[7];

  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
  TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8_1D
  in[0] = in0;
  in[1] = in1;
  in[2] = in2;
  in[3] = in3;
  in[4] = in4;
  in[5] = in5;
  in[6] = in6;
  in[7] = in7;
}

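// One 1-D 8-point ADST pass. The input rows are transposed, reordered into
// butterfly order (in[7], in[0], in[5], ...), then run through three stages
// of cospi-based butterflies; the odd-indexed outputs are negated in the
// final assignments, matching the scalar iadst8 reference.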
static void iadst8_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // Rearrange rows into butterfly input order.
  in0  = in[7];
  in1  = in[0];
  in2  = in[5];
  in3  = in[2];
  in4  = in[3];
  in5  = in[4];
  in6  = in[1];
  in7  = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integers
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}

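// 8x8 hybrid inverse transform: as in the 4x4 case, tx_type picks DCT or
// ADST per dimension, each 1-D helper transposes internally, and the
// epilogue matches vp9_idct8x8_64_add_sse2.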
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);

  // load input data
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);
  in[4] = _mm_srai_epi16(in[4], 5);
  in[5] = _mm_srai_epi16(in[5], 5);
  in[6] = _mm_srai_epi16(in[6], 5);
  in[7] = _mm_srai_epi16(in[7], 5);

  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
}

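// Sparse 8x8 inverse DCT for blocks where only the first 10 coefficients
// in scan order (confined to the top-left 4x4 quadrant) can be nonzero:
// only four input rows are loaded, the row pass works on 4x8 data with
// zero-aware shortcuts, and only the column pass runs the full IDCT8_1D.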
void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));

  // 8x4 Transpose
  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)

  // Stage1
  {  // NOLINT
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, zero);
    stp1_7 = _mm_packs_epi32(tmp2, zero);
    stp1_5 = _mm_packs_epi32(tmp4, zero);
    stp1_6 = _mm_packs_epi32(tmp6, zero);
  }

  // Stage2
  {  // NOLINT
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, zero);
    stp2_1 = _mm_packs_epi32(tmp2, zero);
    stp2_2 = _mm_packs_epi32(tmp4, zero);
    stp2_3 = _mm_packs_epi32(tmp6, zero);

    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);
  }

  // Stage3
  {  // NOLINT
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, zero);
    stp1_6 = _mm_packs_epi32(tmp2, zero);
  }

  // Stage4
  in0 = _mm_adds_epi16(stp1_0, stp2_7);
  in1 = _mm_adds_epi16(stp1_1, stp1_6);
  in2 = _mm_adds_epi16(stp1_2, stp1_5);
  in3 = _mm_adds_epi16(stp1_3, stp2_4);
  in4 = _mm_subs_epi16(stp1_3, stp2_4);
  in5 = _mm_subs_epi16(stp1_2, stp1_5);
  in6 = _mm_subs_epi16(stp1_1, stp1_6);
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

  // Columns. 4x8 Transpose
  TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                in4, in5, in6, in7)

  // 1D idct8x8
  IDCT8_1D

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

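// One 1-D 16-point IDCT over in0..in15, covering stages 2-6 of the
// butterfly network. Stage 7, the final sums and differences, is done by
// the caller so it can fold in the row/column bookkeeping.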
#define IDCT16_1D \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7);   \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7);   \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
    \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
    \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
    \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

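// Full 16x16 inverse DCT, processed as four 8x16 half-block passes:
// iterations 0-1 run the row transform on the top and bottom eight rows
// (results parked in the l* and r* registers), iterations 2-3 transpose
// those results and run the column transform on the left and right eight
// columns, rounding with (x + 32) >> 6 and reconstructing as they go.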
1238void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
1239                                int stride) {
1240  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1241  const __m128i final_rounding = _mm_set1_epi16(1<<5);
1242  const __m128i zero = _mm_setzero_si128();
1243
1244  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1245  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1246  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1247  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1248  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1249  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1250  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1251  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1252
1253  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1254  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1255  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1256  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
1257
1258  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1259  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1260  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1261  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1262  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1263  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1264  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1265  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1266
1267  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1268
1269  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
1270          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
1271          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
1272          in14 = zero, in15 = zero;
1273  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
1274          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
1275          l12 = zero, l13 = zero, l14 = zero, l15 = zero;
1276  __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
1277          r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
1278          r12 = zero, r13 = zero, r14 = zero, r15 = zero;
1279  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1280          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1281          stp1_8_0, stp1_12_0;
1282  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1283          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1284  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1285  int i;
1286
1287  // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct.
1288  for (i = 0; i < 4; i++) {
1289    // 1-D idct
1290    if (i < 2) {
1291      if (i == 1) input += 128;
1292
1293      // Load input data.
1294      in0 = _mm_load_si128((const __m128i *)input);
1295      in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
1296      in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
1297      in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
1298      in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
1299      in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
1300      in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
1301      in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
1302      in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
1303      in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
1304      in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
1305      in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
1306      in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
1307      in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
1308      in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
1309      in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));
1310
1311      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
1312                    in4, in5, in6, in7);
1313      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
1314                    in10, in11, in12, in13, in14, in15);
1315    }
1316
1317    if (i == 2) {
1318      TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
1319                    in5, in6, in7);
1320      TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
1321                    in13, in14, in15);
1322    }
1323
1324    if (i == 3) {
1325      TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
1326                    in4, in5, in6, in7);
1327      TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,
1328                    in12, in13, in14, in15);
1329    }
1330
    IDCT16_1D

    // Stage7
    if (i == 0) {
      // Left 8x16
      l0 = _mm_add_epi16(stp2_0, stp1_15);
      l1 = _mm_add_epi16(stp2_1, stp1_14);
      l2 = _mm_add_epi16(stp2_2, stp2_13);
      l3 = _mm_add_epi16(stp2_3, stp2_12);
      l4 = _mm_add_epi16(stp2_4, stp2_11);
      l5 = _mm_add_epi16(stp2_5, stp2_10);
      l6 = _mm_add_epi16(stp2_6, stp1_9);
      l7 = _mm_add_epi16(stp2_7, stp1_8);
      l8 = _mm_sub_epi16(stp2_7, stp1_8);
      l9 = _mm_sub_epi16(stp2_6, stp1_9);
      l10 = _mm_sub_epi16(stp2_5, stp2_10);
      l11 = _mm_sub_epi16(stp2_4, stp2_11);
      l12 = _mm_sub_epi16(stp2_3, stp2_12);
      l13 = _mm_sub_epi16(stp2_2, stp2_13);
      l14 = _mm_sub_epi16(stp2_1, stp1_14);
      l15 = _mm_sub_epi16(stp2_0, stp1_15);
    } else if (i == 1) {
      // Right 8x16
      r0 = _mm_add_epi16(stp2_0, stp1_15);
      r1 = _mm_add_epi16(stp2_1, stp1_14);
      r2 = _mm_add_epi16(stp2_2, stp2_13);
      r3 = _mm_add_epi16(stp2_3, stp2_12);
      r4 = _mm_add_epi16(stp2_4, stp2_11);
      r5 = _mm_add_epi16(stp2_5, stp2_10);
      r6 = _mm_add_epi16(stp2_6, stp1_9);
      r7 = _mm_add_epi16(stp2_7, stp1_8);
      r8 = _mm_sub_epi16(stp2_7, stp1_8);
      r9 = _mm_sub_epi16(stp2_6, stp1_9);
      r10 = _mm_sub_epi16(stp2_5, stp2_10);
      r11 = _mm_sub_epi16(stp2_4, stp2_11);
      r12 = _mm_sub_epi16(stp2_3, stp2_12);
      r13 = _mm_sub_epi16(stp2_2, stp2_13);
      r14 = _mm_sub_epi16(stp2_1, stp1_14);
      r15 = _mm_sub_epi16(stp2_0, stp1_15);
    } else {
      // 2-D: the second pass is done; round, shift and reconstruct.
      in0 = _mm_add_epi16(stp2_0, stp1_15);
      in1 = _mm_add_epi16(stp2_1, stp1_14);
      in2 = _mm_add_epi16(stp2_2, stp2_13);
      in3 = _mm_add_epi16(stp2_3, stp2_12);
      in4 = _mm_add_epi16(stp2_4, stp2_11);
      in5 = _mm_add_epi16(stp2_5, stp2_10);
      in6 = _mm_add_epi16(stp2_6, stp1_9);
      in7 = _mm_add_epi16(stp2_7, stp1_8);
      in8 = _mm_sub_epi16(stp2_7, stp1_8);
      in9 = _mm_sub_epi16(stp2_6, stp1_9);
      in10 = _mm_sub_epi16(stp2_5, stp2_10);
      in11 = _mm_sub_epi16(stp2_4, stp2_11);
      in12 = _mm_sub_epi16(stp2_3, stp2_12);
      in13 = _mm_sub_epi16(stp2_2, stp2_13);
      in14 = _mm_sub_epi16(stp2_1, stp1_14);
      in15 = _mm_sub_epi16(stp2_0, stp1_15);

      // Final rounding and shift: (x + 32) >> 6, removing the 2^6 gain
      // carried by the two transform passes.
      in0 = _mm_adds_epi16(in0, final_rounding);
      in1 = _mm_adds_epi16(in1, final_rounding);
      in2 = _mm_adds_epi16(in2, final_rounding);
      in3 = _mm_adds_epi16(in3, final_rounding);
      in4 = _mm_adds_epi16(in4, final_rounding);
      in5 = _mm_adds_epi16(in5, final_rounding);
      in6 = _mm_adds_epi16(in6, final_rounding);
      in7 = _mm_adds_epi16(in7, final_rounding);
      in8 = _mm_adds_epi16(in8, final_rounding);
      in9 = _mm_adds_epi16(in9, final_rounding);
      in10 = _mm_adds_epi16(in10, final_rounding);
      in11 = _mm_adds_epi16(in11, final_rounding);
      in12 = _mm_adds_epi16(in12, final_rounding);
      in13 = _mm_adds_epi16(in13, final_rounding);
      in14 = _mm_adds_epi16(in14, final_rounding);
      in15 = _mm_adds_epi16(in15, final_rounding);

      in0 = _mm_srai_epi16(in0, 6);
      in1 = _mm_srai_epi16(in1, 6);
      in2 = _mm_srai_epi16(in2, 6);
      in3 = _mm_srai_epi16(in3, 6);
      in4 = _mm_srai_epi16(in4, 6);
      in5 = _mm_srai_epi16(in5, 6);
      in6 = _mm_srai_epi16(in6, 6);
      in7 = _mm_srai_epi16(in7, 6);
      in8 = _mm_srai_epi16(in8, 6);
      in9 = _mm_srai_epi16(in9, 6);
      in10 = _mm_srai_epi16(in10, 6);
      in11 = _mm_srai_epi16(in11, 6);
      in12 = _mm_srai_epi16(in12, 6);
      in13 = _mm_srai_epi16(in13, 6);
      in14 = _mm_srai_epi16(in14, 6);
      in15 = _mm_srai_epi16(in15, 6);

      RECON_AND_STORE(dest, in0);
      RECON_AND_STORE(dest, in1);
      RECON_AND_STORE(dest, in2);
      RECON_AND_STORE(dest, in3);
      RECON_AND_STORE(dest, in4);
      RECON_AND_STORE(dest, in5);
      RECON_AND_STORE(dest, in6);
      RECON_AND_STORE(dest, in7);
      RECON_AND_STORE(dest, in8);
      RECON_AND_STORE(dest, in9);
      RECON_AND_STORE(dest, in10);
      RECON_AND_STORE(dest, in11);
      RECON_AND_STORE(dest, in12);
      RECON_AND_STORE(dest, in13);
      RECON_AND_STORE(dest, in14);
      RECON_AND_STORE(dest, in15);

      dest += 8 - (stride * 16);
    }
  }
}
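
// Illustrative sketch, not compiled: the scalar work each RECON_AND_STORE
// invocation above performs for one 8-pixel row -- widen the prediction,
// add the 16-bit residual, and saturate back to 8 bits. Uses clip_pixel()
// from vp9_common.h; the saturation matches the _mm_packus_epi16 clamp.
#if 0
static void recon_and_store_row_sketch(uint8_t *dest, const int16_t *res) {
  int c;
  for (c = 0; c < 8; ++c)
    dest[c] = clip_pixel(dest[c] + res[c]);
}
#endif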

void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, i;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  for (i = 0; i < 2; ++i) {
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    dest += 8 - (stride * 16);
  }
}
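
// Plain-C sketch, not compiled, of what the DC-only path above computes:
// when the only nonzero coefficient is DC, the 2-D idct reduces to a
// single value added to every pixel. dct_const_round_shift() and
// ROUND_POWER_OF_TWO() are the same helpers used above; clip_pixel() is
// from vp9_common.h.
#if 0
static void idct16x16_1_add_c_sketch(const int16_t *input, uint8_t *dest,
                                     int stride) {
  int r, c;
  int a = dct_const_round_shift(input[0] * cospi_16_64);  // row pass gain
  a = dct_const_round_shift(a * cospi_16_64);             // column pass gain
  a = ROUND_POWER_OF_TWO(a, 6);                           // output scaling
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c)
      dest[c] = clip_pixel(dest[c] + a);
    dest += stride;
  }
}
#endif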

static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
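  // res0 holds the left eight columns of the 16x16 block, res1 the right
  // eight: transpose each 8x8 quadrant and swap the two off-diagonal
  // quadrants, parking the transposed top-right quadrant in tbuf while
  // the bottom-left one lands in res1.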
  __m128i tbuf[8];
  array_transpose_8x8(res0, res0);
  array_transpose_8x8(res1, tbuf);
  array_transpose_8x8(res0 + 8, res1);
  array_transpose_8x8(res1 + 8, res1 + 8);

  res0[8] = tbuf[0];
  res0[9] = tbuf[1];
  res0[10] = tbuf[2];
  res0[11] = tbuf[3];
  res0[12] = tbuf[4];
  res0[13] = tbuf[5];
  res0[14] = tbuf[6];
  res0[15] = tbuf[7];
}

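// Recurring primitive in the transforms below, shown once as an
// illustrative, non-compiled sketch: interleave two 16-bit rows, multiply-
// accumulate against a pair_set_epi16(c0, c1) constant so each 32-bit lane
// holds a * c0 + b * c1, then round, shift and saturate back to 16 bits.
#if 0
static INLINE __m128i butterfly_rotation_sketch(__m128i a, __m128i b,
                                                __m128i cst) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i lo = _mm_unpacklo_epi16(a, b);
  const __m128i hi = _mm_unpackhi_epi16(a, b);
  __m128i t0 = _mm_madd_epi16(lo, cst);  // a * c0 + b * c1 per 32-bit lane
  __m128i t1 = _mm_madd_epi16(hi, cst);
  t0 = _mm_srai_epi32(_mm_add_epi32(t0, rounding), DCT_CONST_BITS);
  t1 = _mm_srai_epi32(_mm_add_epi32(t1, rounding), DCT_CONST_BITS);
  return _mm_packs_epi32(t0, t1);  // saturating repack to 16 bits
}
#endif
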
static void iadst16_1d_8col(__m128i *in) {
  // Perform a 16-point 1-D ADST on the 8 columns held in in[0..15].
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

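  // Stage 1: interleave the inputs as (in[15 - 2k], in[2k]) pairs, the
  // input permutation the ADST rotations below expect.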
  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

static void idct16_1d_8col(__m128i *in) {
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i v[16], u[16], s[16], t[16];

  // stage 1
  s[0] = in[0];
  s[1] = in[8];
  s[2] = in[4];
  s[3] = in[12];
  s[4] = in[2];
  s[5] = in[10];
  s[6] = in[6];
  s[7] = in[14];
  s[8] = in[1];
  s[9] = in[9];
  s[10] = in[5];
  s[11] = in[13];
  s[12] = in[3];
  s[13] = in[11];
  s[14] = in[7];
  s[15] = in[15];

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
  u[7] = _mm_unpackhi_epi16(s[11], s[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[8]  = _mm_packs_epi32(u[0], u[1]);
  s[15] = _mm_packs_epi32(u[2], u[3]);
  s[9]  = _mm_packs_epi32(u[4], u[5]);
  s[14] = _mm_packs_epi32(u[6], u[7]);
  s[10] = _mm_packs_epi32(u[8], u[9]);
  s[13] = _mm_packs_epi32(u[10], u[11]);
  s[11] = _mm_packs_epi32(u[12], u[13]);
  s[12] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  t[0] = s[0];
  t[1] = s[1];
  t[2] = s[2];
  t[3] = s[3];
  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  u[3] = _mm_unpackhi_epi16(s[5], s[6]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[4] = _mm_packs_epi32(u[0], u[1]);
  t[7] = _mm_packs_epi32(u[2], u[3]);
  t[5] = _mm_packs_epi32(u[4], u[5]);
  t[6] = _mm_packs_epi32(u[6], u[7]);
  t[8] = _mm_add_epi16(s[8], s[9]);
  t[9] = _mm_sub_epi16(s[8], s[9]);
  t[10] = _mm_sub_epi16(s[11], s[10]);
  t[11] = _mm_add_epi16(s[10], s[11]);
  t[12] = _mm_add_epi16(s[12], s[13]);
  t[13] = _mm_sub_epi16(s[12], s[13]);
  t[14] = _mm_sub_epi16(s[15], s[14]);
  t[15] = _mm_add_epi16(s[14], s[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  u[7] = _mm_unpackhi_epi16(t[10], t[13]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_add_epi16(t[4], t[5]);
  s[5] = _mm_sub_epi16(t[4], t[5]);
  s[6] = _mm_sub_epi16(t[7], t[6]);
  s[7] = _mm_add_epi16(t[6], t[7]);
  s[8] = t[8];
  s[15] = t[15];
  s[9]  = _mm_packs_epi32(u[8], u[9]);
  s[14] = _mm_packs_epi32(u[10], u[11]);
  s[10] = _mm_packs_epi32(u[12], u[13]);
  s[13] = _mm_packs_epi32(u[14], u[15]);
  s[11] = t[11];
  s[12] = t[12];

  // stage 5
  t[0] = _mm_add_epi16(s[0], s[3]);
  t[1] = _mm_add_epi16(s[1], s[2]);
  t[2] = _mm_sub_epi16(s[1], s[2]);
  t[3] = _mm_sub_epi16(s[0], s[3]);
  t[4] = s[4];
  t[7] = s[7];

  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  t[5] = _mm_packs_epi32(u[0], u[1]);
  t[6] = _mm_packs_epi32(u[2], u[3]);

  t[8] = _mm_add_epi16(s[8], s[11]);
  t[9] = _mm_add_epi16(s[9], s[10]);
  t[10] = _mm_sub_epi16(s[9], s[10]);
  t[11] = _mm_sub_epi16(s[8], s[11]);
  t[12] = _mm_sub_epi16(s[15], s[12]);
  t[13] = _mm_sub_epi16(s[14], s[13]);
  t[14] = _mm_add_epi16(s[13], s[14]);
  t[15] = _mm_add_epi16(s[12], s[15]);

  // stage 6
  s[0] = _mm_add_epi16(t[0], t[7]);
  s[1] = _mm_add_epi16(t[1], t[6]);
  s[2] = _mm_add_epi16(t[2], t[5]);
  s[3] = _mm_add_epi16(t[3], t[4]);
  s[4] = _mm_sub_epi16(t[3], t[4]);
  s[5] = _mm_sub_epi16(t[2], t[5]);
  s[6] = _mm_sub_epi16(t[1], t[6]);
  s[7] = _mm_sub_epi16(t[0], t[7]);
  s[8] = t[8];
  s[9] = t[9];

  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
  u[3] = _mm_unpackhi_epi16(t[11], t[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  s[10] = _mm_packs_epi32(u[0], u[1]);
  s[13] = _mm_packs_epi32(u[2], u[3]);
  s[11] = _mm_packs_epi32(u[4], u[5]);
  s[12] = _mm_packs_epi32(u[6], u[7]);
  s[14] = t[14];
  s[15] = t[15];

  // stage 7
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
}

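// Each wrapper below performs one 1-D pass of the 16x16 transform:
// transpose the block (held as two 8-column halves) so the target
// dimension lies along the columns, then transform both halves. Two such
// passes complete the 2-D inverse.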
static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idct16_1d_8col(in0);
  idct16_1d_8col(in1);
}

static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  iadst16_1d_8col(in0);
  iadst16_1d_8col(in1);
}

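// Load the left 8 coefficients of each of 16 rows (rows are 16 int16_t
// wide); the caller passes input + 8 to fetch the right half.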
static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * 16));
  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * 16));
  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * 16));
  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * 16));
  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * 16));
  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * 16));
  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * 16));
  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * 16));

  in[8]  = _mm_load_si128((const __m128i *)(input + 8 * 16));
  in[9]  = _mm_load_si128((const __m128i *)(input + 9 * 16));
  in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
  in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
  in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
  in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
  in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
  in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
}

static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();
  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);
  in[8] = _mm_adds_epi16(in[8], final_rounding);
  in[9] = _mm_adds_epi16(in[9], final_rounding);
  in[10] = _mm_adds_epi16(in[10], final_rounding);
  in[11] = _mm_adds_epi16(in[11], final_rounding);
  in[12] = _mm_adds_epi16(in[12], final_rounding);
  in[13] = _mm_adds_epi16(in[13], final_rounding);
  in[14] = _mm_adds_epi16(in[14], final_rounding);
  in[15] = _mm_adds_epi16(in[15], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 6);
  in[1] = _mm_srai_epi16(in[1], 6);
  in[2] = _mm_srai_epi16(in[2], 6);
  in[3] = _mm_srai_epi16(in[3], 6);
  in[4] = _mm_srai_epi16(in[4], 6);
  in[5] = _mm_srai_epi16(in[5], 6);
  in[6] = _mm_srai_epi16(in[6], 6);
  in[7] = _mm_srai_epi16(in[7], 6);
  in[8] = _mm_srai_epi16(in[8], 6);
  in[9] = _mm_srai_epi16(in[9], 6);
  in[10] = _mm_srai_epi16(in[10], 6);
  in[11] = _mm_srai_epi16(in[11], 6);
  in[12] = _mm_srai_epi16(in[12], 6);
  in[13] = _mm_srai_epi16(in[13], 6);
  in[14] = _mm_srai_epi16(in[14], 6);
  in[15] = _mm_srai_epi16(in[15], 6);

  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
  RECON_AND_STORE(dest, in[8]);
  RECON_AND_STORE(dest, in[9]);
  RECON_AND_STORE(dest, in[10]);
  RECON_AND_STORE(dest, in[11]);
  RECON_AND_STORE(dest, in[12]);
  RECON_AND_STORE(dest, in[13]);
  RECON_AND_STORE(dest, in[14]);
  RECON_AND_STORE(dest, in[15]);
}

void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                               int tx_type) {
  __m128i in0[16], in1[16];

  load_buffer_8x16(input, in0);
  input += 8;
  load_buffer_8x16(input, in1);

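  // Each *_1d_sse2() call applies one 1-D pass: the first transforms the
  // rows (as columns, after the internal transpose), the second the
  // columns.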
  switch (tx_type) {
    case 0:  // DCT_DCT
      idct16_1d_sse2(in0, in1);
      idct16_1d_sse2(in0, in1);
      break;
    case 1:  // ADST_DCT
      idct16_1d_sse2(in0, in1);
      iadst16_1d_sse2(in0, in1);
      break;
    case 2:  // DCT_ADST
      iadst16_1d_sse2(in0, in1);
      idct16_1d_sse2(in0, in1);
      break;
    case 3:  // ADST_ADST
      iadst16_1d_sse2(in0, in1);
      iadst16_1d_sse2(in0, in1);
      break;
    default:
      assert(0);
      break;
  }

  write_buffer_8x16(dest, in0, stride);
  dest += 8;
  write_buffer_8x16(dest, in1, stride);
}

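// Reduced path for sparse blocks: with at most the first 10 coefficients
// nonzero, every nonzero value lies in the top-left corner of the 16x16
// block, so only the first few input rows need to be processed.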
void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
          in14 = zero, in15 = zero;
  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
          l12 = zero, l13 = zero, l14 = zero, l15 = zero;

  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;
  // First 1-D idct pass. Load input data; only the first four rows are
  // read, since the few nonzero coefficients all sit in the top-left.
  in0 = _mm_load_si128((const __m128i *)input);
  in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
  TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);

  // Stage2
  {
    const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11);
    const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3);
    const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9);
    const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1);

    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
    tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
    tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
    tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
    tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp2_8 = _mm_packs_epi32(tmp0, zero);
    stp2_15 = _mm_packs_epi32(tmp2, zero);
    stp2_9 = _mm_packs_epi32(tmp4, zero);
    stp2_14 = _mm_packs_epi32(tmp6, zero);

    stp2_10 = _mm_packs_epi32(tmp1, zero);
    stp2_13 = _mm_packs_epi32(tmp3, zero);
    stp2_11 = _mm_packs_epi32(tmp5, zero);
    stp2_12 = _mm_packs_epi32(tmp7, zero);
  }

  // Stage3
  {
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11);
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3);

    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
    tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
    tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, zero);
    stp1_7 = _mm_packs_epi32(tmp2, zero);
    stp1_5 = _mm_packs_epi32(tmp4, zero);
    stp1_6 = _mm_packs_epi32(tmp6, zero);

    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);

    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
  }

  // Stage4
  {
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8);
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10);
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);

    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
    tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
    tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, zero);
    stp2_1 = _mm_packs_epi32(tmp2, zero);
    stp2_2 = _mm_packs_epi32(tmp4, zero);
    stp2_3 = _mm_packs_epi32(tmp6, zero);
    stp2_9 = _mm_packs_epi32(tmp1, zero);
    stp2_14 = _mm_packs_epi32(tmp3, zero);
    stp2_10 = _mm_packs_epi32(tmp5, zero);
    stp2_13 = _mm_packs_epi32(tmp7, zero);

    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
  }

  // Stage5 and Stage6
  {
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);

    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);

    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
  }

  // Stage6
  {
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);

    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);

    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp1, zero);
    stp1_6 = _mm_packs_epi32(tmp3, zero);
    stp2_10 = _mm_packs_epi32(tmp0, zero);
    stp2_13 = _mm_packs_epi32(tmp2, zero);
    stp2_11 = _mm_packs_epi32(tmp4, zero);
    stp2_12 = _mm_packs_epi32(tmp6, zero);

    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
  }

  // Stage7. Left 8x16 only.
  l0 = _mm_add_epi16(stp2_0, stp1_15);
  l1 = _mm_add_epi16(stp2_1, stp1_14);
  l2 = _mm_add_epi16(stp2_2, stp2_13);
  l3 = _mm_add_epi16(stp2_3, stp2_12);
  l4 = _mm_add_epi16(stp2_4, stp2_11);
  l5 = _mm_add_epi16(stp2_5, stp2_10);
  l6 = _mm_add_epi16(stp2_6, stp1_9);
  l7 = _mm_add_epi16(stp2_7, stp1_8);
  l8 = _mm_sub_epi16(stp2_7, stp1_8);
  l9 = _mm_sub_epi16(stp2_6, stp1_9);
  l10 = _mm_sub_epi16(stp2_5, stp2_10);
  l11 = _mm_sub_epi16(stp2_4, stp2_11);
  l12 = _mm_sub_epi16(stp2_3, stp2_12);
  l13 = _mm_sub_epi16(stp2_2, stp2_13);
  l14 = _mm_sub_epi16(stp2_1, stp1_14);
  l15 = _mm_sub_epi16(stp2_0, stp1_15);

2701  // 2-D idct. We do 2 8x16 blocks.
2702  for (i = 0; i < 2; i++) {
2703    if (i == 0)
2704      TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
2705                    in5, in6, in7);
2706
2707    if (i == 1)
2708      TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
2709                    in4, in5, in6, in7);
2710
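    // The first pass leaves the lower half of this transposed block zero in
    // the reduced-coefficient path, so feed zeros to the second pass.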
    in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;

    IDCT16_1D

    // Stage7
    in0 = _mm_add_epi16(stp2_0, stp1_15);
    in1 = _mm_add_epi16(stp2_1, stp1_14);
    in2 = _mm_add_epi16(stp2_2, stp2_13);
    in3 = _mm_add_epi16(stp2_3, stp2_12);
    in4 = _mm_add_epi16(stp2_4, stp2_11);
    in5 = _mm_add_epi16(stp2_5, stp2_10);
    in6 = _mm_add_epi16(stp2_6, stp1_9);
    in7 = _mm_add_epi16(stp2_7, stp1_8);
    in8 = _mm_sub_epi16(stp2_7, stp1_8);
    in9 = _mm_sub_epi16(stp2_6, stp1_9);
    in10 = _mm_sub_epi16(stp2_5, stp2_10);
    in11 = _mm_sub_epi16(stp2_4, stp2_11);
    in12 = _mm_sub_epi16(stp2_3, stp2_12);
    in13 = _mm_sub_epi16(stp2_2, stp2_13);
    in14 = _mm_sub_epi16(stp2_1, stp1_14);
    in15 = _mm_sub_epi16(stp2_0, stp1_15);

    // Final rounding and shift
    in0 = _mm_adds_epi16(in0, final_rounding);
    in1 = _mm_adds_epi16(in1, final_rounding);
    in2 = _mm_adds_epi16(in2, final_rounding);
    in3 = _mm_adds_epi16(in3, final_rounding);
    in4 = _mm_adds_epi16(in4, final_rounding);
    in5 = _mm_adds_epi16(in5, final_rounding);
    in6 = _mm_adds_epi16(in6, final_rounding);
    in7 = _mm_adds_epi16(in7, final_rounding);
    in8 = _mm_adds_epi16(in8, final_rounding);
    in9 = _mm_adds_epi16(in9, final_rounding);
    in10 = _mm_adds_epi16(in10, final_rounding);
    in11 = _mm_adds_epi16(in11, final_rounding);
    in12 = _mm_adds_epi16(in12, final_rounding);
    in13 = _mm_adds_epi16(in13, final_rounding);
    in14 = _mm_adds_epi16(in14, final_rounding);
    in15 = _mm_adds_epi16(in15, final_rounding);

    in0 = _mm_srai_epi16(in0, 6);
    in1 = _mm_srai_epi16(in1, 6);
    in2 = _mm_srai_epi16(in2, 6);
    in3 = _mm_srai_epi16(in3, 6);
    in4 = _mm_srai_epi16(in4, 6);
    in5 = _mm_srai_epi16(in5, 6);
    in6 = _mm_srai_epi16(in6, 6);
    in7 = _mm_srai_epi16(in7, 6);
    in8 = _mm_srai_epi16(in8, 6);
    in9 = _mm_srai_epi16(in9, 6);
    in10 = _mm_srai_epi16(in10, 6);
    in11 = _mm_srai_epi16(in11, 6);
    in12 = _mm_srai_epi16(in12, 6);
    in13 = _mm_srai_epi16(in13, 6);
    in14 = _mm_srai_epi16(in14, 6);
    in15 = _mm_srai_epi16(in15, 6);

    RECON_AND_STORE(dest, in0);
    RECON_AND_STORE(dest, in1);
    RECON_AND_STORE(dest, in2);
    RECON_AND_STORE(dest, in3);
    RECON_AND_STORE(dest, in4);
    RECON_AND_STORE(dest, in5);
    RECON_AND_STORE(dest, in6);
    RECON_AND_STORE(dest, in7);
    RECON_AND_STORE(dest, in8);
    RECON_AND_STORE(dest, in9);
    RECON_AND_STORE(dest, in10);
    RECON_AND_STORE(dest, in11);
    RECON_AND_STORE(dest, in12);
    RECON_AND_STORE(dest, in13);
    RECON_AND_STORE(dest, in14);
    RECON_AND_STORE(dest, in15);

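    // RECON_AND_STORE advanced dest by stride 16 times; rewind to the top
    // row and step 8 pixels right for the next half-block.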
    dest += 8 - (stride * 16);
  }
}

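// Loads eight 16-bit dequantized coefficients into |reg| and advances
// |input| past them.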
#define LOAD_DQCOEFF(reg, input) \
  {  \
    reg = _mm_load_si128((const __m128i *)input); \
    input += 8; \
  }

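// Runs one 1-D 32-point idct over the eight columns held in in0-in31,
// leaving the stage-7 outputs in stp1_0-stp1_31. Assumes the rounding
// constant, the stgN_M multiplier pairs, and tmp0-tmp7 are in scope at the
// expansion site.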
#define IDCT32_1D \
/* Stage1 */ \
{ \
  const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \
  const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \
  const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \
  const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \
  \
  const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \
  const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \
  const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7); \
  const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \
  \
  const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \
  const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \
  const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \
  const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \
  \
  const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \
  const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \
  const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \
  const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \
  \
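  /* Each MULTIPLICATION_AND_ADD below performs two butterfly rotations, */ \
  /* turning two interleaved input pairs into four rounded outputs. */ \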
  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
                         stp1_17, stp1_30) \
  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
                         stp1_19, stp1_28) \
  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
                         stp1_23, stp1_24) \
} \
\
/* Stage2 */ \
{ \
  const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \
  const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \
  const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \
  const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \
  \
  const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \
  const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \
  const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \
  const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \
  \
  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
                         stp2_14) \
  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
                         stp2_11, stp2_12) \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
  \
  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
  \
  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
  \
  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
} \
\
/* Stage3 */ \
{ \
  const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \
  const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \
  const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \
  const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \
  \
  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
  \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
  \
  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
                         stp1_6) \
  \
  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  \
  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
                         stp1_18, stp1_29) \
  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
                         stp1_22, stp1_25) \
  \
  stp1_16 = stp2_16; \
  stp1_31 = stp2_31; \
  stp1_19 = stp2_19; \
  stp1_20 = stp2_20; \
  stp1_23 = stp2_23; \
  stp1_24 = stp2_24; \
  stp1_27 = stp2_27; \
  stp1_28 = stp2_28; \
} \
\
/* Stage4 */ \
{ \
  const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \
  const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \
  const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \
  const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \
  \
  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  \
  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
                         stp2_2, stp2_3) \
  \
  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
  \
  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
                         stp2_10, stp2_13) \
  \
  stp2_8 = stp1_8; \
  stp2_15 = stp1_15; \
  stp2_11 = stp1_11; \
  stp2_12 = stp1_12; \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
  \
  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
} \
\
/* Stage5 */ \
{ \
  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
  \
  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  \
  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
  \
  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
  \
  tmp0 = _mm_add_epi32(tmp0, rounding); \
  tmp1 = _mm_add_epi32(tmp1, rounding); \
  tmp2 = _mm_add_epi32(tmp2, rounding); \
  tmp3 = _mm_add_epi32(tmp3, rounding); \
  \
  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
  \
  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  \
  stp1_4 = stp2_4; \
  stp1_7 = stp2_7; \
  \
  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
  \
  stp1_16 = stp2_16; \
  stp1_17 = stp2_17; \
  \
  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
                         stp1_19, stp1_28) \
  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  \
  stp1_22 = stp2_22; \
  stp1_23 = stp2_23; \
  stp1_24 = stp2_24; \
  stp1_25 = stp2_25; \
  stp1_30 = stp2_30; \
  stp1_31 = stp2_31; \
} \
\
/* Stage6 */ \
{ \
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
  \
  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
  \
  stp2_8 = stp1_8; \
  stp2_9 = stp1_9; \
  stp2_14 = stp1_14; \
  stp2_15 = stp1_15; \
  \
  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
                         stp2_13, stp2_11, stp2_12) \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
  \
  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
} \
\
/* Stage7 */ \
{ \
  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  \
  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
  \
  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
  \
  stp1_16 = stp2_16; \
  stp1_17 = stp2_17; \
  stp1_18 = stp2_18; \
  stp1_19 = stp2_19; \
  \
  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
                         stp1_23, stp1_24) \
  \
  stp1_28 = stp2_28; \
  stp1_29 = stp2_29; \
  stp1_30 = stp2_30; \
  stp1_31 = stp2_31; \
}

// Only the upper-left 8x8 block has non-zero coefficients.
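// The _34 suffix is the eob bound for this fast path: with at most 34
// non-zero coefficients, the scan order keeps them all inside that 8x8
// region, so only the first eight input rows get a real first pass.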
void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
          in24, in25, in26, in27, in28, in29, in30, in31;
  __m128i col[128];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
          stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
          stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i, j, i32;

  // Work on one 8x32 block per pass; eight passes complete the 2-D idct.
  for (i = 0; i < 8; i++) {
    i32 = (i << 5);
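    // i32 = i * 32: base index of this pass's 32 intermediate vectors in
    // col[].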
    if (i == 0) {
      // First 1-D idct: first 8 rows
      // Load input data.
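      // The interleaved load order makes each group of eight registers
      // (in0-in7, in8-in15, ...) collect every fourth input row ahead of the
      // 8x8 transposes below.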
      LOAD_DQCOEFF(in0, input);
      LOAD_DQCOEFF(in8, input);
      LOAD_DQCOEFF(in16, input);
      LOAD_DQCOEFF(in24, input);
      LOAD_DQCOEFF(in1, input);
      LOAD_DQCOEFF(in9, input);
      LOAD_DQCOEFF(in17, input);
      LOAD_DQCOEFF(in25, input);
      LOAD_DQCOEFF(in2, input);
      LOAD_DQCOEFF(in10, input);
      LOAD_DQCOEFF(in18, input);
      LOAD_DQCOEFF(in26, input);
      LOAD_DQCOEFF(in3, input);
      LOAD_DQCOEFF(in11, input);
      LOAD_DQCOEFF(in19, input);
      LOAD_DQCOEFF(in27, input);

      LOAD_DQCOEFF(in4, input);
      LOAD_DQCOEFF(in12, input);
      LOAD_DQCOEFF(in20, input);
      LOAD_DQCOEFF(in28, input);
      LOAD_DQCOEFF(in5, input);
      LOAD_DQCOEFF(in13, input);
      LOAD_DQCOEFF(in21, input);
      LOAD_DQCOEFF(in29, input);
      LOAD_DQCOEFF(in6, input);
      LOAD_DQCOEFF(in14, input);
      LOAD_DQCOEFF(in22, input);
      LOAD_DQCOEFF(in30, input);
      LOAD_DQCOEFF(in7, input);
      LOAD_DQCOEFF(in15, input);
      LOAD_DQCOEFF(in23, input);
      LOAD_DQCOEFF(in31, input);

      // Transpose 32x8 block to 8x32 block
      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                    in10, in11, in12, in13, in14, in15);
      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
                    in18, in19, in20, in21, in22, in23);
      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
                    in26, in27, in28, in29, in30, in31);
    } else if (i < 4) {
      // First 1-D idct: the next 24 rows are all zero, so store zeros
      // directly.
      col[i32 + 0] = _mm_setzero_si128();
      col[i32 + 1] = _mm_setzero_si128();
      col[i32 + 2] = _mm_setzero_si128();
      col[i32 + 3] = _mm_setzero_si128();
      col[i32 + 4] = _mm_setzero_si128();
      col[i32 + 5] = _mm_setzero_si128();
      col[i32 + 6] = _mm_setzero_si128();
      col[i32 + 7] = _mm_setzero_si128();
      col[i32 + 8] = _mm_setzero_si128();
      col[i32 + 9] = _mm_setzero_si128();
      col[i32 + 10] = _mm_setzero_si128();
      col[i32 + 11] = _mm_setzero_si128();
      col[i32 + 12] = _mm_setzero_si128();
      col[i32 + 13] = _mm_setzero_si128();
      col[i32 + 14] = _mm_setzero_si128();
      col[i32 + 15] = _mm_setzero_si128();
      col[i32 + 16] = _mm_setzero_si128();
      col[i32 + 17] = _mm_setzero_si128();
      col[i32 + 18] = _mm_setzero_si128();
      col[i32 + 19] = _mm_setzero_si128();
      col[i32 + 20] = _mm_setzero_si128();
      col[i32 + 21] = _mm_setzero_si128();
      col[i32 + 22] = _mm_setzero_si128();
      col[i32 + 23] = _mm_setzero_si128();
      col[i32 + 24] = _mm_setzero_si128();
      col[i32 + 25] = _mm_setzero_si128();
      col[i32 + 26] = _mm_setzero_si128();
      col[i32 + 27] = _mm_setzero_si128();
      col[i32 + 28] = _mm_setzero_si128();
      col[i32 + 29] = _mm_setzero_si128();
      col[i32 + 30] = _mm_setzero_si128();
      col[i32 + 31] = _mm_setzero_si128();
      continue;
    } else {
      // Second 1-D idct
      j = i - 4;
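      // col[] holds four 8x32 intermediate blocks; stepping j by 4 below
      // advances 32 entries to the matching group of eight vectors in the
      // next block.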

      // Transpose 32x8 block to 8x32 block
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
                    in5, in6, in7);
      j += 4;
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
                    in11, in12, in13, in14, in15);
      j += 4;
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
                    in19, in20, in21, in22, in23);
      j += 4;
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
                    in28, in29, in30, in31);
    }

    IDCT32_1D

    // Final stage
    if (i < 4) {
      // 1-D: store the 32 intermediate results for this 8x32 block.
      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
    } else {
      const __m128i zero = _mm_setzero_si128();

      // 2-D: calculate the final results and store them to the destination.
      in0 = _mm_add_epi16(stp1_0, stp1_31);
      in1 = _mm_add_epi16(stp1_1, stp1_30);
      in2 = _mm_add_epi16(stp1_2, stp1_29);
      in3 = _mm_add_epi16(stp1_3, stp1_28);
      in4 = _mm_add_epi16(stp1_4, stp1_27);
      in5 = _mm_add_epi16(stp1_5, stp1_26);
      in6 = _mm_add_epi16(stp1_6, stp1_25);
      in7 = _mm_add_epi16(stp1_7, stp1_24);
      in8 = _mm_add_epi16(stp1_8, stp1_23);
      in9 = _mm_add_epi16(stp1_9, stp1_22);
      in10 = _mm_add_epi16(stp1_10, stp1_21);
      in11 = _mm_add_epi16(stp1_11, stp1_20);
      in12 = _mm_add_epi16(stp1_12, stp1_19);
      in13 = _mm_add_epi16(stp1_13, stp1_18);
      in14 = _mm_add_epi16(stp1_14, stp1_17);
      in15 = _mm_add_epi16(stp1_15, stp1_16);
      in16 = _mm_sub_epi16(stp1_15, stp1_16);
      in17 = _mm_sub_epi16(stp1_14, stp1_17);
      in18 = _mm_sub_epi16(stp1_13, stp1_18);
      in19 = _mm_sub_epi16(stp1_12, stp1_19);
      in20 = _mm_sub_epi16(stp1_11, stp1_20);
      in21 = _mm_sub_epi16(stp1_10, stp1_21);
      in22 = _mm_sub_epi16(stp1_9, stp1_22);
      in23 = _mm_sub_epi16(stp1_8, stp1_23);
      in24 = _mm_sub_epi16(stp1_7, stp1_24);
      in25 = _mm_sub_epi16(stp1_6, stp1_25);
      in26 = _mm_sub_epi16(stp1_5, stp1_26);
      in27 = _mm_sub_epi16(stp1_4, stp1_27);
      in28 = _mm_sub_epi16(stp1_3, stp1_28);
      in29 = _mm_sub_epi16(stp1_2, stp1_29);
      in30 = _mm_sub_epi16(stp1_1, stp1_30);
      in31 = _mm_sub_epi16(stp1_0, stp1_31);

      // Final rounding and shift
      in0 = _mm_adds_epi16(in0, final_rounding);
      in1 = _mm_adds_epi16(in1, final_rounding);
      in2 = _mm_adds_epi16(in2, final_rounding);
      in3 = _mm_adds_epi16(in3, final_rounding);
      in4 = _mm_adds_epi16(in4, final_rounding);
      in5 = _mm_adds_epi16(in5, final_rounding);
      in6 = _mm_adds_epi16(in6, final_rounding);
      in7 = _mm_adds_epi16(in7, final_rounding);
      in8 = _mm_adds_epi16(in8, final_rounding);
      in9 = _mm_adds_epi16(in9, final_rounding);
      in10 = _mm_adds_epi16(in10, final_rounding);
      in11 = _mm_adds_epi16(in11, final_rounding);
      in12 = _mm_adds_epi16(in12, final_rounding);
      in13 = _mm_adds_epi16(in13, final_rounding);
      in14 = _mm_adds_epi16(in14, final_rounding);
      in15 = _mm_adds_epi16(in15, final_rounding);
      in16 = _mm_adds_epi16(in16, final_rounding);
      in17 = _mm_adds_epi16(in17, final_rounding);
      in18 = _mm_adds_epi16(in18, final_rounding);
      in19 = _mm_adds_epi16(in19, final_rounding);
      in20 = _mm_adds_epi16(in20, final_rounding);
      in21 = _mm_adds_epi16(in21, final_rounding);
      in22 = _mm_adds_epi16(in22, final_rounding);
      in23 = _mm_adds_epi16(in23, final_rounding);
      in24 = _mm_adds_epi16(in24, final_rounding);
      in25 = _mm_adds_epi16(in25, final_rounding);
      in26 = _mm_adds_epi16(in26, final_rounding);
      in27 = _mm_adds_epi16(in27, final_rounding);
      in28 = _mm_adds_epi16(in28, final_rounding);
      in29 = _mm_adds_epi16(in29, final_rounding);
      in30 = _mm_adds_epi16(in30, final_rounding);
      in31 = _mm_adds_epi16(in31, final_rounding);

      in0 = _mm_srai_epi16(in0, 6);
      in1 = _mm_srai_epi16(in1, 6);
      in2 = _mm_srai_epi16(in2, 6);
      in3 = _mm_srai_epi16(in3, 6);
      in4 = _mm_srai_epi16(in4, 6);
      in5 = _mm_srai_epi16(in5, 6);
      in6 = _mm_srai_epi16(in6, 6);
      in7 = _mm_srai_epi16(in7, 6);
      in8 = _mm_srai_epi16(in8, 6);
      in9 = _mm_srai_epi16(in9, 6);
      in10 = _mm_srai_epi16(in10, 6);
      in11 = _mm_srai_epi16(in11, 6);
      in12 = _mm_srai_epi16(in12, 6);
      in13 = _mm_srai_epi16(in13, 6);
      in14 = _mm_srai_epi16(in14, 6);
      in15 = _mm_srai_epi16(in15, 6);
      in16 = _mm_srai_epi16(in16, 6);
      in17 = _mm_srai_epi16(in17, 6);
      in18 = _mm_srai_epi16(in18, 6);
      in19 = _mm_srai_epi16(in19, 6);
      in20 = _mm_srai_epi16(in20, 6);
      in21 = _mm_srai_epi16(in21, 6);
      in22 = _mm_srai_epi16(in22, 6);
      in23 = _mm_srai_epi16(in23, 6);
      in24 = _mm_srai_epi16(in24, 6);
      in25 = _mm_srai_epi16(in25, 6);
      in26 = _mm_srai_epi16(in26, 6);
      in27 = _mm_srai_epi16(in27, 6);
      in28 = _mm_srai_epi16(in28, 6);
      in29 = _mm_srai_epi16(in29, 6);
      in30 = _mm_srai_epi16(in30, 6);
      in31 = _mm_srai_epi16(in31, 6);

      RECON_AND_STORE(dest, in0);
      RECON_AND_STORE(dest, in1);
      RECON_AND_STORE(dest, in2);
      RECON_AND_STORE(dest, in3);
      RECON_AND_STORE(dest, in4);
      RECON_AND_STORE(dest, in5);
      RECON_AND_STORE(dest, in6);
      RECON_AND_STORE(dest, in7);
      RECON_AND_STORE(dest, in8);
      RECON_AND_STORE(dest, in9);
      RECON_AND_STORE(dest, in10);
      RECON_AND_STORE(dest, in11);
      RECON_AND_STORE(dest, in12);
      RECON_AND_STORE(dest, in13);
      RECON_AND_STORE(dest, in14);
      RECON_AND_STORE(dest, in15);
      RECON_AND_STORE(dest, in16);
      RECON_AND_STORE(dest, in17);
      RECON_AND_STORE(dest, in18);
      RECON_AND_STORE(dest, in19);
      RECON_AND_STORE(dest, in20);
      RECON_AND_STORE(dest, in21);
      RECON_AND_STORE(dest, in22);
      RECON_AND_STORE(dest, in23);
      RECON_AND_STORE(dest, in24);
      RECON_AND_STORE(dest, in25);
      RECON_AND_STORE(dest, in26);
      RECON_AND_STORE(dest, in27);
      RECON_AND_STORE(dest, in28);
      RECON_AND_STORE(dest, in29);
      RECON_AND_STORE(dest, in30);
      RECON_AND_STORE(dest, in31);

      dest += 8 - (stride * 32);
    }
  }
}

void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
                                 int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
          in24, in25, in26, in27, in28, in29, in30, in31;
  __m128i col[128];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
          stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
          stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i, j, i32;
  __m128i zero_idx[16];
  int zero_flag[2];

  // Work on one 8x32 block per pass; eight passes complete the 2-D idct.
  for (i = 0; i < 8; i++) {
    i32 = (i << 5);
    if (i < 4) {
      // First 1-D idct
      // Load input data.
      LOAD_DQCOEFF(in0, input);
      LOAD_DQCOEFF(in8, input);
      LOAD_DQCOEFF(in16, input);
      LOAD_DQCOEFF(in24, input);
      LOAD_DQCOEFF(in1, input);
      LOAD_DQCOEFF(in9, input);
      LOAD_DQCOEFF(in17, input);
      LOAD_DQCOEFF(in25, input);
      LOAD_DQCOEFF(in2, input);
      LOAD_DQCOEFF(in10, input);
      LOAD_DQCOEFF(in18, input);
      LOAD_DQCOEFF(in26, input);
      LOAD_DQCOEFF(in3, input);
      LOAD_DQCOEFF(in11, input);
      LOAD_DQCOEFF(in19, input);
      LOAD_DQCOEFF(in27, input);

      LOAD_DQCOEFF(in4, input);
      LOAD_DQCOEFF(in12, input);
      LOAD_DQCOEFF(in20, input);
      LOAD_DQCOEFF(in28, input);
      LOAD_DQCOEFF(in5, input);
      LOAD_DQCOEFF(in13, input);
      LOAD_DQCOEFF(in21, input);
      LOAD_DQCOEFF(in29, input);
      LOAD_DQCOEFF(in6, input);
      LOAD_DQCOEFF(in14, input);
      LOAD_DQCOEFF(in22, input);
      LOAD_DQCOEFF(in30, input);
      LOAD_DQCOEFF(in7, input);
      LOAD_DQCOEFF(in15, input);
      LOAD_DQCOEFF(in23, input);
      LOAD_DQCOEFF(in31, input);

      // Check whether all 32 input vectors are zero by OR-reducing them
      // pairwise.
      zero_idx[0] = _mm_or_si128(in0, in1);
      zero_idx[1] = _mm_or_si128(in2, in3);
      zero_idx[2] = _mm_or_si128(in4, in5);
      zero_idx[3] = _mm_or_si128(in6, in7);
      zero_idx[4] = _mm_or_si128(in8, in9);
      zero_idx[5] = _mm_or_si128(in10, in11);
      zero_idx[6] = _mm_or_si128(in12, in13);
      zero_idx[7] = _mm_or_si128(in14, in15);
      zero_idx[8] = _mm_or_si128(in16, in17);
      zero_idx[9] = _mm_or_si128(in18, in19);
      zero_idx[10] = _mm_or_si128(in20, in21);
      zero_idx[11] = _mm_or_si128(in22, in23);
      zero_idx[12] = _mm_or_si128(in24, in25);
      zero_idx[13] = _mm_or_si128(in26, in27);
      zero_idx[14] = _mm_or_si128(in28, in29);
      zero_idx[15] = _mm_or_si128(in30, in31);

      zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
      zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
      zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
      zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
      zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
      zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
      zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
      zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);

      zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
      zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
      zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
      zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
      zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
      zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
      zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);

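      // Fold the remaining 128-bit OR across its 64-bit halves, then expose
      // it as two 32-bit flags: both are zero iff every coefficient in this
      // 8x32 block is zero.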
      zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
      zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
      zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
      zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
      zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);

      if (!zero_flag[0] && !zero_flag[1]) {
        col[i32 + 0] = _mm_setzero_si128();
        col[i32 + 1] = _mm_setzero_si128();
        col[i32 + 2] = _mm_setzero_si128();
        col[i32 + 3] = _mm_setzero_si128();
        col[i32 + 4] = _mm_setzero_si128();
        col[i32 + 5] = _mm_setzero_si128();
        col[i32 + 6] = _mm_setzero_si128();
        col[i32 + 7] = _mm_setzero_si128();
        col[i32 + 8] = _mm_setzero_si128();
        col[i32 + 9] = _mm_setzero_si128();
        col[i32 + 10] = _mm_setzero_si128();
        col[i32 + 11] = _mm_setzero_si128();
        col[i32 + 12] = _mm_setzero_si128();
        col[i32 + 13] = _mm_setzero_si128();
        col[i32 + 14] = _mm_setzero_si128();
        col[i32 + 15] = _mm_setzero_si128();
        col[i32 + 16] = _mm_setzero_si128();
        col[i32 + 17] = _mm_setzero_si128();
        col[i32 + 18] = _mm_setzero_si128();
        col[i32 + 19] = _mm_setzero_si128();
        col[i32 + 20] = _mm_setzero_si128();
        col[i32 + 21] = _mm_setzero_si128();
        col[i32 + 22] = _mm_setzero_si128();
        col[i32 + 23] = _mm_setzero_si128();
        col[i32 + 24] = _mm_setzero_si128();
        col[i32 + 25] = _mm_setzero_si128();
        col[i32 + 26] = _mm_setzero_si128();
        col[i32 + 27] = _mm_setzero_si128();
        col[i32 + 28] = _mm_setzero_si128();
        col[i32 + 29] = _mm_setzero_si128();
        col[i32 + 30] = _mm_setzero_si128();
        col[i32 + 31] = _mm_setzero_si128();
        continue;
      }

      // Transpose 32x8 block to 8x32 block
      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                    in10, in11, in12, in13, in14, in15);
      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
                    in18, in19, in20, in21, in22, in23);
      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
                    in26, in27, in28, in29, in30, in31);
    } else {
      // Second 1-D idct
      j = i - 4;

      // Transpose 32x8 block to 8x32 block
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
                    in5, in6, in7);
      j += 4;
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
                    in11, in12, in13, in14, in15);
      j += 4;
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
                    in19, in20, in21, in22, in23);
      j += 4;
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
                    in28, in29, in30, in31);
    }

    IDCT32_1D

    // Final stage
    if (i < 4) {
      // 1-D: store the 32 intermediate results for this 8x32 block.
      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
    } else {
      const __m128i zero = _mm_setzero_si128();

      // 2-D: calculate the final results and store them to the destination.
      in0 = _mm_add_epi16(stp1_0, stp1_31);
      in1 = _mm_add_epi16(stp1_1, stp1_30);
      in2 = _mm_add_epi16(stp1_2, stp1_29);
      in3 = _mm_add_epi16(stp1_3, stp1_28);
      in4 = _mm_add_epi16(stp1_4, stp1_27);
      in5 = _mm_add_epi16(stp1_5, stp1_26);
      in6 = _mm_add_epi16(stp1_6, stp1_25);
      in7 = _mm_add_epi16(stp1_7, stp1_24);
      in8 = _mm_add_epi16(stp1_8, stp1_23);
      in9 = _mm_add_epi16(stp1_9, stp1_22);
      in10 = _mm_add_epi16(stp1_10, stp1_21);
      in11 = _mm_add_epi16(stp1_11, stp1_20);
      in12 = _mm_add_epi16(stp1_12, stp1_19);
      in13 = _mm_add_epi16(stp1_13, stp1_18);
      in14 = _mm_add_epi16(stp1_14, stp1_17);
      in15 = _mm_add_epi16(stp1_15, stp1_16);
      in16 = _mm_sub_epi16(stp1_15, stp1_16);
      in17 = _mm_sub_epi16(stp1_14, stp1_17);
      in18 = _mm_sub_epi16(stp1_13, stp1_18);
      in19 = _mm_sub_epi16(stp1_12, stp1_19);
      in20 = _mm_sub_epi16(stp1_11, stp1_20);
      in21 = _mm_sub_epi16(stp1_10, stp1_21);
      in22 = _mm_sub_epi16(stp1_9, stp1_22);
      in23 = _mm_sub_epi16(stp1_8, stp1_23);
      in24 = _mm_sub_epi16(stp1_7, stp1_24);
      in25 = _mm_sub_epi16(stp1_6, stp1_25);
      in26 = _mm_sub_epi16(stp1_5, stp1_26);
      in27 = _mm_sub_epi16(stp1_4, stp1_27);
      in28 = _mm_sub_epi16(stp1_3, stp1_28);
      in29 = _mm_sub_epi16(stp1_2, stp1_29);
      in30 = _mm_sub_epi16(stp1_1, stp1_30);
      in31 = _mm_sub_epi16(stp1_0, stp1_31);

      // Final rounding and shift
      in0 = _mm_adds_epi16(in0, final_rounding);
      in1 = _mm_adds_epi16(in1, final_rounding);
      in2 = _mm_adds_epi16(in2, final_rounding);
      in3 = _mm_adds_epi16(in3, final_rounding);
      in4 = _mm_adds_epi16(in4, final_rounding);
      in5 = _mm_adds_epi16(in5, final_rounding);
      in6 = _mm_adds_epi16(in6, final_rounding);
      in7 = _mm_adds_epi16(in7, final_rounding);
      in8 = _mm_adds_epi16(in8, final_rounding);
      in9 = _mm_adds_epi16(in9, final_rounding);
      in10 = _mm_adds_epi16(in10, final_rounding);
      in11 = _mm_adds_epi16(in11, final_rounding);
      in12 = _mm_adds_epi16(in12, final_rounding);
      in13 = _mm_adds_epi16(in13, final_rounding);
      in14 = _mm_adds_epi16(in14, final_rounding);
      in15 = _mm_adds_epi16(in15, final_rounding);
      in16 = _mm_adds_epi16(in16, final_rounding);
      in17 = _mm_adds_epi16(in17, final_rounding);
      in18 = _mm_adds_epi16(in18, final_rounding);
      in19 = _mm_adds_epi16(in19, final_rounding);
      in20 = _mm_adds_epi16(in20, final_rounding);
      in21 = _mm_adds_epi16(in21, final_rounding);
      in22 = _mm_adds_epi16(in22, final_rounding);
      in23 = _mm_adds_epi16(in23, final_rounding);
      in24 = _mm_adds_epi16(in24, final_rounding);
      in25 = _mm_adds_epi16(in25, final_rounding);
      in26 = _mm_adds_epi16(in26, final_rounding);
      in27 = _mm_adds_epi16(in27, final_rounding);
      in28 = _mm_adds_epi16(in28, final_rounding);
      in29 = _mm_adds_epi16(in29, final_rounding);
      in30 = _mm_adds_epi16(in30, final_rounding);
      in31 = _mm_adds_epi16(in31, final_rounding);

      in0 = _mm_srai_epi16(in0, 6);
      in1 = _mm_srai_epi16(in1, 6);
      in2 = _mm_srai_epi16(in2, 6);
      in3 = _mm_srai_epi16(in3, 6);
      in4 = _mm_srai_epi16(in4, 6);
      in5 = _mm_srai_epi16(in5, 6);
      in6 = _mm_srai_epi16(in6, 6);
      in7 = _mm_srai_epi16(in7, 6);
      in8 = _mm_srai_epi16(in8, 6);
      in9 = _mm_srai_epi16(in9, 6);
      in10 = _mm_srai_epi16(in10, 6);
      in11 = _mm_srai_epi16(in11, 6);
      in12 = _mm_srai_epi16(in12, 6);
      in13 = _mm_srai_epi16(in13, 6);
      in14 = _mm_srai_epi16(in14, 6);
      in15 = _mm_srai_epi16(in15, 6);
      in16 = _mm_srai_epi16(in16, 6);
      in17 = _mm_srai_epi16(in17, 6);
      in18 = _mm_srai_epi16(in18, 6);
      in19 = _mm_srai_epi16(in19, 6);
      in20 = _mm_srai_epi16(in20, 6);
      in21 = _mm_srai_epi16(in21, 6);
      in22 = _mm_srai_epi16(in22, 6);
      in23 = _mm_srai_epi16(in23, 6);
      in24 = _mm_srai_epi16(in24, 6);
      in25 = _mm_srai_epi16(in25, 6);
      in26 = _mm_srai_epi16(in26, 6);
      in27 = _mm_srai_epi16(in27, 6);
      in28 = _mm_srai_epi16(in28, 6);
      in29 = _mm_srai_epi16(in29, 6);
      in30 = _mm_srai_epi16(in30, 6);
      in31 = _mm_srai_epi16(in31, 6);

      RECON_AND_STORE(dest, in0);
      RECON_AND_STORE(dest, in1);
      RECON_AND_STORE(dest, in2);
      RECON_AND_STORE(dest, in3);
      RECON_AND_STORE(dest, in4);
      RECON_AND_STORE(dest, in5);
      RECON_AND_STORE(dest, in6);
      RECON_AND_STORE(dest, in7);
      RECON_AND_STORE(dest, in8);
      RECON_AND_STORE(dest, in9);
      RECON_AND_STORE(dest, in10);
      RECON_AND_STORE(dest, in11);
      RECON_AND_STORE(dest, in12);
      RECON_AND_STORE(dest, in13);
      RECON_AND_STORE(dest, in14);
      RECON_AND_STORE(dest, in15);
      RECON_AND_STORE(dest, in16);
      RECON_AND_STORE(dest, in17);
      RECON_AND_STORE(dest, in18);
      RECON_AND_STORE(dest, in19);
      RECON_AND_STORE(dest, in20);
      RECON_AND_STORE(dest, in21);
      RECON_AND_STORE(dest, in22);
      RECON_AND_STORE(dest, in23);
      RECON_AND_STORE(dest, in24);
      RECON_AND_STORE(dest, in25);
      RECON_AND_STORE(dest, in26);
      RECON_AND_STORE(dest, in27);
      RECON_AND_STORE(dest, in28);
      RECON_AND_STORE(dest, in29);
      RECON_AND_STORE(dest, in30);
      RECON_AND_STORE(dest, in31);

      dest += 8 - (stride * 32);
    }
  }
}  // NOLINT

void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, i;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);
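  // A DC-only block reduces to scaling input[0] by cospi_16_64 twice (once
  // per 1-D pass) with dct_const_round_shift; ROUND_POWER_OF_TWO(a, 6)
  // matches the >> 6 used on the general path.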

  dc_value = _mm_set1_epi16(a);

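  // Each iteration writes one 8-pixel-wide, 32-row strip of dc_value; four
  // strips cover the full 32x32 block.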
  for (i = 0; i < 4; ++i) {
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    dest += 8 - (stride * 32);
  }
}